In [5]:
import featuretools as ft
import numpy as np
import pandas as pd
import sweetviz as sv
from pycaret.classification import *
from sympy import sympify


def read_dataset(path='../data/output_set7.csv'):
    """Load the raw CSV and derive one binary label column per rule slot.

    The ``ViolatedRule`` column is split on commas; slot ``i`` of that split
    becomes the label column ``ViolatedRule{i}``. An entry equal to ``"-1"``
    is encoded as 0 and every other entry as 1.

    Parameters
    ----------
    path : str or path-like, default '../data/output_set7.csv'
        Location of the comma-separated input file. The default keeps the
        previous zero-argument call working.

    Returns
    -------
    data : pandas.DataFrame
        The file contents as read (still containing ``ViolatedRule``).
    labels : pandas.DataFrame
        Integer 0/1 columns ``ViolatedRule0`` ... ``ViolatedRule{k-1}``,
        sharing the index of ``data``.
    """
    data = pd.read_csv(path, sep=',')
    records = data["ViolatedRule"].str.split(",", expand=True)

    flags = {}
    for i in range(records.shape[1]):
        # Show the raw codes present in this slot before they are binarised.
        print(records[i].unique())
        # One vectorised comparison yields a proper integer column instead of
        # an object column that mixes strings and ints. Missing entries (rows
        # with fewer slots) compare unequal to "-1" and are therefore 1.
        flags[f"ViolatedRule{i}"] = records[i].ne("-1").astype(int)

    labels = pd.DataFrame(flags, index=data.index)
    return data, labels


def eda(data):
    """Run a Sweetviz analysis of ``data`` with ``ViolatedRule`` as target.

    ``ViolatedRule`` is forced to be treated as numeric, and the resulting
    report is rendered through ``show_html()``. Nothing is returned.
    """
    feature_config = sv.FeatureConfig(force_num=['ViolatedRule'])
    report = sv.analyze(
        data,
        feat_cfg=feature_config,
        target_feat='ViolatedRule',
    )
    report.show_html()


def stack():
    """Tune the three best models, blend and stack them, then print the AUC winner.

    Relies on a pycaret experiment having been set up beforehand. Nothing is
    returned; the model chosen by ``automl(optimize='AUC')`` is printed.
    """
    candidates = compare_models(n_select=3)

    tuned_candidates = []
    for candidate in candidates:
        tuned_candidates.append(tune_model(candidate))

    # The returned ensembles are not used directly; the calls are kept as in
    # the original flow (presumably so automl can consider them - TODO confirm).
    blend_models(tuned_candidates)
    stack_models(tuned_candidates)

    print(automl(optimize='AUC'))


def multi_level_feature_creation(X, trans_primitives_per_level):
    """Apply featuretools Deep Feature Synthesis repeatedly, one level at a time.

    For every entry of ``trans_primitives_per_level`` the current feature
    matrix is wrapped in a fresh single-table EntitySet, ``ft.dfs`` is run
    with that level's transform primitives, and the result becomes the input
    of the next level. After each level the column names are passed through
    ``sympify`` and the frame is rebuilt from a plain dict.

    Parameters
    ----------
    X : pandas.DataFrame
        Starting feature table. It is not modified.
    trans_primitives_per_level : sequence of list of str
        Transform primitive names for each level, applied in order. May be
        empty, in which case only the final inf/NaN replacement is applied.

    Returns
    -------
    feature_matrix : pandas.DataFrame
        Features after the last level, with ``inf``, ``-inf`` and ``NaN``
        replaced by 0.
    feature_defs : list
        Feature definitions returned by the last ``ft.dfs`` call, or an empty
        list when no level was requested.
    """
    feature_matrix = X
    # Defined up front so that an empty level list no longer hits an unbound
    # name at the return statement.
    feature_defs = []

    for i, trans_primitives in enumerate(trans_primitives_per_level):
        print("Level: ", i)
        print("Columns: ", feature_matrix.columns)
        print(trans_primitives)

        es = ft.EntitySet(id='dataset')
        dataframe_name = "data" + str(i)

        # The index name differs per level ("index0", "index1", ...).
        es = es.add_dataframe(
            dataframe_name=dataframe_name,
            dataframe=feature_matrix,
            index="index" + str(i)
        )

        feature_matrix, feature_defs = ft.dfs(
            entityset=es,
            target_dataframe_name=dataframe_name,
            trans_primitives=trans_primitives,
        )

        # Simplify: rewrite every generated column name through sympy.
        print(feature_matrix.shape)
        feature_matrix = feature_matrix.rename(columns={
            c: str(sympify(c)) for c in feature_matrix.columns
        })

        # This is needed to reset ww data (presumably the Woodwork typing
        # attached by featuretools - TODO confirm) before the next level.
        feature_matrix = pd.DataFrame(feature_matrix.to_dict())

    # Non in-place so the caller's X is never mutated, even with zero levels.
    feature_matrix = feature_matrix.replace([np.inf, -np.inf, np.nan], 0)
    return feature_matrix, feature_defs

def static_features(features):
    """Add hand-written range and density features for both street sides.

    New columns written to ``features`` (the frame is modified in place and
    also returned):

    * ``left_diff``  = ``|LeftSideFirst - LeftSideLast|``
    * ``right_diff`` = ``|RightSideFirst - RightSideLast|``
    * ``left_dens``  = ``left_diff / Length``
    * ``right_dens`` = ``right_diff / Length``

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain numeric ``LeftSideFirst``, ``LeftSideLast``,
        ``RightSideFirst``, ``RightSideLast`` and ``Length`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the four columns added.
    """
    features['left_diff'] = (features['LeftSideFirst'] - features['LeftSideLast']).abs()
    # Previously RightSideFirst was subtracted from itself, which made
    # right_diff (and therefore right_dens) identically zero.
    features['right_diff'] = (features['RightSideFirst'] - features['RightSideLast']).abs()
    features['left_dens'] = features['left_diff'] / features['Length']
    features['right_dens'] = features['right_diff'] / features['Length']

    return features


def clean_dataset(features):
    """Reduce the house-number columns to digits and cast every column to float.

    For ``LeftSideFirst``, ``LeftSideLast``, ``RightSideFirst`` and
    ``RightSideLast`` each value is stringified and all of its digit
    characters are concatenated (``"12A"`` -> ``"12"``; a value without any
    digit becomes empty). Remaining text cells are stripped of surrounding
    whitespace, empty strings become ``NaN`` and the whole frame is cast to
    ``float``.

    NOTE(review): because values go through ``str()``, a numeric cell such as
    ``2.0`` is read as the digits ``"20"`` - confirm these four columns always
    arrive as text.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain the four house-number columns; every column must be
        convertible to float after cleaning. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A new all-float frame with the same columns and index.
    """
    house_number_columns = ['LeftSideFirst', 'LeftSideLast', 'RightSideFirst', 'RightSideLast']

    def _digits_only(value):
        # Keep only the digit characters of the stringified value.
        return ''.join(ch for ch in str(value) if ch.isdigit())

    def _strip_text(value):
        # Strip surrounding whitespace from text cells, leave others untouched.
        return value.strip() if isinstance(value, str) else value

    # Work on a copy so the caller's frame keeps its raw values.
    cleaned = features.copy()
    for column in house_number_columns:
        cleaned[column] = cleaned[column].apply(_digits_only)

    # The strip has to happen per cell: the earlier column-level isinstance
    # check never matched, so whitespace-only cells reached the float cast.
    for column in cleaned.columns:
        if not pd.api.types.is_numeric_dtype(cleaned[column]):
            cleaned[column] = cleaned[column].map(_strip_text)

    cleaned = cleaned.replace('', np.nan)
    return cleaned.astype('float')

def eval_model(features, labels, column):
    """Fit, tune, plot and finalize an XGBoost classifier for one label column.

    ``features`` and ``labels[column]`` are concatenated column-wise and handed
    to pycaret ``setup`` with ``column`` as target. A baseline ``'xgboost'``
    model is created and tuned with optuna/TPE (10 iterations, keeping the
    better of baseline and tuned). The precision-recall plot is drawn for the
    tuned model, the feature and confusion-matrix plots for the baseline. The
    tuned model is then evaluated, used for prediction and finalized.

    Nothing is returned; all results are the outputs pycaret displays.
    """
    print(column)

    training_frame = pd.concat([features, labels[column]], axis=1)
    setup(
        training_frame,
        target=column,
        silent=True,
        log_experiment=False,
        experiment_name='first_rule',
        fix_imbalance=True,
    )

    baseline = create_model('xgboost')
    tuned = tune_model(
        baseline,
        choose_better=True,
        n_iter=10,
        search_library='optuna',
        search_algorithm='tpe',
    )

    # Same three plots, in the same order, against the same models.
    plot_requests = (
        (tuned, 'pr'),
        (baseline, 'feature'),
        (baseline, 'confusion_matrix'),
    )
    for model, plot_kind in plot_requests:
        plot_model(model, plot=plot_kind)

    evaluate_model(tuned)
    predict_model(tuned)
    finalize_model(tuned)

In [6]:
# Columns removed from the model inputs altogether.
drop_columns = [
    'txId',
    'FeatureID',
    'ViolatedRule',
    'LeftSideIntermediate',
    'RightSideIntermediate',
]
# Columns set aside as-is in `categories_features` (not cleaned or cast to float).
catgories_columns = [
    'HouseNumberVariance',
    'LeftSideInterpolation',
    'RightSideInterpolation',
    'fow',
]

features, labels = read_dataset()

# Keep the categorical columns separately, then clean what remains.
categories_features = features[catgories_columns]
features = clean_dataset(
    features.drop(columns=drop_columns).drop(columns=catgories_columns)
)
features.info()
features


['-1' '50901']
['-1' '50890']
['-1' '50897']
['-1' '51516']
['-1' '50905']
['53021' '-1']
['-1' '50883']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   LeftSideFirst   3783 non-null   float64
 1   LeftSideLast    3783 non-null   float64
 2   RightSideFirst  3879 non-null   float64
 3   RightSideLast   3879 non-null   float64
 4   Length          4999 non-null   float64
dtypes: float64(5)
memory usage: 195.4 KB


Unnamed: 0,LeftSideFirst,LeftSideLast,RightSideFirst,RightSideLast,Length
0,,,1.0,4341.0,12473.524269
1,2.0,2.0,,,5454.575052
2,5.0,5.0,12.0,12.0,10279.898141
3,8.0,8.0,36.0,25.0,5882.996800
4,3.0,3.0,,,22700.651155
...,...,...,...,...,...
4994,599.0,401.0,15866.0,400.0,19092.205927
4995,,,169.0,19.0,1063.656429
4996,300.0,300.0,,,13637.443042
4997,881001.0,881001.0,880904.0,880902.0,3925.939702


In [7]:
# Transform primitives handed to multi_level_feature_creation, one inner list per level.
features_per_level = [
    ['subtract_numeric', 'add_numeric', 'multiply_numeric', 'divide_numeric'],
    ['absolute'],
    ['divide_numeric'],
]
# Alternative primitive list kept for reference:
# ['add_numeric', 'multiply_numeric', 'subtract_numeric', 'divide_numeric', 'multiply_numeric_scalar']

# NOTE: `features` is both input and output here, so re-running this cell
# expands the already expanded frame; re-run the loading cell first.
features, _ = multi_level_feature_creation(features, features_per_level)
# Optional hand-written features, currently disabled:
# features = static_features(features)

# Re-attach the columns that were set aside before cleaning.
features = pd.concat([features, categories_features], axis=1)

# Disabled export / inspection helpers:
# features.to_csv('../data/output_6K_5R_HNR_set6_out.csv')
# eda(data)
# print(features.info())
# print(features.describe().transpose())

Level:  0
Columns:  Index(['LeftSideFirst', 'LeftSideLast', 'RightSideFirst', 'RightSideLast',
       'Length'],
      dtype='object')
['subtract_numeric', 'add_numeric', 'multiply_numeric', 'divide_numeric']
(4999, 55)
Level:  1
Columns:  Index(['LeftSideFirst', 'LeftSideLast', 'RightSideFirst', 'RightSideLast',
       'Length', 'LeftSideFirst + LeftSideLast', 'LeftSideFirst + Length',
       'LeftSideFirst + RightSideFirst', 'LeftSideFirst + RightSideLast',
       'LeftSideLast + Length', 'LeftSideLast + RightSideFirst',
       'LeftSideLast + RightSideLast', 'Length + RightSideFirst',
       'Length + RightSideLast', 'RightSideFirst + RightSideLast',
       'LeftSideFirst/LeftSideLast', 'LeftSideFirst/Length',
       'LeftSideFirst/RightSideFirst', 'LeftSideFirst/RightSideLast',
       'LeftSideLast/LeftSideFirst', 'LeftSideLast/Length',
       'LeftSideLast/RightSideFirst', 'LeftSideLast/RightSideLast',
       'Length/LeftSideFirst', 'Length/LeftSideLast', 'Length/RightSideFirst',


In [None]:
# Per label column, print "<zeros>, <ones> <zeros + ones>".
for index, column in enumerate(labels.columns.values):
    rule_flags = labels[f"ViolatedRule{index}"]
    zeros = int((rule_flags == 0).sum())
    ones = int((rule_flags == 1).sum())
    print(f'{zeros}, {ones} {zeros + ones}')



In [28]:
# Model generation per rule.
# Only the column at position 0 is evaluated in this loop: results of
# successive runs overwrite each other, so the remaining rules are run
# one per cell instead.
for index, column in enumerate(labels.columns.values):
    if not column.startswith('ViolatedRule'):
        continue
    if index != 0:
        continue
    eval_model(features,labels,column)

In [None]:
# Train, tune and evaluate the classifier for label column 'ViolatedRule1'.
eval_model(features,labels,'ViolatedRule1')

In [None]:
# Train, tune and evaluate the classifier for label column 'ViolatedRule2'.
eval_model(features,labels,'ViolatedRule2')

In [None]:
# Train, tune and evaluate the classifier for label column 'ViolatedRule3'.
eval_model(features,labels,'ViolatedRule3')

In [None]:
# Train, tune and evaluate the classifier for label column 'ViolatedRule4'.
eval_model(features,labels,'ViolatedRule4')

In [None]:
# Train, tune and evaluate the classifier for label column 'ViolatedRule5'.
eval_model(features,labels,'ViolatedRule5')

In [None]:
# Train, tune and evaluate the classifier for label column 'ViolatedRule6'.
eval_model(features,labels,'ViolatedRule6')