In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from utils import load_data,features_to_drop,numerical_features
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,RobustScaler
from category_encoders import CatBoostEncoder
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,IsolationForest
from sklearn.metrics import recall_score,precision_recall_curve,precision_score,plot_precision_recall_curve,f1_score
from sklearn.model_selection import cross_val_score,train_test_split
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import LocalOutlierFactor
from imblearn.ensemble import RUSBoostClassifier,BalancedRandomForestClassifier,EasyEnsembleClassifier
np.random.seed(42)

  from tqdm.autonotebook import tqdm


In [2]:
df = load_data()
df = df.drop(features_to_drop,axis=1)

  0%|          | 0/9 [00:00<?, ?it/s]

In [3]:
label_col = 'label'
cat_features = df.columns.difference(numerical_features+[label_col])

In [4]:
ohe_cols = []
target_encoding_cols = []
for col in cat_features:
    if df[col].nunique()<=10:
        ohe_cols.append(col)
    else:
        target_encoding_cols.append(col)
print(f"ohe cols: {ohe_cols}")
print(f"taget encoding cols: {target_encoding_cols}")

ohe cols: ['feature_16', 'feature_18', 'feature_9']
taget encoding cols: ['feature_13', 'feature_14', 'feature_17', 'feature_20', 'feature_5', 'feature_6', 'feature_7']


In [5]:
X=df.drop('label',axis=1)
y= df['label']

In [6]:
cbe = CatBoostEncoder(return_df=False)
ohe = OneHotEncoder(sparse=True,dtype=np.uint8,handle_unknown='infrequent_if_exist')
scaler = RobustScaler()
column_transformer = ColumnTransformer(transformers=[('cat_enc',cbe,target_encoding_cols),('ohe',ohe,cat_features),
                                                     ('robust_scaler',scaler,numerical_features)]
                                       ,remainder='drop')
column_transformer

# Outlier Detection Pipeline
### Not used due to lack of computing resources

In [10]:
outlier_detector = LocalOutlierFactor()
outlier_detection_pipeline =  Pipeline(steps=[('transformer',column_transformer),('detector',outlier_detector)])
outlier_detection_pipeline

# Random Forest Pipeline

In [7]:
rf_model = LGBMClassifier(class_weight={0:1,1:5},n_jobs=7,n_estimators=100,boosting_type='rf',
                          bagging_freq=1,bagging_fraction=0.7)
rf_pipeline = Pipeline(steps=[('transformer',column_transformer),('model',rf_model)])
rf_pipeline

In [11]:
cross_val_score(rf_pipeline,X=X,y=y,cv=3,scoring='f1',n_jobs=7)

array([0.38046161, 0.3866546 , 0.39961924])

In [8]:
balanced_rf_model = BalancedRandomForestClassifier(n_jobs=7)
balanced_rf_pipeline = Pipeline(steps=[('transformer',column_transformer),('model',balanced_rf_model)])
balanced_rf_pipeline

In [None]:
cross_val_score(balanced_rf_pipeline,X=X,y=y,cv=3,scoring='f1',n_jobs=7)

In [9]:
easy_model = EasyEnsembleClassifier(n_jobs=7)
easy_pipeline = Pipeline(steps=[('transformer',column_transformer),('model',easy_model)])
easy_pipeline

In [None]:
cross_val_score(easy_pipeline,X=X,y=y,cv=3,scoring='f1',n_jobs=7)