In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

from feature_engine.selection import (
    DropConstantFeatures,
    DropDuplicateFeatures, 
    SmartCorrelatedSelection)

In [4]:
# Read the raw dataset from disk; the trailing expression displays its
# (rows, columns) shape as the cell output.
dataset_path = '../datasets/dataset_1.csv'
data = pd.read_csv(dataset_path)
data.shape

(50000, 301)

In [5]:
# Separate the predictors from the 'target' column, then hold out 30% of the
# rows as a test set. random_state is fixed so the split is repeatable.
X_all = data.drop(columns='target')
y_all = data['target']

X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.3,
    random_state=0,
)
X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [6]:
# Feature-selection pipeline: three feature_engine selectors applied in order
# (constant features at tol=0.998, duplicated features, then correlated
# features resolved with selection_method='variance').
selection_steps = [
    ('constant', DropConstantFeatures(tol=0.998)),
    ('duplicated', DropDuplicateFeatures()),
    ('correlated', SmartCorrelatedSelection(selection_method='variance')),
]
pipe = Pipeline(steps=selection_steps)

# Fit on the training split only; the fitted pipeline is the cell's output.
pipe.fit(X_train)

Pipeline(steps=[('constant',
                 DropConstantFeatures(tol=0.998,
                                      variables=['var_1', 'var_2', 'var_3',
                                                 'var_4', 'var_5', 'var_6',
                                                 'var_7', 'var_8', 'var_9',
                                                 'var_10', 'var_11', 'var_12',
                                                 'var_13', 'var_14', 'var_15',
                                                 'var_16', 'var_17', 'var_18',
                                                 'var_19', 'var_20', 'var_21',
                                                 'var_22', 'var_23', 'var_24',
                                                 'var_25', 'var_26', 'var_27',
                                                 'var_28', 'var_29', 'var_30', ...])),
                ('duplicated...
                ('correlated',
                 SmartCorrelatedSelection(selection_method='variance',

In [7]:
# Apply the fitted selection pipeline to both splits.
# NOTE: X_train / X_test are overwritten here, so re-running this cell feeds
# the already-reduced frames back into the pipeline; run it once per session.
X_train, X_test = pipe.transform(X_train), pipe.transform(X_test)

X_train.shape, X_test.shape

((35000, 78), (15000, 78))

In [10]:
# Standardise the selected features. The scaler is fitted on the training
# split only and then applied to both splits (the inputs are overwritten).
scaler = StandardScaler()
scaler.fit(X_train)

X_test = scaler.transform(X_test)
X_train = scaler.transform(X_train)

In [11]:
# Fit a logistic regression on the training split and report ROC-AUC for the
# train and test splits, in that order.
logit = LogisticRegression(random_state=44, max_iter=500)
logit.fit(X_train, y_train)

for split_name, X_split, y_split in (
    ("Train set", X_train, y_train),
    ("Test set", X_test, y_test),
):
    print(split_name)
    pred = logit.predict_proba(X_split)
    # Column 1 of predict_proba is the score handed to roc_auc_score.
    print(f"roc-auc: {roc_auc_score(y_split, pred[:, 1])}")


Train set
roc-auc: 0.791963857142462
Test set
roc-auc: 0.7886079390512362
