In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error

In [3]:
X_train = pd.read_csv('/Users/patrycjapiechowicz/CYBER/xtrain.csv')
X_test = pd.read_csv('/Users/patrycjapiechowicz/CYBER/xtest.csv')

In [4]:
# I keep a copy of the dataset with all the variables
# to measure the performance of machine learning models

X_train_original = X_train.copy()
X_test_original = X_test.copy()

In [5]:
y_train=X_train.iloc[:,0]
y_test=X_test.iloc[:,0]
X_train=X_train.iloc[:,1:]
X_test=X_test.iloc[:,1:]

In [6]:
y_train.shape,y_test.shape, X_train.shape, X_test.shape

((5400,), (600,), (5400, 178), (600, 178))

#### remove quasi-constant features

In [7]:
# remove quasi-constant features
sel = VarianceThreshold(
    threshold=0.001)  # 0.01 indicates 99,9% of observations approximately

sel.fit(X_train)  # fit finds the features with low variance

sum(sel.get_support()) # how many not quasi-constant?

89

In [8]:
features_to_keep = X_train.columns[sel.get_support()]

In [9]:
# we can then remove the features:
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape

((5400, 89), (600, 89))

In [10]:
# I keep a copy of the dataset except constant and duplicated variables
# to measure the performance of machine learning model

X_train_basic_filter = X_train.copy()
X_test_basic_filter = X_test.copy()

In [11]:
# here I transform the arrays back to dataframes

X_train= pd.DataFrame(X_train)
X_train.columns = features_to_keep

X_test= pd.DataFrame(X_test)
X_test.columns = features_to_keep

In [12]:
X_train_basic_filter.shape

(5400, 89)

#### remove correlated features

In [13]:
# find and remove correlated features
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train, 0.9)
print('correlated features: ', len(set(corr_features)) )

correlated features:  9


In [14]:
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((5400, 80), (600, 80))

In [15]:
# create a function to build random forests and compare performance in train and test set

def run_randomForests(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [16]:
# original
run_randomForests(X_train_original.iloc[:,1:],
                  X_test_original.iloc[:,1:],
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.940740507534217
Test set
Random Forests roc-auc: 0.9354705665971843


In [17]:
# filter methods - remove quasi
run_randomForests(X_train_basic_filter,
                  X_test_basic_filter,
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9367704408261768
Test set
Random Forests roc-auc: 0.9307108381357916


In [18]:
# filter methods - correlation
run_randomForests(X_train,
                  X_test,
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9369557028323343
Test set
Random Forests roc-auc: 0.9309226293319659


We can see that removing correlated feature improve performence of the model

#### check the same for logistic regression

In [19]:
# create a function to build logistic regression and compare performance in train and test set

def run_logistic(X_train, X_test, y_train, y_test):
    # function to train and test the performance of logistic regression
    logit = LogisticRegression(random_state=44)
    logit.fit(X_train, y_train)
    print('Train set')
    pred = logit.predict_proba(X_train)
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    print('Test set')
    pred = logit.predict_proba(X_test)
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [20]:
# original

run_logistic(X_train_original.iloc[:,1:],
                  X_test_original.iloc[:,1:],
                  y_train, y_test)

Train set
Logistic Regression roc-auc: 0.9148846944637868
Test set
Logistic Regression roc-auc: 0.908561937777976


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [21]:
# filter methods - remove quasi

run_logistic(X_train_basic_filter,
             X_test_basic_filter,
                  y_train, y_test)

Train set
Logistic Regression roc-auc: 0.9117055408223904
Test set
Logistic Regression roc-auc: 0.9048500183923934


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [22]:
# filter methods - correlation
run_logistic(X_train,
             X_test,
                  y_train, y_test)

Train set
Logistic Regression roc-auc: 0.9111361053632563
Test set
Logistic Regression roc-auc: 0.9026763719053403


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [23]:
X_train_basic_filter =pd.DataFrame(X_train_basic_filter)

In [24]:
selected_feats=X_train_basic_filter.columns

In [25]:
len(selected_feats)

89

In [26]:
pd.Series(selected_feats).to_csv('selected_features.csv', index=False)

In [27]:
X_train

Unnamed: 0,1,2,3,4,5,6,7,8,9,11,...,161,163,168,170,171,173,174,176,177,178
0,0.0,0.105263,0.000000,0.000000,0.326206,0.333333,0.0,0.0,0.0,0.363014,...,0.000010,0.25,0.0,0.000000,0.000000,0.000000,0.000000,0.000821,0.033743,0.0
1,0.0,0.105263,0.222222,0.666667,0.877500,0.000000,1.0,1.0,1.0,0.308515,...,0.000000,0.25,0.0,0.000000,0.000000,0.000000,0.000000,0.004024,0.001151,0.0
2,1.0,0.263158,0.333333,0.500000,0.884415,0.833333,1.0,1.0,1.0,0.354475,...,0.000000,0.00,0.0,0.000000,0.000000,0.000000,0.000000,0.024283,0.004260,0.0
3,1.0,0.263158,0.333333,0.500000,0.884415,0.833333,1.0,1.0,1.0,0.354475,...,0.000000,0.00,0.0,0.000000,0.000000,0.000000,0.000000,0.024448,0.004260,0.0
4,1.0,0.210526,0.333333,0.600000,0.885643,0.800000,1.0,1.0,1.0,0.332880,...,0.000000,0.00,0.0,0.000000,0.000000,0.000000,0.000000,0.012484,0.038461,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5395,0.0,0.105263,0.111111,0.333333,0.810691,0.000000,1.0,1.0,1.0,0.273060,...,0.000000,0.00,0.0,0.000000,0.707071,0.000000,0.000000,0.006215,0.005270,0.0
5396,1.0,0.105263,0.111111,0.333333,0.888834,0.666667,1.0,1.0,0.0,0.317353,...,0.000190,0.00,0.0,0.000000,0.000000,0.069603,0.000278,0.025953,0.000577,0.0
5397,0.0,0.210526,0.111111,0.200000,0.751014,0.200000,1.0,1.0,1.0,0.227119,...,0.002764,0.00,0.0,0.000445,0.000000,0.000000,0.000000,0.001232,0.029832,1.0
5398,0.0,0.157895,0.222222,0.500000,0.739780,0.000000,1.0,1.0,0.0,0.329738,...,0.000000,0.00,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000699,0.0
