In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

In [2]:
X_train = pd.read_csv('/Users/patrycjapiechowicz/CYBER/xtrain.csv')
X_test = pd.read_csv('/Users/patrycjapiechowicz/CYBER/xtest.csv')

In [3]:
# I keep a copy of the dataset with all the variables
# to measure the performance of machine learning models

X_train_original = X_train.copy()
X_test_original = X_test.copy()

In [4]:
y_train = X_train['label'].values.reshape(-1,1)
y_test = X_test['label'].values.reshape(-1,1)

In [5]:
X_train = X_train.drop('label',axis=1)
X_test =X_test.drop('label',axis=1)

In [6]:
y_train.shape,y_test.shape, X_train.shape, X_test.shape

((18000, 1), (2000, 1), (18000, 180), (2000, 180))

In [7]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error

#### remove quasi-constant features

In [8]:
# remove quasi-constant features
sel = VarianceThreshold(
    threshold=0.001)  # 0.01 indicates 99,9% of observations approximately

sel.fit(X_train)  # fit finds the features with low variance

sum(sel.get_support()) # how many not quasi-constant?

172

In [9]:
features_to_keep = X_train.columns[sel.get_support()]

In [10]:
# we can then remove the features:
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape

((18000, 172), (2000, 172))

In [11]:
# I keep a copy of the dataset except constant and duplicated variables
# to measure the performance of machine learning model

X_train_basic_filter = X_train.copy()
X_test_basic_filter = X_test.copy()

In [12]:
# here I transform the arrays back to dataframes

X_train= pd.DataFrame(X_train)
X_train.columns = features_to_keep

X_test= pd.DataFrame(X_test)
X_test.columns = features_to_keep

#### remove correlated features

In [13]:
# find and remove correlated features
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train, 0.9)
print('correlated features: ', len(set(corr_features)) )

correlated features:  91


In [14]:
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((18000, 81), (2000, 81))

In [15]:
# create a function to build random forests and compare performance in train and test set

def run_randomForests(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [16]:
# original
run_randomForests(X_train_original.drop(labels=['label'], axis=1),
                  X_test_original.drop(labels=['label'], axis=1),
                  y_train, y_test)

  """


Train set
Random Forests roc-auc: 0.9381711020795165
Test set
Random Forests roc-auc: 0.934856231821437


In [17]:
# filter methods - remove quasi
run_randomForests(X_train_basic_filter,
                  X_test_basic_filter,
                  y_train, y_test)

  """


Train set
Random Forests roc-auc: 0.9387507592630087
Test set
Random Forests roc-auc: 0.9360454649111225


In [18]:
# filter methods - correlation
run_randomForests(X_train,
                  X_test,
                  y_train, y_test)

  """


Train set
Random Forests roc-auc: 0.9458391152548902
Test set
Random Forests roc-auc: 0.9414555252829554


We can see that removing correlated feature improve performence of the model

#### check the same for logistic regression

In [19]:
# create a function to build logistic regression and compare performance in train and test set

def run_logistic(X_train, X_test, y_train, y_test):
    # function to train and test the performance of logistic regression
    logit = LogisticRegression(random_state=44)
    logit.fit(X_train, y_train)
    print('Train set')
    pred = logit.predict_proba(X_train)
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    print('Test set')
    pred = logit.predict_proba(X_test)
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [20]:
# original

run_logistic(X_train_original.drop(labels=['label'], axis=1),
             X_test_original.drop(labels=['label'], axis=1),
                  y_train, y_test)

  return f(**kwargs)


Train set
Logistic Regression roc-auc: 0.961315129951259
Test set
Logistic Regression roc-auc: 0.9573406387651979


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [21]:
# filter methods - remove quasi

run_logistic(X_train_basic_filter,
             X_test_basic_filter,
                  y_train, y_test)

  return f(**kwargs)


Train set
Logistic Regression roc-auc: 0.9613027595198
Test set
Logistic Regression roc-auc: 0.9581968065740886


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [22]:
# filter methods - correlation
run_logistic(X_train,
             X_test,
                  y_train, y_test)

  return f(**kwargs)


Train set
Logistic Regression roc-auc: 0.9500807905224223
Test set
Logistic Regression roc-auc: 0.9495931202515693


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [26]:
X_train_basic_filter =pd.DataFrame(X_train_basic_filter)

In [27]:
selected_feats=X_train_basic_filter.columns

In [28]:
len(selected_feats)

172

In [29]:
pd.Series(selected_feats).to_csv('selected_features.csv', index=False)