# Statistical Tests and Feature Engineering

In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
import plotly.express as px
from plotly.subplots import make_subplots
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
import plotly.graph_objects as go
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
import warnings
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import multiprocessing as mp

warnings.filterwarnings("ignore")


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.


The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).


The sklearn.neighbors.base module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.neighbors. Anything that cannot be imported from sklearn.neighbors is now part of the private API.



In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd 'drive/My Drive'

/content/drive/My Drive


In [4]:
df = pd.read_csv("data/avazu_ctr/avazu_train.csv")

In [5]:
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,hour_of_day,day_of_week
0,10015140740686523448,0,2014-10-21 00:00:00,1005,0,85f751fd,c4e18dd6,50e219e0,c51f82bc,d9b5648e,0f2161f8,a99f214a,2d227840,9b5ce758,1,0,21611,320,50,2480,3,297,100111,61,00:00,Tuesday
1,10070328440095985756,1,2014-10-21 00:00:00,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,31d95183,711ee120,1,0,15701,320,50,1722,0,35,100084,79,00:00,Tuesday
2,10093977800236804132,1,2014-10-21 00:00:00,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,07875ea4,24f6b932,1,0,15704,320,50,1722,0,35,-1,79,00:00,Tuesday
3,10104245282042838695,0,2014-10-21 00:00:00,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,cd3c8548,8a4875bd,1,0,15701,320,50,1722,0,35,100084,79,00:00,Tuesday
4,10105971003478261107,0,2014-10-21 00:00:00,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,75bb1b58,2ee63ff8,1,0,15701,320,50,1722,0,35,-1,79,00:00,Tuesday


## 1. Data Cleaning

We know that not all columns are needed for our prediction and hence we should drop columns which are not necessary. We drop hour column since we had earlier already created a **hour_of_day** column along with **day_of_week** column. Since our data is for only spanning 10 days, the month and year trend can't be captured hence no point including them in our analysis. However, we will be interested in the **day** of the week which has higher ad clicks hence, we will day the days. Likewise, how does user activity vary hourly is very crucial for our analysis. We have earlier observed that the minutes and the seconds are not contributing to our analysis since our data was captured hourly daily. Hence we neglect them too. Finally we drop the **id** column too.

In [6]:
# drop unwanted columns using subjective analysis
cols = list(df.columns)
if any(col in cols for col in ["id", "hour"]):
  df = df.drop(["id", "hour"], axis=1)

We would also like to make sure that the feature **hour_of_day** only has hours to represent and not the minutes.

In [7]:
df["hour_of_day"] = df["hour_of_day"].apply(lambda x: int(x.split(":")[0]))
df.head()

Unnamed: 0,click,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,hour_of_day,day_of_week
0,0,1005,0,85f751fd,c4e18dd6,50e219e0,c51f82bc,d9b5648e,0f2161f8,a99f214a,2d227840,9b5ce758,1,0,21611,320,50,2480,3,297,100111,61,0,Tuesday
1,1,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,31d95183,711ee120,1,0,15701,320,50,1722,0,35,100084,79,0,Tuesday
2,1,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,07875ea4,24f6b932,1,0,15704,320,50,1722,0,35,-1,79,0,Tuesday
3,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,cd3c8548,8a4875bd,1,0,15701,320,50,1722,0,35,100084,79,0,Tuesday
4,0,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,75bb1b58,2ee63ff8,1,0,15701,320,50,1722,0,35,-1,79,0,Tuesday


### 2. Augmenting Categorial features

We need to encode our categorical features into numeric values prior to applying our models. The **day_of_week** feature is **non-ordinal** in nature i.e. it is not having any order of importance. All days are equal. Likewise, other features which are non-integer in named needs to be hashed using inbuilt hash function. Since our data is high in cardinality, we cannot afford to have **one-hot encoding** since it will lead to higher memory consumption. Likewise, **LabelEncoding** i.e. representing categorical data uniquely with integers too is not a good option since it will cause our data to be ordinal in nature even though the variables are non-ordinal.

In [8]:
def convert_obj_to_int(fm):
    
    object_list_columns = fm.columns
    object_list_dtypes = fm.dtypes
    for index in range(0,len(object_list_columns)):
        if object_list_dtypes[index] == object :
            fm[object_list_columns[index]] = fm[object_list_columns[index]].apply(lambda x: hash(x))
    return fm

In [9]:
df_hashed = convert_obj_to_int(df)
df_hashed.head()

Unnamed: 0,click,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,hour_of_day,day_of_week
0,0,1005,0,6636073327342313167,-7264455599007738244,-1270317549164266019,-1236225353438669972,458967296862015535,-2926036758986816391,3322046748593014864,-7646517992521781089,-3775073246535163519,1,0,21611,320,50,2480,3,297,100111,61,0,4985160792369302704
1,1,1005,0,9029878845803310522,-1751450577693508950,-3458009864878840098,5245925069379030791,697249892247788833,-501993329772894604,3322046748593014864,-7258891613984259436,4835754152098493122,1,0,15701,320,50,1722,0,35,100084,79,0,4985160792369302704
2,1,1005,0,9029878845803310522,-1751450577693508950,-3458009864878840098,5245925069379030791,697249892247788833,-501993329772894604,3322046748593014864,2859050573829177882,3059407104933253372,1,0,15704,320,50,1722,0,35,-1,79,0,4985160792369302704
3,0,1005,0,9029878845803310522,-1751450577693508950,-3458009864878840098,5245925069379030791,697249892247788833,-501993329772894604,3322046748593014864,2529484916752111719,-3921586124701728923,1,0,15701,320,50,1722,0,35,100084,79,0,4985160792369302704
4,0,1005,0,9029878845803310522,-1751450577693508950,-3458009864878840098,5245925069379030791,697249892247788833,-501993329772894604,3322046748593014864,-722582553155895201,-3118362136100252713,1,0,15701,320,50,1722,0,35,-1,79,0,4985160792369302704


### 3. Scaling our features using Standard Scalar

We will use Standardization to normalize our data instead of Normalization. While Normalization will help us rescale data between 0-1 without and is an ideal choice when we don't know data distribution, it is wise to do Standardization where we center the data around the mean with a standard deviation of 1, to satisfy the assumptions of our models which we eventually would like to implement. 

In [10]:
fs = list(df_hashed.columns)
if 'click' in fs:
  fs.remove('click')
scaler = StandardScaler()
df_hashed[fs] = scaler.fit_transform(df_hashed[fs])
df_hashed.head()

Unnamed: 0,click,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,hour_of_day,day_of_week
0,0,0.030699,-0.572991,0.587036,-0.880261,-0.600083,-0.867236,-0.348618,-0.710268,0.232425,-1.442235,-0.786082,-0.029088,-0.387155,0.558921,0.055663,-0.21214,0.603074,1.178075,0.196998,0.935665,-0.313272,-1.891048,0.200261
1,1,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,0.232425,-1.369111,0.934412,-0.029088,-0.387155,-0.627567,0.055663,-0.21214,-0.636433,-1.083952,-0.547876,0.935125,-0.056512,-1.891048,0.200261
2,1,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,0.232425,0.53959,0.579488,-0.029088,-0.387155,-0.626965,0.055663,-0.21214,-0.636433,-1.083952,-0.547876,-1.068726,-0.056512,-1.891048,0.200261
3,0,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,0.232425,0.477419,-0.815356,-0.029088,-0.387155,-0.627567,0.055663,-0.21214,-0.636433,-1.083952,-0.547876,0.935125,-0.056512,-1.891048,0.200261
4,0,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,0.232425,-0.136068,-0.654867,-0.029088,-0.387155,-0.627567,0.055663,-0.21214,-0.636433,-1.083952,-0.547876,-1.068726,-0.056512,-1.891048,0.200261


In [11]:
df_hashed.describe()

Unnamed: 0,click,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,hour_of_day,day_of_week
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.17126,-1.144335e-14,-2.977798e-14,-6.195348e-15,-4.14802e-15,3.185711e-15,-7.946805e-14,1.053889e-13,1.961295e-14,-4.531193e-15,-8.588685e-18,1.668387e-15,-1.238234e-15,6.85677e-15,-1.544346e-14,-1.924271e-14,2.923356e-14,-2.609059e-14,3.196286e-14,-3.711014e-14,-1.170855e-14,-3.391664e-14,4.256792e-14,3.385203e-13
std,0.376737,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003
min,0.0,-3.639226,-0.572991,-2.390623,-1.291624,-2.399326,-2.54774,-4.389323,-3.944536,-4.631495,-1.739692,-1.874204,-1.922823,-0.3871553,-3.704406,-9.639088,-0.850558,-3.269157,-1.083952,-0.5535619,-1.068726,-1.169139,-1.891048,-1.730536
25%,0.0,0.03069892,-0.572991,-0.6017283,-0.8802608,-0.6000825,0.03897716,-0.2444937,-0.710268,0.232425,-0.8577505,-0.8492229,-0.02908778,-0.3871553,-0.3828414,0.05566338,-0.2121396,-0.4058648,-1.083952,-0.5478758,-1.068726,-0.8553214,-0.8852751,-1.506726
50%,0.0,0.03069892,-0.572991,0.5870364,-0.3378638,-0.6000825,0.5070533,-0.2444937,0.5405979,0.232425,-0.03035695,0.06688163,-0.02908778,-0.3871553,0.3049603,0.05566338,-0.2121396,0.3463422,0.4240659,-0.5365037,0.9344442,-0.3132721,-0.04713134,0.2002609
75%,0.0,0.03069892,1.416699,0.5870364,0.5007077,1.100537,0.5070533,-0.1073489,0.5405979,0.232425,0.862737,0.7568222,-0.02908778,-0.3871553,0.6157359,0.05566338,-0.2121396,0.6782944,1.178075,-0.1612237,0.9353251,0.2430416,0.7910124,0.8385049
max,1.0,6.453067,13.35484,1.070778,2.604638,1.744093,1.350144,3.43114,5.390037,2.520539,1.740182,1.80969,7.545854,5.46996,1.047168,34.18119,20.51518,1.056033,1.178075,4.580951,0.9384084,2.454032,1.964414,0.9096897


### 4. Multi-collinearity test using VIF Analysis and Correlation Heatmap

In [12]:
corr = df_hashed.corr()
cols = list(corr.index)
cm = np.corrcoef(df_hashed[cols].values.T)
fig = go.Figure(data=go.Heatmap(z=cm, x=cols, y=cols, colorscale="Viridis"))
fig.update_layout(height=800, width=800)
fig.show()

From the above correlation heatmap, we can get an idea of which variables correlate the most with other variables. We need to identify quantitatively those variables which can be represented by other variables and hence act as redundant information in our dataset. We therefor apply VIF analysis further to observe the multi-collinearity effects.

Having applied VIF on all the columns in the dataframe, we can observe that feature **C17** has VIF value of **23.399** while **C14** has a VIF value of **23.174**. VIF values start from 1 and has no limits. VIF values greater than 5-10 indicate higher multi-collinearity. Hence we need to drop either one of the columns between **C17** and **C14**. 

Likewise, column **device_type** has VIF value of **5.755** and **C1** has VIF value of **5.366**. Both of them are high correlated with each other. Our heatmap too reveals this. If we observe carefully, variables C17 and C14 are represented by **yellow** squares and likewise, C1 and device_type too have yellow squares. All the 4 yellow squares point to the above discussed 4 features. We need to drop 2 features to remove multi-collinearity problem.

In [13]:
def calc_vif(X):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return vif

X = df_hashed.iloc[:,1:]
calc_vif(X)

Unnamed: 0,variables,VIF
0,C1,5.528558
1,banner_pos,1.720106
2,site_id,1.862456
3,site_domain,1.727549
4,site_category,2.199974
5,app_id,2.014195
6,app_domain,1.191456
7,app_category,1.911171
8,device_id,1.088644
9,device_ip,1.000087


In [14]:
df_hashed = df_hashed.drop(["C17", "device_type"], axis=1)
df_hashed.head()

Unnamed: 0,click,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_conn_type,C14,C15,C16,C18,C19,C20,C21,hour_of_day,day_of_week
0,0,0.030699,-0.572991,0.587036,-0.880261,-0.600083,-0.867236,-0.348618,-0.710268,0.232425,-1.442235,-0.786082,-0.387155,0.558921,0.055663,-0.21214,1.178075,0.196998,0.935665,-0.313272,-1.891048,0.200261
1,1,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,0.232425,-1.369111,0.934412,-0.387155,-0.627567,0.055663,-0.21214,-1.083952,-0.547876,0.935125,-0.056512,-1.891048,0.200261
2,1,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,0.232425,0.53959,0.579488,-0.387155,-0.626965,0.055663,-0.21214,-1.083952,-0.547876,-1.068726,-0.056512,-1.891048,0.200261
3,0,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,0.232425,0.477419,-0.815356,-0.387155,-0.627567,0.055663,-0.21214,-1.083952,-0.547876,0.935125,-0.056512,-1.891048,0.200261
4,0,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,0.232425,-0.136068,-0.654867,-0.387155,-0.627567,0.055663,-0.21214,-1.083952,-0.547876,-1.068726,-0.056512,-1.891048,0.200261


In [15]:
X = df_hashed.iloc[:,1:]
calc_vif(X)

Unnamed: 0,variables,VIF
0,C1,1.321258
1,banner_pos,1.619498
2,site_id,1.861494
3,site_domain,1.722835
4,site_category,2.169436
5,app_id,2.013991
6,app_domain,1.190211
7,app_category,1.90475
8,device_id,1.067362
9,device_ip,1.00008


In [16]:
df_hashed.head()

Unnamed: 0,click,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_conn_type,C14,C15,C16,C18,C19,C20,C21,hour_of_day,day_of_week
0,0,0.030699,-0.572991,0.587036,-0.880261,-0.600083,-0.867236,-0.348618,-0.710268,0.232425,-1.442235,-0.786082,-0.387155,0.558921,0.055663,-0.21214,1.178075,0.196998,0.935665,-0.313272,-1.891048,0.200261
1,1,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,0.232425,-1.369111,0.934412,-0.387155,-0.627567,0.055663,-0.21214,-1.083952,-0.547876,0.935125,-0.056512,-1.891048,0.200261
2,1,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,0.232425,0.53959,0.579488,-0.387155,-0.626965,0.055663,-0.21214,-1.083952,-0.547876,-1.068726,-0.056512,-1.891048,0.200261
3,0,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,0.232425,0.477419,-0.815356,-0.387155,-0.627567,0.055663,-0.21214,-1.083952,-0.547876,0.935125,-0.056512,-1.891048,0.200261
4,0,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,0.232425,-0.136068,-0.654867,-0.387155,-0.627567,0.055663,-0.21214,-1.083952,-0.547876,-1.068726,-0.056512,-1.891048,0.200261


### 5. Randomly shuffle data and split Data and Labels

In [17]:
cols = list(df_hashed.columns)
y = df_hashed["click"]
cols.remove('click')
X = df_hashed[cols]
print("Training Data shape: {}".format(X.shape))
print("Testing Data shape: {}".format(y.shape))

Training Data shape: (200000, 21)
Testing Data shape: (200000,)


### 6. Feature Selection

Here we will select important features which are statistically significant and have good variance so as to correctly describe our data. We will use 2 methods for this work:

#### 6.1 Feature Elimination with Low Variance using VarianceThreshold

In this method, the intuition is to remove features which do not vary across instances. Hence we would like to retain features which are atleast having a minimum threshold of variance. In our case, we intend to discard features which don't have a minimum of 40% variance.

In [18]:
var = VarianceThreshold(threshold=0.4)
var = var.fit(X,y)
sups = var.get_support(indices=True)
sups

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20])

Having applied Variance Threshold based feature selection method, it looks like none of our data has less variance and that none of the features are eliminated. We will apply one more technique to confirm this. Let us apply **K-Best Fit** feature selection using ANOVA scoring mechanism

#### 6.2 K-Best Fit feature selection using ANOVA scoring

In [19]:
df_hashed[cols] = scaler.fit_transform(df_hashed[cols])
KBest = SelectKBest(score_func=f_classif, k=15)
KBest = KBest.fit(X,y)

In [20]:
df_scores = pd.DataFrame({'features': X.columns, 'f_classif': KBest.scores_, 'pValue': KBest.pvalues_ })
df_scores

Unnamed: 0,features,f_classif,pValue
0,C1,246.596678,1.5476280000000001e-55
1,banner_pos,135.708425,2.3655040000000003e-31
2,site_id,520.297165,5.118703e-115
3,site_domain,680.778735,8.078772e-150
4,site_category,606.433718,1.0576179999999999e-133
5,app_id,1231.992548,4.512765e-269
6,app_domain,1127.960268,1.3529399999999998e-246
7,app_category,832.717007,9.891655999999999e-183
8,device_id,31.496654,2.000476e-08
9,device_ip,0.383848,0.535552


In [21]:
kb = KBest.get_support(indices=True)
kb

array([ 0,  1,  2,  3,  4,  5,  6,  7, 11, 12, 13, 14, 15, 17, 18])

In [22]:
df_reduced = df_hashed[list(map(cols.__getitem__, kb))]
df_reduced.head()

Unnamed: 0,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_conn_type,C14,C15,C16,C18,C20,C21
0,0.030699,-0.572991,0.587036,-0.880261,-0.600083,-0.867236,-0.348618,-0.710268,-0.387155,0.558921,0.055663,-0.21214,1.178075,0.935665,-0.313272
1,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,-0.387155,-0.627567,0.055663,-0.21214,-1.083952,0.935125,-0.056512
2,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,-0.387155,-0.626965,0.055663,-0.21214,-1.083952,-1.068726,-0.056512
3,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,-0.387155,-0.627567,0.055663,-0.21214,-1.083952,0.935125,-0.056512
4,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,-0.387155,-0.627567,0.055663,-0.21214,-1.083952,-1.068726,-0.056512


With K-Best features, we decided to use the ANOVA scoring mechanism since our output is categorical and also our input features are have a gaussian distribution due to standardization. Having standardized our data they have negative as well as positive values. For K-Best selection, Chi-Square scoring does not work with negative values, hence we make use of the ANOVA scoring. We can see the P-Values for each feature in our dataset.

P-values help us determine how statistically significant each feature is with respect to target variable. Hence, any value which is less than 0.05 is considered significant and hence those features can be useful. The methods employed here are univariate where a relationship of each variable with target was observed and it's statistical significance was calculated. This is form of supervised feature selection mechanism. 

Univariate feature selection does not consider the interaction between features for its evaluation. It observes effect of each predictor in isolation with the target. This causes mutli-collinearity to exist. That's one of the reason why we performed removed multi-collinearity in the first place so that we can have independent variables which do not have high interaction among features. We finally have our reduced features. 

Since our data is now having only 15 statistically significant features without any effects of multi-collinearity, we can now shift our attention to dealing with data imbalance and removing outliers. 

The approach we followed here was to first tackle the features or the columns in the dataset, after which we can now deal with manipulating our samples so as to have a balanced data. Next up we use automated outlier detection algorithms after which we balance our datasets



### 7. Creating Training, Validation and Test sets


In [23]:
df_reduced.loc[:, "click"] = y
df_reduced.head()

Unnamed: 0,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_conn_type,C14,C15,C16,C18,C20,C21,click
0,0.030699,-0.572991,0.587036,-0.880261,-0.600083,-0.867236,-0.348618,-0.710268,-0.387155,0.558921,0.055663,-0.21214,1.178075,0.935665,-0.313272,0
1,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,-0.387155,-0.627567,0.055663,-0.21214,-1.083952,0.935125,-0.056512,1
2,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,-0.387155,-0.626965,0.055663,-0.21214,-1.083952,-1.068726,-0.056512,1
3,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,-0.387155,-0.627567,0.055663,-0.21214,-1.083952,0.935125,-0.056512,0
4,0.030699,-0.572991,1.037178,0.286189,-1.116422,0.507053,-0.244494,0.540598,-0.387155,-0.627567,0.055663,-0.21214,-1.083952,-1.068726,-0.056512,0


In [24]:
def create_train_valid_test_split(dF, test_percent, shuffle=True):

  if shuffle:
    dF = dF.sample(frac = 1).reset_index().drop("index", axis=1)

  cols = list(dF.columns)
  y = dF["click"].to_numpy()
  cols.remove('click')
  X = dF.loc[:, cols].to_numpy()
  print("Data shape before splitting: {}".format(X.shape))
  print("Labels shape before splitting: {}".format(y.shape))

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_percent, random_state=1)
  # X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.12, random_state=1)

  print("Training data shape: {}".format(X_train.shape))
  print("Training labels shapre: {}".format(y_train.shape))
  # print("Validation data shape: {}".format(X_valid.shape))
  # print("Validation labels shape: {}".format(y_valid.shape))
  print("Test data shape: {}".format(X_test.shape))
  print("Test labels shape: {}".format(y_test.shape))
  
  return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = create_train_valid_test_split(df_reduced, 0.10)

Data shape before splitting: (200000, 15)
Labels shape before splitting: (200000,)
Training data shape: (180000, 15)
Training labels shapre: (180000,)
Test data shape: (20000, 15)
Test labels shape: (20000,)


### 8. Automated Outlier Detection and Removal

For outlier detection we will be using 2 methods. In order to evaluate the effectiveness of the outlier, we first create a baseline model using logistic regression and observe the Mean Absolute error (MAE).

In [25]:
model = LogisticRegressionCV(cv=5)
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_train)
# evaluate predictions
mae = mean_absolute_error(y_train, yhat)
print("MAE: {:.3f}".format(mae))

MAE: 0.172


#### 8.1 Using Isolation Forest 

In [39]:
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
mask = yhat != -1
X_iso, y_iso = X_train[mask, :], y_train[mask]

In [40]:
print("Training data shape before outlier detection: {},{}".format(X_train.shape, y_train.shape))
print("Training data shape after outlier detection: {},{}".format(X_iso.shape, y_iso.shape))

Training data shape before outlier detection: (180000, 15),(180000,)
Training data shape after outlier detection: (162002, 15),(162002,)


In [41]:
model = LogisticRegressionCV(cv=5)
model.fit(X_iso, y_iso)
# evaluate the model
yhat = model.predict(X_iso)
# evaluate predictions
mae_iso = mean_absolute_error(y_iso, yhat)
print("MAE before Isolation Forest: {:.3f}, MAE after Isolation Forest: {:.3f}".format(mae, mae_iso))

MAE before Isolation Forest: 0.172, MAE after Isolation Forest: 0.163


In [42]:
iso_clicks = pd.DataFrame(y_iso.tolist(), columns=["click"])
orig = pd.DataFrame(y_train.tolist(), columns=["click"])
fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Histogram(x=y_train.tolist()), row=1, col=1)
fig.add_trace(go.Histogram(x=y_iso.tolist()), row=1, col=2)
fig.update_layout(height=600, width=1000, title_text="Click Histogram before and after Isolation Forest Outlier Detection")
fig.show()

#### 8.2 Using Minimum Covariance Determinant

In [43]:
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
mask = yhat != -1
X_mcd, y_mcd = X_train[mask, :], y_train[mask]

In [44]:
print("Training data shape before outlier detection: {},{}".format(X_train.shape, y_train.shape))
print("Training data shape after outlier detection: {},{}".format(X_mcd.shape, y_mcd.shape))

Training data shape before outlier detection: (180000, 15),(180000,)
Training data shape after outlier detection: (178676, 15),(178676,)


In [45]:
model = LogisticRegressionCV(cv=5)
model.fit(X_mcd, y_mcd)
# evaluate the model
yhat = model.predict(X_mcd)
# evaluate predictions
mae_mcd = mean_absolute_error(y_mcd, yhat)
print("MAE before Isolation Forest: {:.3f}, MAE after Isolation Forest: {:.3f}".format(mae, mae_mcd))

MAE before Isolation Forest: 0.172, MAE after Isolation Forest: 0.173


In [46]:
# mcd_clicks = pd.DataFrame(y_mcd.tolist(), columns=["click"])
# orig = pd.DataFrame(y_train.tolist(), columns=["click"])
fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Histogram(x=y_train.tolist()), row=1, col=1)
fig.add_trace(go.Histogram(x=y_mcd.tolist()), row=1, col=2)
fig.update_layout(height=600, width=1000, title_text="Click Histogram before and after Minimum Covariance Determinant Outlier Detection")
fig.show()

### 9. Balancing imbalanced Data using SMOTE Analysis

We first create a baseline model for our evaluation and see the effects of SMOTE on it.

In [47]:
model = DecisionTreeClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
print("Mean ROC AUC: {:.3f}".format(np.mean(scores)))

Mean ROC AUC: 0.671


#### 9.1 SMOTE Analysis with Random Under-Sampling



In [48]:
model = DecisionTreeClassifier()
over = SMOTE(random_state=2, sampling_strategy=0.4, k_neighbors=1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
Xn, yn = pipeline.fit_resample(X_iso, y_iso.ravel())
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, Xn, yn, scoring='roc_auc', cv=cv, n_jobs=-1)
score = np.mean(scores)
print("k={}, Mean ROC AUC: {:.3f}".format(3, score))

k=3, Mean ROC AUC: 0.739


In [49]:
Xn.shape

(162666, 15)

In [50]:
fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Histogram(x=y_train.tolist()), row=1, col=1)
fig.add_trace(go.Histogram(x=yn.tolist()), row=1, col=2)
fig.update_layout(height=600, width=1000, title_text="Click Histogram before and after Minimum Covariance Determinant Outlier Detection")
fig.show()

### 10. Saving Processed Data

We are saving our training and testing splits with labels so that they can be further used for predictions

In [51]:
np.save("train.npy", Xn)
np.save("train_labels.npy", yn)
np.save("test.npy", X_test)
np.save("test_labels.npy", y_test)
!cp train.npy train_labels.npy test.npy test_labels.npy "data/avazu_ctr/"