In [1]:
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [2]:
df = pd.read_csv('online_shoppers_intention.csv')

In [3]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
df['Revenue'].value_counts()

False    10422
True      1908
Name: Revenue, dtype: int64

Things to look at:
* SMOTE
* Bagging, boosting and stacking and Voting
* Logistic regression
* Decision Tree
* Random Forest
* SVM
* PCA
* Clustering


In [5]:
df['VisitorType'].value_counts()

Returning_Visitor    10551
New_Visitor           1694
Other                   85
Name: VisitorType, dtype: int64

In [6]:
# removing the 'Other' category from 'VisitorType'
df = df.set_index('VisitorType')
df = df.drop(['Other'], axis=0)
df = df.reset_index()
print(df['VisitorType'].value_counts())

df_vt = pd.get_dummies(df['VisitorType'])
df = df.merge(df_vt, left_index = True, right_index=True)

df_w = pd.get_dummies(df['Weekend'])
df = df.merge(df_w, left_index = True, right_index=True)

df_r = pd.get_dummies(df['Revenue'])
df = df.merge(df_r, left_index = True, right_index=True)

df = df.drop(['VisitorType', 'Returning_Visitor', 'Weekend', 'Revenue', 'False_x', 'False_y'], axis=1)
df = df.rename(columns={"True_x": "Weekend", "True_y": "Revenue"})

month = {'Feb':2, 'Mar':3, 'May':5, 'Oct':10, 
         'June':6, 'Jul':7, 'Aug':8, 'Nov':11, 
         'Sep':9, 'Dec':12}
df.Month = [month[item] for item in df.Month]

Returning_Visitor    10551
New_Visitor           1694
Name: VisitorType, dtype: int64


In [7]:
df

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,New_Visitor,Weekend,Revenue
0,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,2,1,1,1,1,0,0,0
1,0,0.0,0,0.0,2,64.000000,0.000000,0.100000,0.000000,0.0,2,2,2,1,2,0,0,0
2,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,2,4,1,9,3,0,0,0
3,0,0.0,0,0.0,2,2.666667,0.050000,0.140000,0.000000,0.0,2,3,2,2,4,0,0,0
4,0,0.0,0,0.0,10,627.500000,0.020000,0.050000,0.000000,0.0,2,3,3,1,4,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12240,3,145.0,0,0.0,53,1783.791667,0.007143,0.029031,12.241717,0.0,12,4,6,1,1,0,1,0
12241,0,0.0,0,0.0,5,465.750000,0.000000,0.021333,0.000000,0.0,11,3,2,1,8,0,1,0
12242,0,0.0,0,0.0,6,184.250000,0.083333,0.086667,0.000000,0.0,11,3,2,1,13,0,1,0
12243,4,75.0,0,0.0,15,346.000000,0.000000,0.021053,0.000000,0.0,11,2,2,3,11,0,0,0


In [8]:
y = df['Revenue']
X = df[['Administrative','Administrative_Duration',
        'Informational', 'Informational_Duration', 
        'ProductRelated', 'ProductRelated_Duration', 
        'BounceRates', 'ExitRates','PageValues',
        'SpecialDay','Month','OperatingSystems','Browser',
        'Region', 'TrafficType', 'Weekend']]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [9]:
# SMOTE - to correct for imbalanced class
print('Original class distribution: \n')
print(y.value_counts())
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_sample(X_train, y_train) 
# Preview synthetic sample class distribution
print('-----------------------------------------')
print('Synthetic sample class distribution: \n')
print(pd.Series(y_train_resampled).value_counts()) 

Original class distribution: 

0    10353
1     1892
Name: Revenue, dtype: int64
-----------------------------------------
Synthetic sample class distribution: 

1    7768
0    7768
Name: Revenue, dtype: int64


In [10]:
X_train_resampled

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,Weekend
0,0,0.000000,0,0.0,18,545.500000,0.000000,0.011111,0.000000,0.0,11,2,2,1,3,0
1,0,0.000000,0,0.0,6,86.533333,0.000000,0.066667,0.000000,0.0,10,1,1,7,2,1
2,0,0.000000,0,0.0,5,125.000000,0.086667,0.166667,0.000000,0.0,2,2,2,1,3,0
3,0,0.000000,0,0.0,5,1620.000000,0.140000,0.166667,0.000000,0.0,3,2,2,1,1,0
4,1,18.000000,1,6.0,122,7141.214642,0.012927,0.019625,0.000000,0.0,11,3,2,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15531,6,106.646707,0,0.0,4,69.690347,0.000000,0.022222,0.000000,0.0,9,1,3,5,2,0
15532,2,32.833464,0,0.0,58,1190.959902,0.000000,0.020296,0.000000,0.0,9,1,1,1,1,0
15533,3,68.503549,0,0.0,26,1357.770156,0.006525,0.020827,50.441641,0.0,5,2,2,1,2,1
15534,0,0.000000,0,0.0,50,2120.004646,0.000000,0.006423,20.644116,0.0,4,2,2,1,10,0
