In [21]:
#importing modules
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import auc, classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [12]:
# Load the under-sampled dataset (presumably written by a separate cleaning step — confirm).
# The path is relative, so the notebook has to be run from the directory that contains Resources/.
ecommerce_df = pd.read_csv(Path('Resources/Cleaned_data/under_sampling_data.csv'))

In [13]:
# Preview the first rows to check the column names and the kind of values they hold
ecommerce_df.head()

Unnamed: 0,Administrative_Avg,Informational_Avg,ProductRelated_Avg,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,44.541667,40.666667,53.708333,0.0,0.018182,0.0,0.0,Dec,2,2,2,1,Returning_Visitor,False,False
1,0.0,0.0,53.166667,0.0,0.05,0.0,0.0,Mar,1,1,6,1,Returning_Visitor,True,False
2,0.0,0.0,220.678571,0.014286,0.033333,0.0,0.0,Dec,1,1,3,1,Returning_Visitor,False,False
3,5.666667,0.0,24.086352,0.009524,0.040317,0.0,0.0,Dec,2,2,1,2,Returning_Visitor,False,False
4,22.26,0.0,30.232723,0.0,0.008337,17.634346,0.0,Nov,2,2,9,2,Returning_Visitor,False,False


In [14]:
# Class-balance check: number of rows for each value of the target column 'Revenue'
ecommerce_df["Revenue"].value_counts()

False    1908
True     1908
Name: Revenue, dtype: int64

In [15]:
# (number of rows, number of columns) of the loaded dataframe
ecommerce_df.shape

(3816, 15)

In [16]:
# Response vector (y): the 'Revenue' column as a NumPy array
y = ecommerce_df['Revenue'].to_numpy()

# Feature matrix (X): every column except 'Revenue'
X = ecommerce_df.drop(columns='Revenue')

In [17]:
# Columns to treat as categorical when one-hot encoding.
# Several of them hold integer codes, so they are listed explicitly
# instead of being picked by dtype.
cat_columns = [
    'Month',
    'OperatingSystems',
    'Browser',
    'Region',
    'TrafficType',
    'VisitorType',
    'Weekend',
]

In [18]:
# One-hot encode only the columns listed in cat_columns; every other column is passed through as is
X_dummies = pd.get_dummies(X, columns = cat_columns)
# Print the full list of resulting column names (the table preview below elides the middle columns)
print(X_dummies.columns)
X_dummies.head()

Index(['Administrative_Avg', 'Informational_Avg', 'ProductRelated_Avg',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month_Aug',
       'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June', 'Month_Mar',
       'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
       'OperatingSystems_1', 'OperatingSystems_2', 'OperatingSystems_3',
       'OperatingSystems_Other', 'Browser_1', 'Browser_10', 'Browser_2',
       'Browser_4', 'Browser_5', 'Browser_6', 'Browser_Other', 'Region_1',
       'Region_2', 'Region_3', 'Region_4', 'Region_5', 'Region_6', 'Region_7',
       'Region_8', 'Region_9', 'TrafficType_1', 'TrafficType_10',
       'TrafficType_11', 'TrafficType_13', 'TrafficType_2', 'TrafficType_3',
       'TrafficType_4', 'TrafficType_5', 'TrafficType_6', 'TrafficType_8',
       'TrafficType_Other', 'VisitorType_New_Visitor', 'VisitorType_Other',
       'VisitorType_Returning_Visitor', 'Weekend_False', 'Weekend_True'],
      dtype='object')


Unnamed: 0,Administrative_Avg,Informational_Avg,ProductRelated_Avg,BounceRates,ExitRates,PageValues,SpecialDay,Month_Aug,Month_Dec,Month_Feb,...,TrafficType_4,TrafficType_5,TrafficType_6,TrafficType_8,TrafficType_Other,VisitorType_New_Visitor,VisitorType_Other,VisitorType_Returning_Visitor,Weekend_False,Weekend_True
0,44.541667,40.666667,53.708333,0.0,0.018182,0.0,0.0,0,1,0,...,0,0,0,0,0,0,0,1,1,0
1,0.0,0.0,53.166667,0.0,0.05,0.0,0.0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,0.0,0.0,220.678571,0.014286,0.033333,0.0,0.0,0,1,0,...,0,0,0,0,0,0,0,1,1,0
3,5.666667,0.0,24.086352,0.009524,0.040317,0.0,0.0,0,1,0,...,0,0,0,0,0,0,0,1,1,0
4,22.26,0.0,30.232723,0.0,0.008337,17.634346,0.0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [19]:
# Sanity check: number of feature columns before vs. after one-hot encoding
print(f'Total column before one hot encoding: {X.shape[1]}, Total columns after one hot encoding: {X_dummies.shape[1]}')

Total column before one hot encoding: 14, Total columns after one hot encoding: 53


In [22]:
# Encode the target labels as integers 0 and 1 (the displayed result shows only these two values)
y_label = LabelEncoder().fit_transform(y)
y_label

array([0, 0, 0, ..., 1, 1, 1])

In [23]:
# Training set: used to train the classifier.
# Testing set: used to estimate the error rate of the trained classifier.
# random_state is fixed so the split is reproducible; the split size is left at train_test_split's default.
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y_label, random_state=42)

In [24]:
# Standardise the features. The scaler learns its statistics from the
# training split only, so nothing from the test split influences preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[-0.25984905, -0.26915436,  1.38554087, ...,  0.48224282,
         0.55717103, -0.55717103],
       [-0.46387811,  0.31627294, -0.34015459, ...,  0.48224282,
         0.55717103, -0.55717103],
       [-0.46387811, -0.26915436, -1.05942478, ...,  0.48224282,
         0.55717103, -0.55717103],
       ...,
       [-0.46387811, -0.26915436,  0.22058076, ...,  0.48224282,
         0.55717103, -0.55717103],
       [-0.46387811, -0.26915436,  0.07184404, ..., -2.07364414,
         0.55717103, -0.55717103],
       [-0.02820316,  0.58319748, -0.37365499, ...,  0.48224282,
         0.55717103, -0.55717103]])

In [25]:
# Scale the test split with the scaler already fitted on the training split
# (transform only — the scaler is not re-fitted here)
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[ 0.21621873, -0.26915436, -0.57945596, ..., -2.07364414,
         0.55717103, -0.55717103],
       [-0.13777792, -0.26915436, -0.40758609, ...,  0.48224282,
         0.55717103, -0.55717103],
       [ 0.42957387,  0.35502945, -0.47588607, ...,  0.48224282,
         0.55717103, -0.55717103],
       ...,
       [ 0.45273756, -0.26915436,  0.74889613, ...,  0.48224282,
        -1.79478103,  1.79478103],
       [ 2.42106239, -0.26915436,  0.1023663 , ...,  0.48224282,
        -1.79478103,  1.79478103],
       [ 0.16747686,  0.07506465, -0.45772341, ...,  0.48224282,
         0.55717103, -0.55717103]])

In [None]:
# Helper shared by train_score and test_score: ROC plot + AUC printout
def _report_roc(model, y_true, y_score):
    """Plot the ROC curve of ``y_score`` against ``y_true`` and print its AUC.

    Parameters
    ----------
    model : estimator
        Only used for the plot title (its ``repr`` is interpolated).
    y_true : array-like
        True binary labels.
    y_score : array-like
        Score of the positive class for each sample.
    """
    fpr, tpr, _ = roc_curve(y_true, y_score)

    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, color='darkorange')
    # reference diagonal (chance level)
    ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f"roc curve for {model}")
    print("ROC Curve")
    plt.show()

    # area under the curve (AUC)
    roc_auc = auc(fpr, tpr)
    print()
    print("**************Area under the curve******************")
    print(roc_auc)


#defining function for model scoring with training set only
def train_score(model, X_train_scaled=X_train_scaled, y_train=y_train, cv=3):
    """Print training-set diagnostics for a fitted classifier.

    Prints, in order: the accuracy from ``model.score`` on the training data,
    the per-fold cross-validated accuracies, a confusion matrix and a
    classification report built from cross-validated predictions, and finally
    the ROC curve and AUC built from cross-validated ``predict_proba`` output.

    Parameters
    ----------
    model : estimator
        Must already be fitted (``model.score`` is called directly) and must
        support ``predict_proba``.
    X_train_scaled, y_train : array-like
        Training features and labels. NOTE: the defaults are bound to the
        notebook globals at the moment this cell is run; re-run this cell after
        regenerating those globals, or pass the data explicitly.
    cv : int, default 3
        Number of cross-validation folds used for every cross-validated step.

    Returns
    -------
    None
        Everything is printed / plotted.
    """
    #printing the accuracy score
    train_accuracy = model.score(X_train_scaled, y_train)
    print("******************Train Score******************")
    print(train_accuracy)
    print()

    fold_accuracies = cross_val_score(model, X_train_scaled, y_train, cv=cv,
                                      scoring='accuracy')
    print(f"************Using K={cv} cross validation**********")
    print(fold_accuracies)
    print()

    #out-of-fold predictions on the training set and confusion matrix
    y_train_pred = cross_val_predict(model, X_train_scaled, y_train, cv=cv)
    cm = confusion_matrix(y_train, y_train_pred)
    print("******Confusion Matrix using cross_val_predict****")
    print(cm)
    print()

    #classification report
    cr = classification_report(y_train, y_train_pred)
    print("***********Classification Report******************")
    print(cr)
    print()

    #roc curve from out-of-fold probabilities
    y_train_proba = cross_val_predict(model, X_train_scaled, y_train, cv=cv,
                                      method="predict_proba")
    _report_roc(model, y_train, y_train_proba[:, 1])  #proba of positive class


#defining function for model scoring with test set
def test_score(model, X_train_scaled=X_train_scaled, X_test_scaled=X_test_scaled,
               y_train=y_train, y_test=y_test):
    """Print train/test diagnostics for a fitted classifier.

    Prints, in order: the accuracy from ``model.score`` on the training data
    and on the test data, the confusion matrix and classification report for
    the test-set predictions, and the ROC curve and AUC computed from
    ``model.predict_proba`` on the test data.

    Parameters
    ----------
    model : estimator
        Must already be fitted and must support ``predict_proba``.
    X_train_scaled, y_train : array-like
        Training features and labels (used only for the train accuracy).
    X_test_scaled, y_test : array-like
        Test features and labels. NOTE: all four data defaults are bound to the
        notebook globals at the moment this cell is run; re-run this cell after
        regenerating those globals, or pass the data explicitly.

    Returns
    -------
    None
        Everything is printed / plotted.
    """
    #printing the accuracy score
    train_accuracy = model.score(X_train_scaled, y_train)
    test_accuracy = model.score(X_test_scaled, y_test)
    print("********************Train Score******************")
    print(train_accuracy)
    print()

    print("*********************Test Score******************")
    print(test_accuracy)
    print()

    #prediction on the test set and confusion matrix
    y_pred = model.predict(X_test_scaled)
    cm = confusion_matrix(y_test, y_pred)
    print("**************Confusion Matrix******************")
    print(cm)
    print()

    #classification report
    cr = classification_report(y_test, y_pred)
    print("**********Classification Report******************")
    print(cr)
    print()

    #roc curve
    y_proba = model.predict_proba(X_test_scaled)[:, 1]  #proba of positive class
    _report_roc(model, y_test, y_proba)
