In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import OrdinalEncoder,MinMaxScaler,OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV


from sklearn.feature_selection import SelectKBest,chi2

from imblearn.over_sampling import SMOTE

from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbpipeline


from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


from sklearn.metrics import accuracy_score,roc_auc_score,confusion_matrix,classification_report,f1_score


from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import RidgeClassifier,SGDClassifier
from xgboost import XGBClassifier

from sklearn.neural_network import MLPClassifier

In [2]:
df=pd.read_csv("online_shoppers_intention.csv")

In [3]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [5]:
df["Weekend"]=df["Weekend"].replace([True,False],[1,0])
df["Revenue"]=np.where(df["Revenue"]==True,1,0)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [7]:
df["VisitorType"].unique()

array(['Returning_Visitor', 'New_Visitor', 'Other'], dtype=object)

In [8]:
df["Returning_Visitor"]=np.where(df["VisitorType"]=="Returning_Visitor",1,0)

In [9]:
df.drop(columns=["VisitorType"],inplace=True)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [11]:
df["Month"].unique()

array(['Feb', 'Mar', 'May', 'Oct', 'June', 'Jul', 'Aug', 'Nov', 'Sep',
       'Dec'], dtype=object)

In [12]:
fe=OrdinalEncoder()
df["Month"]=fe.fit_transform(df[["Month"]])

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  float64
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [14]:
corr=df.corr()

In [15]:
corr

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,Weekend,Revenue,Returning_Visitor
Administrative,1.0,0.601583,0.37685,0.255848,0.431119,0.373939,-0.223563,-0.316483,0.09899,-0.094778,0.04856,-0.006347,-0.025035,-0.005487,-0.033561,0.026417,0.138917,-0.022884
Administrative_Duration,0.601583,1.0,0.30271,0.238031,0.289087,0.355422,-0.14417,-0.205798,0.067608,-0.073304,0.029061,-0.007343,-0.015392,-0.005561,-0.014376,0.01499,0.093587,-0.022525
Informational,0.37685,0.30271,1.0,0.618955,0.374164,0.387505,-0.116114,-0.163666,0.048632,-0.048219,0.019743,-0.009527,-0.038235,-0.029169,-0.034491,0.035785,0.0952,0.057399
Informational_Duration,0.255848,0.238031,0.618955,1.0,0.280046,0.347364,-0.074067,-0.105276,0.030861,-0.030577,0.005987,-0.009579,-0.019285,-0.027144,-0.024675,0.024078,0.070345,0.045501
ProductRelated,0.431119,0.289087,0.374164,0.280046,1.0,0.860927,-0.204578,-0.292526,0.056282,-0.023958,0.070299,0.00429,-0.013146,-0.038122,-0.043064,0.016092,0.158538,0.128738
ProductRelated_Duration,0.373939,0.355422,0.387505,0.347364,0.860927,1.0,-0.184541,-0.251984,0.052823,-0.03638,0.061186,0.002976,-0.00738,-0.033091,-0.036377,0.007311,0.152373,0.120489
BounceRates,-0.223563,-0.14417,-0.116114,-0.074067,-0.204578,-0.184541,1.0,0.913004,-0.119386,0.072702,-0.023763,0.023823,-0.015772,-0.006485,0.078286,-0.046514,-0.150673,0.129908
ExitRates,-0.316483,-0.205798,-0.163666,-0.105276,-0.292526,-0.251984,0.913004,1.0,-0.174498,0.102242,-0.039049,0.014567,-0.004442,-0.008907,0.078616,-0.062587,-0.207071,0.171987
PageValues,0.09899,0.067608,0.048632,0.030861,0.056282,0.052823,-0.119386,-0.174498,1.0,-0.063541,0.02178,0.018508,0.045592,0.011315,0.012532,0.012002,0.492569,-0.115825
SpecialDay,-0.094778,-0.073304,-0.048219,-0.030577,-0.023958,-0.03638,0.072702,0.102242,-0.063541,1.0,0.079341,0.012652,0.003499,-0.016098,0.052301,-0.016767,-0.082305,0.087123


In [16]:
corr["Revenue"]

Administrative             0.138917
Administrative_Duration    0.093587
Informational              0.095200
Informational_Duration     0.070345
ProductRelated             0.158538
ProductRelated_Duration    0.152373
BounceRates               -0.150673
ExitRates                 -0.207071
PageValues                 0.492569
SpecialDay                -0.082305
Month                      0.080150
OperatingSystems          -0.014668
Browser                    0.023984
Region                    -0.011595
TrafficType               -0.005113
Weekend                    0.029295
Revenue                    1.000000
Returning_Visitor         -0.103843
Name: Revenue, dtype: float64

In [17]:
corr["Revenue"].sort_values(ascending=0)

Revenue                    1.000000
PageValues                 0.492569
ProductRelated             0.158538
ProductRelated_Duration    0.152373
Administrative             0.138917
Informational              0.095200
Administrative_Duration    0.093587
Month                      0.080150
Informational_Duration     0.070345
Weekend                    0.029295
Browser                    0.023984
TrafficType               -0.005113
Region                    -0.011595
OperatingSystems          -0.014668
SpecialDay                -0.082305
Returning_Visitor         -0.103843
BounceRates               -0.150673
ExitRates                 -0.207071
Name: Revenue, dtype: float64

In [18]:
X=df.drop(columns=["Revenue"])
y=df["Revenue"]

In [19]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)

In [20]:
print("Step 9: model_pipeline fcuntion created done successfully")

def model_pipeline(X, model):
    n_c = X.select_dtypes(exclude=['object']).columns.tolist()
    c_c = X.select_dtypes(include=['object']).columns.tolist()

    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant')),
        ('scaler', MinMaxScaler())
    ])

    categorical_pipeline = Pipeline([
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer([
        ('numeric', numeric_pipeline, n_c),
        ('categorical', categorical_pipeline, c_c)
    ], remainder='passthrough')

    final_steps = [
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=1)),
        ('feature_selection', SelectKBest(score_func = chi2, k = 6)),
        ('model', model)
    ]

    return imbpipeline(steps = final_steps)  # Ensure to use IMBPipeline if using imblearn's Pipeline






##################
# Model Selection#
##################


print("Step 10: select_model fcuntion created done successfully")


def select_model(X, y, pipeline=None):

    classifiers = {}
    

    c_d1 = {"RandomForestClassifier": RandomForestClassifier()}
    classifiers.update(c_d1)

    c_d2 = {"DecisionTreeClassifier": DecisionTreeClassifier()}
    classifiers.update(c_d2)

    c_d3 = {"KNeighborsClassifier": KNeighborsClassifier()}
    classifiers.update(c_d3)

    c_d4 = {"RidgeClassifier": RidgeClassifier()}
    classifiers.update(c_d4)

    c_d5 = {"BernoulliNB": BernoulliNB()}
    classifiers.update(c_d5)

    c_d6 = {"SVC": SVC()}
    classifiers.update(c_d6)
    
    
   
    cols = ['model', 'run_time', 'roc_auc']
    df_models = pd.DataFrame(columns = cols)

    for key in classifiers:
        
        start_time = time.time()
        
        print()
        print("Step 12: model_pipeline run successfully on", key)

        pipeline = model_pipeline(X, classifiers[key])
        
        cv = cross_val_score(pipeline, X, y, cv=10, scoring='roc_auc')

        row = {'model': key,
               'run_time': format(round((time.time() - start_time)/60,2)),
               'roc_auc': cv.mean(),
        }

        df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)
        
    df_models = df_models.sort_values(by='roc_auc', ascending=False)
	
    return df_models
    



Step 9: model_pipeline fcuntion created done successfully
Step 10: select_model fcuntion created done successfully


In [21]:
model=select_model(X_train,y_train)


Step 12: model_pipeline run successfully on RandomForestClassifier

Step 12: model_pipeline run successfully on DecisionTreeClassifier

Step 12: model_pipeline run successfully on KNeighborsClassifier

Step 12: model_pipeline run successfully on RidgeClassifier

Step 12: model_pipeline run successfully on BernoulliNB

Step 12: model_pipeline run successfully on SVC


In [22]:
print(model)

                    model run_time   roc_auc
5                     SVC     2.32  0.886132
0  RandomForestClassifier     0.97  0.885957
4             BernoulliNB     0.02  0.864141
3         RidgeClassifier     0.02  0.857193
2    KNeighborsClassifier     0.03  0.834569
1  DecisionTreeClassifier     0.05  0.730749


In [23]:
selected_model=SVC()
model1=model_pipeline(X_train,selected_model)
model1.fit(X_train,y_train)

In [24]:
y_pred=model1.predict(X_test)

In [25]:
roc_score=roc_auc_score(y_test,y_pred)
print("roc_score is:",roc_score)
print()
accuracy=accuracy_score(y_pred,y_test)
print("roc_score is:",roc_score)
print()
f1_score=f1_score(y_pred,y_test)
print("f1 score is:",f1_score)

roc_score is: 0.7863494059284846

roc_score is: 0.7863494059284846

f1 score is: 0.6438746438746439


In [26]:
df2=pd.DataFrame({"actual":y_test,"predicted":y_pred})

df2.to_csv("online_shoppers_intension_output.csv")

In [27]:
df2.head()

Unnamed: 0,actual,predicted
12245,0,0
9704,0,0
9177,0,0
8848,0,0
2768,0,0
