In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score, recall_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import datasets, linear_model, metrics
from sklearn.naive_bayes import GaussianNB
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
shop = pd.read_csv("online_shoppers_intention.csv")

shop['Weekend'] = shop['Weekend'].map({False: 0, True: 1})
shop['Revenue'] = shop['Revenue'].map({False: 0, True: 1})
shop['Month'] = shop['Month'].map({'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12})
shop['VisitorType'] = shop['VisitorType'].map({'Returning_Visitor': 1, 'New_Visitor': 0})
shop.dropna(inplace = True)

In [3]:
Y = shop['Revenue']
shop.drop(['Revenue'], axis = 1, inplace = True)
X = shop

<h1> Decision Tree Classifier </h1>

In [4]:
def run_decision_tree(X,Y, scoring = True, heatmap = False):
    '''
    x_train, y_train: training values from the dataset
    x_test, y_test: testing values from the dataset to predict on 
    scoring: Boolean variable to indicate whether to add scoring values
    heatmap: Boolean value to indicat whether to plot heatmap
    '''
    
    x_train, x_test,y_train,y_test = train_test_split(X,Y, test_size = 0.2)
    
    clf = DecisionTreeClassifier()

    param_grid = {'criterion': ['gini', 'entropy', 'log_loss'],
                  'max_depth': [i for i in range(5)],
                  'min_samples_split': [2,5,10,20]}

    grid_search = GridSearchCV(clf, param_grid)           
    grid_search.fit(x_train, y_train)

    # finding best parameters
    new_param = grid_search.best_params_
    for item, key in new_param.items():
            new_param[item] = [key]

    # making predictions
    final_clf = GridSearchCV(clf, new_param)
    final_clf.fit(x_train, y_train)
    predictions = final_clf.predict(x_test)

    # storing model and scores
    model = "Decision Tree"
    score_recall = round(recall_score(predictions, y_test),3)
    f1_scores = round(f1_score(y_test, predictions),3)
    
    if heatmap == True:
        # heatmap plot to visualize predictions
        cm = metrics.confusion_matrix(y_test, predictions)
        plt.figure(figsize=(10,10))
        sns.heatmap(cm, annot = True, fmt = ".0f", square = True, cmap = 'icefire');
        plt.ylabel('Actual Label');
        plt.xlabel('Predicted Label');
        all_sample_title = f'{model} \n F1 {f1_scores} \n Recall: {score_recall}'
        plt.title(all_sample_title, size = 20);
        
    if scoring == True:
        return model, score_recall, f1_scores

<h1> Random Forest Classifier </h1>

In [5]:
def run_random_forest(X,Y, scoring = True, heatmap = False):
    '''
    x_train, y_train: training values from the dataset
    x_test, y_test: testing values from the dataset to predict on 
    scoring: Boolean variable to indicate whether to add scoring values
    heatmap: Boolean value to indicat whether to plot heatmap
    '''
    
    x_train, x_test,y_train,y_test = train_test_split(X,Y, test_size = 0.2)
    
    clf = RandomForestClassifier()
        
    param_grid = {'n_estimators': [10,50,100,200],
                  'max_depth': [i for i in range(5)],
                  'min_samples_split': [2,5,10,20]}

    grid_search = GridSearchCV(clf, param_grid)           
    grid_search.fit(x_train, y_train)

    # finding best parameters
    new_param = grid_search.best_params_
    for item, key in new_param.items():
            new_param[item] = [key]

    # making predictions
    final_clf = GridSearchCV(clf, new_param)
    final_clf.fit(x_train, y_train)
    predictions = final_clf.predict(x_test)

    # storing models and scores
    model = 'Random Forest'
    score_recall = round(recall_score(predictions, y_test),3)
    f1_scores = round(f1_score(y_test, predictions),3)
    
    if heatmap == True:
        # heatmap plot to visualize predictions
        cm = metrics.confusion_matrix(y_test, predictions)
        plt.figure(figsize=(10,10))
        sns.heatmap(cm, annot = True, fmt = ".0f", square = True, cmap = 'icefire');
        plt.ylabel('Actual Label');
        plt.xlabel('Predicted Label');
        all_sample_title = f'{model} \n F1 {f1_scores} \n Recall: {score_recall}'
        plt.title(all_sample_title, size = 20);
    
    if scoring == True:
        return model, score_recall, f1_scores

<h1> Logistic Regression </h1>

In [6]:
def run_logistic_regr(X,Y, scoring = True, heatmap = False):  
    '''
    x_train, y_train: training values from the dataset
    x_test, y_test: testing values from the dataset to predict on 
    scoring: Boolean variable to indicate whether to add scoring values
    heatmap: Boolean value to indicat whether to plot heatmap
    '''
    
    x_train, x_test,y_train,y_test = train_test_split(X,Y, test_size = 0.2)
    
    logisticRegr = LogisticRegression()
    logisticRegr.fit(x_train, y_train)
    predictions = logisticRegr.predict(x_test)

    cm = metrics.confusion_matrix(y_test, predictions)

    # storing models and scores
    model = 'Logistic Regression'
    score_recall = round(recall_score(predictions, y_test),3)
    f1_scores = round(f1_score(y_test, predictions),3)

    if heatmap == True:
        # heatmap plot to visualize predictions
        cm = metrics.confusion_matrix(y_test, predictions)
        plt.figure(figsize=(10,10))
        sns.heatmap(cm, annot = True, fmt = ".0f", square = True, cmap = 'icefire');
        plt.ylabel('Actual Label');
        plt.xlabel('Predicted Label');
        all_sample_title = f'{model} \n F1 {f1_scores} \n Recall: {score_recall}'
        plt.title(all_sample_title, size = 20);
    
    if scoring == True:
        return model, score_recall, f1_scores

<h1> KNeighbors Classifier </h1>

In [7]:
def run_knn(X,Y, scoring = True, heatmap = False):
    '''
    x_train, y_train: training values from the dataset
    x_test, y_test: testing values from the dataset to predict on 
    scoring: Boolean variable to indicate whether to add scoring values
    heatmap: Boolean value to indicat whether to plot heatmap
    '''
    
    x_train, x_test,y_train,y_test = train_test_split(X,Y, test_size = 0.2)
    
    knn = KNeighborsClassifier()
    knn.fit(x_train, y_train)
    predictions = knn.predict(x_test)
    cm = metrics.confusion_matrix(y_test, predictions)

    # storing models and scores
    model = 'K-Nearest Neighbors'
    score_recall = round(recall_score(predictions, y_test),3)
    f1_scores = round(f1_score(y_test, predictions),3)

    if heatmap == True:
        # heatmap plot to visualize predictions
        cm = metrics.confusion_matrix(y_test, predictions)
        plt.figure(figsize=(10,10))
        sns.heatmap(cm, annot = True, fmt = ".0f", square = True, cmap = 'icefire');
        plt.ylabel('Actual Label');
        plt.xlabel('Predicted Label');
        all_sample_title = f'{model} \n F1 {f1_scores} \n Recall: {score_recall}'
        plt.title(all_sample_title, size = 20);
    
    if scoring == True:
        return model, score_recall, f1_scores

<h1> Gaussian Naive Bayes </h1>

In [8]:
def run_gnb(X,Y, scoring = True, heatmap = False):
    '''
    x_train, y_train: training values from the dataset
    x_test, y_test: testing values from the dataset to predict on 
    scoring: Boolean variable to indicate whether to add scoring values
    heatmap: Boolean value to indicat whether to plot heatmap
    '''
    
    x_train, x_test,y_train,y_test = train_test_split(X,Y, test_size = 0.2)
    
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    predictions = gnb.predict(x_test)

    # storing models and scores
    model = 'Gaussian Naive Bayes'
    score_recall = round(recall_score(predictions, y_test),3)
    f1_scores = round(f1_score(y_test, predictions),3)

    if heatmap == True:
        # heatmap plot to visualize predictions
        cm = metrics.confusion_matrix(y_test, predictions)
        plt.figure(figsize=(10,10))
        sns.heatmap(cm, annot = True, fmt = ".0f", square = True, cmap = 'icefire');
        plt.ylabel('Actual Label');
        plt.xlabel('Predicted Label');
        all_sample_title = f'{model} \n F1 {f1_scores} \n Recall: {score_recall}'
        plt.title(all_sample_title, size = 20);
    
    if scoring == True:
        return model, score_recall, f1_scores

<h1> Stochastic Gradient Descent </h1>

In [9]:
def run_sgd(X,Y, scoring = True, heatmap = False):
    '''
    x_train, y_train: training values from the dataset
    x_test, y_test: testing values from the dataset to predict on 
    scoring: Boolean variable to indicate whether to add scoring values
    heatmap: Boolean value to indicat whether to plot heatmap
    '''
    
    x_train, x_test,y_train,y_test = train_test_split(X,Y, test_size = 0.2)
    
    clf = SGDClassifier()
    clf.fit(x_train, y_train)
    predictions = clf.predict(x_test)

    # storing models and scores
    model = 'Stochastic Gradient Descent'
    score_recall = round(recall_score(predictions, y_test),3)
    f1_scores = round(f1_score(y_test, predictions),3)

    if heatmap == True:
        # heatmap plot to visualize predictions
        cm = metrics.confusion_matrix(y_test, predictions)
        plt.figure(figsize=(10,10))
        sns.heatmap(cm, annot = True, fmt = ".0f", square = True, cmap = 'icefire');
        plt.ylabel('Actual Label');
        plt.xlabel('Predicted Label');
        all_sample_title = f'{model} \n F1 {f1_scores} \n Recall: {score_recall}'
        plt.title(all_sample_title, size = 20);
    
    if scoring == True:
        return model, score_recall, f1_scores

<h1> Support Vector Classification </h1>

In [10]:
def run_svc(X,Y, scoring = True, heatmap = False):
    '''
    x_train, y_train: training values from the dataset
    x_test, y_test: testing values from the dataset to predict on 
    scoring: Boolean variable to indicate whether to add scoring values
    heatmap: Boolean value to indicat whether to plot heatmap
    '''
    
    x_train, x_test,y_train,y_test = train_test_split(X,Y, test_size = 0.2)
    
    clf = SVC()
    clf.fit(x_train, y_train)
    predictions = clf.predict(x_test)

    # storing models and scores
    model = 'Support Vector'
    score_recall = round(recall_score(predictions, y_test),3)
    f1_scores = round(f1_score(y_test, predictions),3)

    if heatmap == True:
        # heatmap plot to visualize predictions
        cm = metrics.confusion_matrix(y_test, predictions)
        plt.figure(figsize=(10,10))
        sns.heatmap(cm, annot = True, fmt = ".0f", square = True, cmap = 'icefire');
        plt.ylabel('Actual Label');
        plt.xlabel('Predicted Label');
        all_sample_title = f'{model} \n F1 {f1_scores} \n Recall: {score_recall}'
        plt.title(all_sample_title, size = 20);
    
    if scoring == True:
        return model, score_recall, f1_scores

In [11]:
def run_all(X,Y):
    '''
    x_train, y_train: training values from the dataset
    x_test, y_test: testing values from the dataset to predict on 
    '''
    models = []
    f1s = []
    recalls = []
    
    model_function_names = [run_decision_tree, run_random_forest, run_logistic_regr, 
                           run_knn, run_gnb, run_sgd, run_svc]
    
    for name in model_function_names:
        model_name, f1, recall = name(X,Y)
        models.append(model_name)
        f1s.append(f1)
        recalls.append(recall)
    
    scores = {'Classification Model': models, 'F1 Score': f1s, 'Recall Score': recalls}
    scores_df = pd.DataFrame.from_dict(scores)
    return scores_df

In [12]:
scores_df = run_all(X,Y)
scores_df

Unnamed: 0,Classification Model,F1 Score,Recall Score
0,Decision Tree,0.723,0.584
1,Random Forest,0.854,0.514
2,Logistic Regression,0.713,0.475
3,K-Nearest Neighbors,0.585,0.407
4,Gaussian Naive Bayes,0.478,0.499
5,Stochastic Gradient Descent,0.891,0.345
6,Support Vector,1.0,0.016


In [13]:
def run_one(model, n, X,Y):

    f1s = []
    recalls = []
    for i in range(n):
        model_name, f1, recall = model(X,Y)
        f1s.append(f1)
        recalls.append(recall)
        
    scores = {'F1 Score': f1s, 'Recall Score': recalls}
    scores_df = pd.DataFrame.from_dict(scores)
    
    scores_df.style.set_table_attributes("style='display:inline'").set_caption(f'{model} Classification: {n} runs')
    return scores_df

In [14]:
df = run_one(run_decision_tree, 5, X,Y)
df

Unnamed: 0,F1 Score,Recall Score
0,0.686,0.662
1,0.66,0.651
2,0.706,0.69
3,0.674,0.675
4,0.71,0.638


<h1> Expirementing with Training Data </h1>

In [15]:
shop2 = pd.read_csv("online_shoppers_intention.csv")
shop2['Weekend'] = shop2['Weekend'].map({False: 0, True: 1})
shop2['Revenue'] = shop2['Revenue'].map({False: 0, True: 1})
shop2['Month'] = shop2['Month'].map({'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12})
shop2['VisitorType'] = shop2['VisitorType'].map({'Returning_Visitor': 1, 'New_Visitor': 0})
shop2.dropna(inplace = True)
purchases = shop2[shop2['Revenue'] == 1]
non_purchases = shop2[shop2['Revenue'] == 0].sample(n = len(purchases))
new_data = pd.concat([purchases, non_purchases])

In [16]:
new_Y = shop2['Revenue']
shop2.drop(['Revenue'], axis = 1, inplace = True)
new_X = shop2

In [17]:
new_df = run_all(new_X, new_Y)
new_df

Unnamed: 0,Classification Model,F1 Score,Recall Score
0,Decision Tree,0.684,0.683
1,Random Forest,0.743,0.621
2,Logistic Regression,0.797,0.56
3,K-Nearest Neighbors,0.547,0.371
4,Gaussian Naive Bayes,0.46,0.493
5,Stochastic Gradient Descent,0.719,0.48
6,Support Vector,1.0,0.016


In [18]:
scores_df

Unnamed: 0,Classification Model,F1 Score,Recall Score
0,Decision Tree,0.723,0.584
1,Random Forest,0.854,0.514
2,Logistic Regression,0.713,0.475
3,K-Nearest Neighbors,0.585,0.407
4,Gaussian Naive Bayes,0.478,0.499
5,Stochastic Gradient Descent,0.891,0.345
6,Support Vector,1.0,0.016
