In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
dataset = pd.read_csv('online_shoppers_intention.csv')

In [3]:
dataset.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
dataset.shape

(12330, 18)

# Data Analysis

## Missing Values

In [6]:
any(dataset.isnull().sum())

False

Great !!! No missing values


## Datatypes
Lets see datatypes of our data.

Here target value is Revenue

In [9]:
dataset.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
Revenue                       bool
dtype: object

## Check class imbalance


Class is slightly imbalance. Majority class is around 85%. And minority class is around 15%.

In [14]:
print(dataset.groupby('Revenue').size())

Revenue
False    10422
True      1908
dtype: int64


In [18]:
print(dataset.groupby('Revenue').size()*100/len(dataset))

Revenue
False    84.525547
True     15.474453
dtype: float64


## Check Correlation

* 0 indicates no linear relationship.
* +1 indicates a perfect positive linear relationship: as one variable increases in its values, the other variable also increases in its values via an exact linear rule.
* -1 indicates a perfect negative linear relationship: as one variable increases in its values, the other variable decreases in its values via an exact linear rule.
* Values between 0 and 0.3 (0 and -0.3) indicate a weak positive (negative) linear relationship via a shaky linear rule.
* Values between 0.3 and 0.7 (-0.3 and -0.7) indicate a moderate positive (negative) linear relationship via a fuzzy-firm linear rule.
* Values between 0.7 and 1.0 (-0.7 and -1.0) indicate a strong positive (negative) linear relationship via a firm linear rule.

In [85]:
corrDF=dataset.corr()
colmns=dataset.columns

In [86]:
newCorrDF=[]

for i in range(len(colmns)):
    for j in range(i+1,len(colmns)):
        try:
            rowName=colmns[i]
            colName=colmns[j]
            newCorrDF.append((rowName,colName,corrDF.loc[rowName,colName]))
        except:
            ;
            #print('Keyerror, found non numeric object')

In [87]:
newCorrDF=pd.DataFrame(newCorrDF)
newCorrDF.columns=['ColA','ColB','Correlation']

In [89]:
newCorrDF[newCorrDF.Correlation>0.7]

Unnamed: 0,ColA,ColB,Correlation
54,ProductRelated,ProductRelated_Duration,0.860927
75,BounceRates,ExitRates,0.913004


In [90]:
dataset[['ProductRelated','ProductRelated_Duration']]


Unnamed: 0,ProductRelated,ProductRelated_Duration
0,1,0.000000
1,2,64.000000
2,1,0.000000
3,2,2.666667
4,10,627.500000
5,19,154.216667
6,1,0.000000
7,0,0.000000
8,2,37.000000
9,3,738.000000


In [91]:

dataset[['BounceRates','ExitRates']]

Unnamed: 0,BounceRates,ExitRates
0,0.200000,0.200000
1,0.000000,0.100000
2,0.200000,0.200000
3,0.050000,0.140000
4,0.020000,0.050000
5,0.015789,0.024561
6,0.200000,0.200000
7,0.200000,0.200000
8,0.000000,0.100000
9,0.000000,0.022222


In [101]:
dataset=pd.get_dummies(dataset)

# Model Building

## Decision Tree

In [169]:
import sklearn
from sklearn import model_selection # for splitting into train and test
from sklearn.metrics import roc_curve, auc

from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn import tree # for decision tree

In [238]:
# Split-out validation dataset
X = dataset.drop(columns=['Revenue']).values
Y = dataset.Revenue.values
X_Columns=dataset.drop(columns=['Revenue']).columns
validation_size = 0.20
seed = 100
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [175]:
model_CART=sklearn.tree.DecisionTreeClassifier()

In [176]:
def DecisionTreeFit_Predict():
    model_CART.fit(X_train,Y_train)
    trainResult_CART=model_CART.predict(X_train) # Train Data Predict
    testResult_CART=model_CART.predict(X_test) # Test Data Predict
DecisionTreeFit_Predict()

In [177]:
def DecisionTreeCalculateResult():
    # Train Result

    print('Accuracy',sklearn.metrics.accuracy_score(Y_train, trainResult_CART))
    print('Precision',precision_score(Y_train,trainResult_CART))
    print('Recall',recall_score(Y_train,trainResult_CART))
    print('\n\nConfusion Matrix \n',sklearn.metrics.confusion_matrix(Y_train, trainResult_CART))
    print('\n\nClassification Report\n',sklearn.metrics.classification_report(Y_train, trainResult_CART))
    print('--------------------------------------------------------------')

    # Test Result
    print('Accuracy',sklearn.metrics.accuracy_score(Y_test, testResult_CART))
    print('Precision',precision_score(Y_test,testResult_CART))
    print('Recall',recall_score(Y_test,testResult_CART))
    print('\n\nConfusion Matrix \n',sklearn.metrics.confusion_matrix(Y_test, testResult_CART))
    print('\n\nClassification Report\n',sklearn.metrics.classification_report(Y_test, testResult_CART))

DecisionTreeCalculateResult()


Accuracy 0.9124087591240876
Precision 0.7914856646394439
Recall 0.593485342019544


Confusion Matrix 
 [[8089  240]
 [ 624  911]]


Classification Report
              precision    recall  f1-score   support

      False       0.93      0.97      0.95      8329
       True       0.79      0.59      0.68      1535

avg / total       0.91      0.91      0.91      9864

--------------------------------------------------------------
Accuracy 0.8969991889699919
Precision 0.7195571955719557
Recall 0.5227882037533512


Confusion Matrix 
 [[2017   76]
 [ 178  195]]


Classification Report
              precision    recall  f1-score   support

      False       0.92      0.96      0.94      2093
       True       0.72      0.52      0.61       373

avg / total       0.89      0.90      0.89      2466



Since Traing dataset has high Precision, Recall. And Test dataset has low precision (0.52) and low recall(0.56).

<u><b>The decision tree is Overfitting.</b></u>

Lets see what are the default paramters of Decision tree.

In [178]:
model_CART

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [179]:
model_CART=sklearn.tree.DecisionTreeClassifier(
    min_samples_split=100,
    min_samples_leaf=100,
    max_depth=10,
    min_impurity_decrease=10)

In [180]:
DecisionTreeFit_Predict()
DecisionTreeCalculateResult()

Accuracy 0.9124087591240876
Precision 0.7914856646394439
Recall 0.593485342019544


Confusion Matrix 
 [[8089  240]
 [ 624  911]]


Classification Report
              precision    recall  f1-score   support

      False       0.93      0.97      0.95      8329
       True       0.79      0.59      0.68      1535

avg / total       0.91      0.91      0.91      9864

--------------------------------------------------------------
Accuracy 0.8969991889699919
Precision 0.7195571955719557
Recall 0.5227882037533512


Confusion Matrix 
 [[2017   76]
 [ 178  195]]


Classification Report
              precision    recall  f1-score   support

      False       0.92      0.96      0.94      2093
       True       0.72      0.52      0.61       373

avg / total       0.89      0.90      0.89      2466



In [310]:
model_CART.tree_.n_classes

array([2])

## Random Forest

Since decision tree is overfitting, Lets try Random forest

In [181]:
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [184]:
model_RF = RandomForestClassifier(random_state=123,n_estimators=500)


In [204]:
def RandomForestFit_Predict():
    
    model_RF.fit(X_train,Y_train)
    global trainResult_RF
    global testResult_RF
    trainResult_RF = model_RF.predict(X_train) # Train Data Predict

    testResult_RF = model_RF.predict(X_test) # Test Data Predict

RandomForestFit_Predict()

In [205]:
def RandomForestCalculateResult():
    # Train Result

    print('Accuracy',sklearn.metrics.accuracy_score(Y_train, trainResult_RF))
    print('Precision',precision_score(Y_train,trainResult_RF))
    print('Recall',recall_score(Y_train,trainResult_RF))
    print('\n\nConfusion Matrix \n',sklearn.metrics.confusion_matrix(Y_train, trainResult_RF))
    print('\n\nClassification Report\n',sklearn.metrics.classification_report(Y_train, trainResult_RF))
    print('--------------------------------------------------------------')

    # Test Result
    print('Accuracy',sklearn.metrics.accuracy_score(Y_test, testResult_RF))
    print('Precision',precision_score(Y_test,testResult_RF))
    print('Recall',recall_score(Y_test,testResult_RF))
    print('\n\nConfusion Matrix \n',sklearn.metrics.confusion_matrix(Y_test, testResult_RF))
    print('\n\nClassification Report\n',sklearn.metrics.classification_report(Y_test, testResult_RF))

RandomForestCalculateResult()


Accuracy 1.0
Precision 1.0
Recall 1.0


Confusion Matrix 
 [[8329    0]
 [   0 1535]]


Classification Report
              precision    recall  f1-score   support

      False       1.00      1.00      1.00      8329
       True       1.00      1.00      1.00      1535

avg / total       1.00      1.00      1.00      9864

--------------------------------------------------------------
Accuracy 0.897404703974047
Precision 0.7142857142857143
Recall 0.5361930294906166


Confusion Matrix 
 [[2013   80]
 [ 173  200]]


Classification Report
              precision    recall  f1-score   support

      False       0.92      0.96      0.94      2093
       True       0.71      0.54      0.61       373

avg / total       0.89      0.90      0.89      2466



Try something by changing parameters

In [219]:
model_RF = RandomForestClassifier(n_estimators=1000,
                                  max_features=10,
                                  max_depth=10)
model_RF

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [220]:
RandomForestFit_Predict()
RandomForestCalculateResult()

Accuracy 0.9581305758313058
Precision 0.946656050955414
Recall 0.7745928338762215


Confusion Matrix 
 [[8262   67]
 [ 346 1189]]


Classification Report
              precision    recall  f1-score   support

      False       0.96      0.99      0.98      8329
       True       0.95      0.77      0.85      1535

avg / total       0.96      0.96      0.96      9864

--------------------------------------------------------------
Accuracy 0.8965936739659367
Precision 0.7048611111111112
Recall 0.5442359249329759


Confusion Matrix 
 [[2008   85]
 [ 170  203]]


Classification Report
              precision    recall  f1-score   support

      False       0.92      0.96      0.94      2093
       True       0.70      0.54      0.61       373

avg / total       0.89      0.90      0.89      2466



Decision tree and Random Forest both tends to overfit

## Logistic Regression

In [222]:
from sklearn import linear_model # for logistic model
model_LR=sklearn.linear_model.LogisticRegression()

In [223]:
model_LR=sklearn.linear_model.LogisticRegression()

In [224]:
def LogisticRegressionFit_Predict():
    
    model_LR.fit(X_train,Y_train)
    global trainResult_LR
    global testResult_LR
    trainResult_LR = model_LR.predict(X_train) # Train Data Predict
    testResult_LR = model_LR.predict(X_test) # Test Data Predict

LogisticRegressionFit_Predict()

In [225]:
def LogisticRegressionCalculateResult():
    # Train Result

    print('Accuracy',sklearn.metrics.accuracy_score(Y_train, trainResult_LR))
    print('Precision',precision_score(Y_train,trainResult_LR))
    print('Recall',recall_score(Y_train,trainResult_LR))
    print('\n\nConfusion Matrix \n',sklearn.metrics.confusion_matrix(Y_train, trainResult_LR))
    print('\n\nClassification Report\n',sklearn.metrics.classification_report(Y_train, trainResult_LR))
    print('--------------------------------------------------------------')

    # Test Result
    print('Accuracy',sklearn.metrics.accuracy_score(Y_test, testResult_LR))
    print('Precision',precision_score(Y_test,testResult_LR))
    print('Recall',recall_score(Y_test,testResult_LR))
    print('\n\nConfusion Matrix \n',sklearn.metrics.confusion_matrix(Y_test, testResult_LR))
    print('\n\nClassification Report\n',sklearn.metrics.classification_report(Y_test, testResult_LR))

LogisticRegressionCalculateResult()


Accuracy 0.8854420113544201
Precision 0.7560050568900126
Recall 0.38957654723127033


Confusion Matrix 
 [[8136  193]
 [ 937  598]]


Classification Report
              precision    recall  f1-score   support

      False       0.90      0.98      0.94      8329
       True       0.76      0.39      0.51      1535

avg / total       0.87      0.89      0.87      9864

--------------------------------------------------------------
Accuracy 0.8775344687753447
Precision 0.7005649717514124
Recall 0.3324396782841823


Confusion Matrix 
 [[2040   53]
 [ 249  124]]


Classification Report
              precision    recall  f1-score   support

      False       0.89      0.97      0.93      2093
       True       0.70      0.33      0.45       373

avg / total       0.86      0.88      0.86      2466



In [235]:
model_LR=sklearn.linear_model.LogisticRegression(class_weight={0:0.15,1:0.85},
                                                 penalty='l1')
model_LR

LogisticRegression(C=1.0, class_weight={0: 0.15, 1: 0.85}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [236]:
LogisticRegressionFit_Predict()
LogisticRegressionCalculateResult()

Accuracy 0.8610097323600974
Precision 0.536837376460018
Recall 0.7785016286644951


Confusion Matrix 
 [[7298 1031]
 [ 340 1195]]


Classification Report
              precision    recall  f1-score   support

      False       0.96      0.88      0.91      8329
       True       0.54      0.78      0.64      1535

avg / total       0.89      0.86      0.87      9864

--------------------------------------------------------------
Accuracy 0.8503649635036497
Precision 0.5037735849056604
Recall 0.7158176943699732


Confusion Matrix 
 [[1830  263]
 [ 106  267]]


Classification Report
              precision    recall  f1-score   support

      False       0.95      0.87      0.91      2093
       True       0.50      0.72      0.59       373

avg / total       0.88      0.85      0.86      2466



In [311]:
model_LR.coef_

array([[ 3.63683245e-02, -2.14120479e-04,  7.88250241e-02,
         4.94241603e-05,  3.81544483e-03,  4.17281379e-05,
         0.00000000e+00, -7.98036500e+00,  1.28369079e-01,
        -1.45253270e-01, -1.36308604e-01,  3.00480549e-02,
        -2.17252642e-02,  7.87455810e-04,  9.21215307e-02,
         0.00000000e+00, -4.13325419e-01, -9.89843677e-01,
         1.94449873e-01, -3.11093572e-02, -4.44523578e-01,
        -6.00711361e-01,  7.92393686e-01,  6.21925801e-02,
         3.97300521e-01,  0.00000000e+00, -1.28995830e-01,
        -4.39408918e-01]])

In [301]:
model_LR.intercept_

array([-0.48119388])