# Online shopper


In [14]:
%matplotlib notebook

import os

import warnings
import requests

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

warnings.filterwarnings('ignore')

# Get data
Run this section only if you have not downloaded the data yet.

In [15]:
# Direct download link for the dataset CSV; the host is the UCI archive (dataset folder 00468).
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00468/online_shoppers_intention.csv"

def fetch_data(fname='online_shoppers_intention.csv', timeout=30):
    """
    Download the dataset at ``URL`` and save it to a local file.

    Parameters
    ----------
    fname : str
        Destination file name; a relative name is resolved against the
        current working directory.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    str
        Absolute path of the file that was written.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = requests.get(URL, timeout=timeout)
    # Fail loudly on an error status instead of saving an error page as if it were the CSV.
    response.raise_for_status()

    outpath = os.path.abspath(fname)
    with open(outpath, 'wb') as f:
        f.write(response.content)

    return outpath

# Fetch the data only if it is not already on disk, so re-running the notebook
# does not download the file again.
DATA = os.path.abspath('online_shoppers_intention.csv')
if not os.path.exists(DATA):
    DATA = fetch_data()

# Data exploration
If you already have the data, start here.

In [16]:
# Read the downloaded CSV into a DataFrame. The separator is passed by keyword:
# pandas 2.0+ rejects positional arguments after the file path.
online = pd.read_csv('online_shoppers_intention.csv', sep=',')


In [20]:
online.columns

Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',
       'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType',
       'Weekend', 'Revenue'],
      dtype='object')

In [21]:
online.shape

(12330, 18)

In [22]:
online.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [9]:
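# TODO: 'age' is not a column of this dataset (see online.columns above); choose an existing feature before enabling this plot.
#sns.distplot(online['age'], rug=True)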

# Explore
Start exploring the data.

In [24]:
online.groupby('Revenue')['Revenue'].count()

Revenue
False    10422
True      1908
Name: Revenue, dtype: int64

In [23]:
#check null
online.isna().sum()

Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64

# One Hot Encoding
One-hot encode the categorical `VisitorType` feature.


In [17]:
# Categorical columns to expand into indicator (dummy) columns.
one_hot_encoding_columns = ['VisitorType']

# Each listed column is replaced by one indicator column per category value.
online = pd.get_dummies(online, columns=one_hot_encoding_columns)


In [27]:
online.head(3)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,Weekend,Revenue,VisitorType_New_Visitor,VisitorType_Other,VisitorType_Returning_Visitor
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,False,False,0,0,1
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,False,False,0,0,1
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,False,False,0,0,1


# Binary features

In [18]:
# Helper that maps the strings 'FALSE' / 'TRUE' to 0 / 1
def convert_bool(row):
    """
    Replace the string 'FALSE' with 0 and the string 'TRUE' with 1.

    Only values equal to those exact upper-case strings are replaced;
    everything else is passed through unchanged.
    """
    return row.replace('FALSE', 0).replace('TRUE', 1)


In [19]:
# Boolean-like columns to turn into 0/1 integers.
colname = ['Weekend', 'Revenue']

# Run convert_bool on each of those columns, then cast the result to int.
online[colname] = online[colname].apply(convert_bool).astype(int)

In [21]:
online.groupby('Revenue')['Revenue'].count()

Revenue
0    10422
1     1908
Name: Revenue, dtype: int64

In [32]:
online.columns

Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',
       'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'Weekend',
       'Revenue', 'VisitorType_New_Visitor', 'VisitorType_Other',
       'VisitorType_Returning_Visitor'],
      dtype='object')

# Model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [23]:
# Predictors: every column of `online` except the target 'Revenue' and 'Month'.
feature_columns = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay',
    'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'Weekend',
    'VisitorType_New_Visitor', 'VisitorType_Other',
    'VisitorType_Returning_Visitor',
]
X = online[feature_columns]

# Target: the 0/1 'Revenue' column.
y = online['Revenue']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


# Logistic Regression

In [24]:
# Logistic-regression classifier using the liblinear solver and a fixed seed.
lr = LogisticRegression(solver='liblinear', random_state=1)
# fit() hands back the fitted estimator, which is kept under the name log_reg.
log_reg = lr.fit(X_train, y_train)
# Accuracy on the held-out test split (shown as the cell output).
# NOTE(review): 10422 of the 12330 rows have Revenue == 0, so always predicting 0
# would already score about 0.845 on the full data; read the accuracy against that.
log_reg.score(X_test, y_test)


0.8678021086780211