In [1]:
## Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import pickle
import seaborn as sns
import scipy.stats as stats
from scipy.stats import norm, skew
import warnings
warnings.filterwarnings('ignore')

In [2]:
## Read final.csv (looked up relative to the notebook's working directory) into df
## The preview below shows a leftover 'Unnamed: 0' column next to Session, Clicks, Difference and Buys
df = pd.read_csv('final.csv')

In [3]:
## Preview the first 10 rows of the raw data
df.head(10)

Unnamed: 0.1,Unnamed: 0,Session,Clicks,Difference,Buys
0,0,1.0,4.0,0 days 00:05:51.000000000,
1,1,2.0,6.0,0 days 00:05:59.000000000,
2,2,3.0,3.0,0 days 00:12:26.000000000,
3,3,4.0,2.0,0 days 00:17:15.000000000,
4,4,6.0,2.0,0 days 00:04:06.000000000,
5,5,7.0,2.0,0 days 00:00:12.000000000,
6,6,8.0,2.0,0 days 00:02:14.000000000,
7,7,9.0,3.0,0 days 00:02:49.000000000,
8,8,11.0,12.0,0 days 00:13:04.000000000,2.0
9,9,12.0,2.0,0 days 00:02:59.000000000,1.0


In [4]:
## Remove the leftover 'Unnamed: 0' column (in the preview it simply repeats the row index)
df = df.drop(columns='Unnamed: 0')

In [5]:
## Keep only the HH:MM:SS part of column 'Difference'
## (e.g. '0 days 00:05:51.000000000' -> '00:05:51').
## The time is located by pattern instead of by fixed character positions, so the
## result stays correct when the day count has more than one digit.
## NOTE(review): the 'N days' component is still discarded, exactly as before -
## confirm that no session lasts a full day or longer.
df['Difference'] = df['Difference'].str.extract(r'(\d{2}:\d{2}:\d{2})', expand=False)

In [6]:
## Check the column dtypes ('Difference' is still a string/object column at this point)
df.dtypes

Session       float64
Clicks        float64
Difference     object
Buys          float64
dtype: object

In [7]:
## Spot-check the 'Difference' value of the row labelled 1
df.Difference[1]

'00:05:59'

In [8]:
## Helper that converts a time string into its total number of seconds
def time_to_sec(time_str):
    """Return the number of seconds represented by an 'H:M:S' string.

    The string is split on ':' and must yield exactly three integer fields
    (hours, minutes, seconds); anything else raises ValueError.
    """
    hours, minutes, seconds = (int(field) for field in time_str.split(':'))
    return seconds + 60 * (minutes + 60 * hours)

In [9]:
## Convert every 'Difference' time string to whole seconds with time_to_sec
df['Difference'] = df['Difference'].map(time_to_sec)

In [10]:
## Count the null values per column and list only the columns that have any
total = df.isna().sum().sort_values(ascending=False)
missing_data = total[total > 0].to_frame(name='Total')
missing_data

Unnamed: 0,Total
Buys,8740033


In [11]:
## Copy column 'Buys' into a new column 'Total_Buys', replacing its NaN values with 0
## (in the preview, NaN appears on the rows that have no purchase value)
df['Total_Buys'] = df['Buys'].fillna(0)

In [12]:
## Transform column 'Buys' to binary where 1: buy, 0: no buy
## Any non-zero 'Total_Buys' becomes 1; the purchase count itself stays in 'Total_Buys'
df['Buys'] = df['Total_Buys'].astype(bool).astype(int)

In [13]:
## Preview the result: 'Difference' in seconds, binary 'Buys' and the 'Total_Buys' count
df.head(10)

Unnamed: 0,Session,Clicks,Difference,Buys,Total_Buys
0,1.0,4.0,351,0,0.0
1,2.0,6.0,359,0,0.0
2,3.0,3.0,746,0,0.0
3,4.0,2.0,1035,0,0.0
4,6.0,2.0,246,0,0.0
5,7.0,2.0,12,0,0.0
6,8.0,2.0,134,0,0.0
7,9.0,3.0,169,0,0.0
8,11.0,12.0,784,1,2.0
9,12.0,2.0,179,1,1.0


In [14]:
## Check the dtypes again ('Difference' and 'Buys' are now integer columns)
df.dtypes

Session       float64
Clicks        float64
Difference      int64
Buys            int32
Total_Buys    float64
dtype: object

In [15]:
## Keep the final dataset as origin_df for later use
## .copy() makes origin_df an independent frame, so dropping 'Buys' from it later
## neither touches df nor operates on a selection taken from df
origin_df = df[['Clicks', 'Difference', 'Buys']].copy()
## df will be used for creating train and test datasets for our models

In [16]:
## Preview the first 15 rows of origin_df
origin_df.head(15)

Unnamed: 0,Clicks,Difference,Buys
0,4.0,351,0
1,6.0,359,0
2,3.0,746,0
3,2.0,1035,0
4,2.0,246,0
5,2.0,12,0
6,2.0,134,0
7,3.0,169,0
8,12.0,784,1
9,2.0,179,1


In [17]:
## Keep the target column ('Buys') in a separate variable as a NumPy array
y = df['Buys'].values

In [18]:
## Drop the target column (and the raw purchase count) from the dataframe in order to
## split the data to train and test datasets
## A new frame is assigned instead of dropping in place, so the object that earlier
## cells displayed is not silently modified
df = df.drop(columns=['Buys', 'Total_Buys'])

In [19]:
## Remove the column 'Session' - not needed in our analysis
## A new frame is assigned instead of dropping in place, so the object that earlier
## cells displayed is not silently modified
df = df.drop(columns=['Session'])

In [20]:
## Preview the feature frame: only 'Clicks' and 'Difference' remain
df.head(5)

Unnamed: 0,Clicks,Difference
0,4.0,351
1,6.0,359
2,3.0,746
3,2.0,1035
4,2.0,246


In [21]:
## Import the necessary library
from sklearn.model_selection import train_test_split

In [22]:
## We used train_test_split function from Scikit-Learn library in order to split our data to train and test
## 33% of the rows go to the test set; random_state fixes the shuffle so the split is repeatable
## NOTE(review): the target is heavily imbalanced (8,740,033 rows had no 'Buys' value) and the
## split is not stratified - consider passing stratify=y so both parts keep the same buy ratio
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.33, random_state=42)

In [23]:
## Confirm the shapes of the train/test features and targets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6197318, 2), (3052411, 2), (6197318,), (3052411,))

In [24]:
## Import the necessary libraries for our models
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
## pip install xgboost (Anaconda prompt)
import xgboost as xgb
## pip install lightgbm (Anaconda prompt)
import lightgbm as lgb

In [25]:
from sklearn.linear_model import LogisticRegression

## Fit a logistic regression (default settings) on the training split,
## then predict the labels of the test split
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
pred = logreg.predict(X_test)

In [26]:
## Mean accuracy of the logistic regression on the test split
## NOTE(review): about 94.5% of all sessions have no purchase (8,740,033 of 9,249,729 rows),
## so an accuracy near 0.94 is roughly what always predicting "no buy" would reach -
## consider precision/recall or ROC AUC to judge the model
logreg.score(X_test, y_test)

0.9435652669316157

In [27]:
from sklearn.ensemble import RandomForestClassifier

## Fit a random forest on the training split, then predict the labels of the test split
## random_state pins the forest's bootstrap sampling and feature choices so a re-run gives the
## same model (the score can differ slightly from one obtained with an unseeded forest)
randfor = RandomForestClassifier(random_state=42).fit(X_train,y_train)
pred = randfor.predict(X_test)

In [28]:
## Mean accuracy of the random forest on the test split
randfor.score(X_test, y_test)

0.9422813638137197

In [29]:
## Import the necessary libraries for our neural network model
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


In [30]:
# Set random seed for reproducibility of results
# The same value is reused as random_state of the cross-validation splitter further down
# NOTE(review): this seeds NumPy's global generator only; presumably the Keras/TensorFlow
# backend keeps its own random state and needs separate seeding - TODO confirm
seed = 7
np.random.seed(seed)

In [31]:
## Preview origin_df, the frame kept aside for the neural network
origin_df.head(10)

Unnamed: 0,Clicks,Difference,Buys
0,4.0,351,0
1,6.0,359,0
2,3.0,746,0
3,2.0,1035,0
4,2.0,246,0
5,2.0,12,0
6,2.0,134,0
7,3.0,169,0
8,12.0,784,1
9,2.0,179,1


In [32]:
## Keep the binary target ('Buys') of origin_df in Y as a NumPy array
Y = origin_df['Buys'].values

In [33]:
## Inspect the target array
Y

array([0, 0, 0, ..., 0, 0, 0])

In [34]:
## Drop 'Buys' column from the dataset since we have kept the information in Y
## Reassign instead of dropping in place: origin_df was created by selecting columns from df,
## and in-place edits on such a selection can trigger pandas' SettingWithCopyWarning
origin_df = origin_df.drop(columns=['Buys'])

In [35]:
## Check that only the two numeric feature columns are left
origin_df.dtypes

Clicks        float64
Difference      int64
dtype: object

In [36]:
## X will be the input data to our neural network:
## the remaining origin_df columns ('Clicks', 'Difference') as a 2-D NumPy array
X = origin_df.values

In [37]:
## Inspect the feature array
X

array([[4.000e+00, 3.510e+02],
       [6.000e+00, 3.590e+02],
       [3.000e+00, 7.460e+02],
       ...,
       [3.000e+00, 1.563e+03],
       [1.000e+00, 0.000e+00],
       [1.000e+00, 0.000e+00]])

In [38]:
## Baseline neural network used as build function for the Keras classifier
def nn_model():
    """Build and compile the baseline binary classifier.

    The network takes 2 input features, has one hidden Dense layer with
    2 ReLU units and a single sigmoid output unit. It is compiled with
    binary cross-entropy loss, the Adam optimizer and accuracy as metric.

    Returns the compiled Sequential model.
    """
    hidden_layer = Dense(2, input_dim=2, kernel_initializer='normal', activation='relu')
    output_layer = Dense(1, kernel_initializer='normal', activation='sigmoid')
    network = Sequential([hidden_layer, output_layer])
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [None]:
## Evaluate model with 5-fold stratified cross-validation and report mean/std accuracy
## batch_size = 5 meant roughly 1.5 million weight updates per epoch in every fold
## (about 7.4M training rows per fold), and the saved notebook shows no result for this cell.
## A larger mini-batch keeps the run tractable; tune the value if needed.
estimator = KerasClassifier(build_fn = nn_model, epochs = 5, batch_size = 1024, verbose = 0)
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = seed)
results = cross_val_score(estimator, X, Y, cv = kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
