In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [32]:
# Load the raw e-commerce table from the CSV in the working directory.
df = pd.read_csv("ecommerce_data.csv")
# Preview the first rows (head() shows 5 by default).
df.head()

Unnamed: 0,is_mobile,n_products_viewed,visit_duration,is_returning_visitor,time_of_day,user_action
0,1,0,0.65751,0,3,0
1,1,1,0.568571,0,2,1
2,1,0,0.042246,1,1,0
3,1,1,1.659793,1,1,2
4,0,1,2.014745,1,1,2


In [33]:
# (rows, columns) of the loaded table.
df.shape

(500, 6)

In [34]:
# Pull the table out as a NumPy array. DataFrame.as_matrix() was deprecated and
# later removed from pandas; .values works, and DataFrame.to_numpy() is the
# spelling the pandas documentation now recommends.
A = df.values
A

array([[1.        , 0.        , 0.65750995, 0.        , 3.        ,
        0.        ],
       [1.        , 1.        , 0.56857123, 0.        , 2.        ,
        1.        ],
       [1.        , 0.        , 0.042246  , 1.        , 1.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.1728534 , 1.        , 3.        ,
        0.        ],
       [1.        , 0.        , 0.2099644 , 0.        , 3.        ,
        0.        ],
       [0.        , 0.        , 2.61688195, 1.        , 3.        ,
        0.        ]])

## Convert the time_of_day column into 4 one-hot columns and normalize n_products_viewed and visit_duration

## Then split into testing and training sets.

In [35]:
def get_data():
    """Load the e-commerce CSV and return a shuffled, preprocessed train/test split.

    Steps:
      1. Read ``ecommerce_data.csv`` from the working directory and shuffle
         its rows in place (uses NumPy's global RNG; no seed is set here).
      2. Treat the last column as the label ``Y`` and the remaining ``D``
         columns as features ``X``.
      3. Replace the last feature column (``time_of_day``, categories 0-3)
         with four one-hot columns, so ``D`` features become ``D + 3``.
      4. Hold out the final 100 shuffled rows as the test set.
      5. Standardize columns 1 and 2 (``n_products_viewed`` and
         ``visit_duration``) using the mean and standard deviation of the
         training rows only, and apply the same statistics to the test rows.

    Returns:
        Xtrain, Ytrain, Xtest, Ytest: feature matrices of shape
        ``(N - 100, D + 3)`` / ``(100, D + 3)`` and the matching label vectors.

    Note:
        The file must hold more than 100 rows, otherwise the training split
        is empty and the normalization statistics are undefined.
    """
    df = pd.read_csv("ecommerce_data.csv")
    data = df.values

    # Shuffle rows in place so the tail used as the test set is a random sample.
    np.random.shuffle(data)

    # Features are every column but the last; the label is the last column.
    X = data[:, :-1]
    Y = data[:, -1]

    # Room for the D - 1 pass-through columns plus 4 one-hot columns
    # (D - 1 + 4 = D + 3).
    N, D = X.shape
    X2 = np.zeros((N, D + 3))
    X2[:, :D - 1] = X[:, :D - 1]

    # One-hot encode the last feature column (time_of_day): category k sets
    # column (D - 1) + k of that row to 1.
    category = X[:, D - 1].astype(int)
    X2[np.arange(N), (D - 1) + category] = 1

    # Hold out the last 100 rows. The slices are views of X2, so the in-place
    # normalization below writes straight into them without extra copies.
    Xtrain, Xtest = X2[:-100], X2[-100:]
    Ytrain, Ytest = Y[:-100], Y[-100:]

    # Standardize columns 1 and 2 with training statistics only, so no
    # information from the test rows leaks into the scaling.
    for i in (1, 2):
        m = Xtrain[:, i].mean()
        s = Xtrain[:, i].std()
        if s == 0:
            # A constant column would divide by zero and fill the data with
            # NaN/inf; leave it centered at 0 instead.
            s = 1.0
        Xtrain[:, i] = (Xtrain[:, i] - m) / s
        Xtest[:, i] = (Xtest[:, i] - m) / s

    return Xtrain, Ytrain, Xtest, Ytest

In [36]:
# Build the shuffled train/test split and peek at the first five training labels.
Xtrain, Ytrain, Xtest, Ytest = get_data()
Ytrain[0:5]

array([0., 1., 1., 0., 2.])

## Now keep only the rows whose label (user_action) is 0 or 1

In [37]:
# for logistic regression (binary classification), we need to return only the binary-labelled data

def get_binary_data():
    """Return the get_data() split restricted to rows whose label is <= 1.

    Returns:
        X2train, Y2train, X2test, Y2test: the train and test features and
        labels with every row labelled above 1 filtered out.
    """
    Xtrain, Ytrain, Xtest, Ytest = get_data()

    # Boolean row selectors, computed once per split and reused for X and Y.
    keep_train = Ytrain <= 1
    keep_test = Ytest <= 1

    return (
        Xtrain[keep_train],
        Ytrain[keep_train],
        Xtest[keep_test],
        Ytest[keep_test],
    )

In [38]:
# Restrict both splits to labels 0 and 1; together they hold the 398 rows
# whose user_action is 0 or 1 (how they divide depends on the shuffle).
X2train, Y2train, X2test, Y2test = get_binary_data()
print(X2train.shape, X2test.shape, sep="\n")

(321, 8)
(77, 8)


## Make predictions

In [39]:
# Use only the binary (label 0 / 1) training split; the test split is discarded here.
X, Y, _, _ = get_binary_data()

# randomly initialize weights
D = X.shape[1]  # number of feature columns (8 after one-hot encoding); the row count varies with the shuffle
print("D:",D)
W = np.random.randn(D)  # one weight per feature column, drawn from a standard normal
print("W:",W)
b = 0 # bias term

# make predictions
def sigmoid(a):
    """Logistic function ``1 / (1 + exp(-a))``, applied element-wise.

    The value is computed from ``exp(-|a|)``, whose exponent is never
    positive, so large-magnitude inputs saturate to 0 or 1 without
    overflowing ``np.exp``.

    Args:
        a: scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an ndarray with the same
        shape as ``a``, with values in [0, 1].
    """
    a = np.asarray(a)
    e = np.exp(-np.abs(a))  # always within [0, 1], so it cannot overflow
    # a >= 0: 1 / (1 + exp(-a));  a < 0: exp(a) / (1 + exp(a)) -- the same
    # function written so that only exp of a non-positive number is needed.
    out = np.where(a >= 0, 1 / (1 + e), e / (1 + e))
    # Hand back a scalar when a scalar was passed in.
    return out[()] if out.ndim == 0 else out

def forward(X, W, b):
    """Return sigmoid(X.dot(W) + b): the model output for each row of X.

    Args:
        X: feature matrix, one row per sample.
        W: weight vector, one entry per feature column.
        b: bias added to every row's weighted sum.
    """
    activation = X.dot(W) + b
    return sigmoid(activation)

# Sigmoid output for every training row under the random (untrained) weights,
# then rounded to the nearest integer to get a hard 0/1 prediction.
P_Y_given_X = forward(X, W, b)
predictions = np.round(P_Y_given_X)

# calculate the accuracy
def classification_rate(Y, P):
    """Return the fraction of positions where targets Y equal predictions P."""
    matches = Y == P
    return np.mean(matches)

# Fraction of training rows classified correctly. The weights are random and
# untrained, so this is only a baseline and changes from run to run.
print("Score:", classification_rate(Y, predictions))

D: 8
W: [-0.83822068 -1.91705617  0.06426527  0.23047315 -1.79651951  0.17925194
 -1.25729148 -0.28822038]
Score: 0.24126984126984127
