In [173]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense


In [174]:
## Load the raw shopping-session data and take a first look at it

df = pd.read_csv("shopping.csv")

# Show the distinct visitor categories first, then the numeric summary table.
for overview in (df["VisitorType"].unique(), df.describe()):
    print(overview)


['Returning_Visitor' 'New_Visitor' 'Other']
       Administrative  Administrative_Duration  Informational  \
count     5000.000000              5000.000000    5000.000000   
mean         2.295000                79.828436       0.505600   
std          3.329954               178.029543       1.303652   
min          0.000000                 0.000000       0.000000   
25%          0.000000                 0.000000       0.000000   
50%          1.000000                 6.000000       0.000000   
75%          4.000000                93.700000       0.000000   
max         24.000000              3398.750000      24.000000   

       Informational_Duration  ProductRelated  ProductRelated_Duration  \
count             5000.000000     5000.000000              5000.000000   
mean                35.912869       31.885000              1197.255606   
std                151.439339       45.036099              2100.700466   
min                  0.000000        0.000000                 0.000000   


In [175]:
# Lookup tables that turn the string-valued columns into integer codes,
# as described in the assignment.

# Month label -> calendar month number. The key for June is spelled out in
# full ("June") while every other month uses a three-letter abbreviation.
months = dict(zip(
    ("Jan", "Feb", "Mar", "Apr", "May", "June",
     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
    range(1, 13),
))

# Visitor category label -> integer code.
visitor_types = dict(Returning_Visitor=1, New_Visitor=2, Other=3)

In [176]:
# Encode the string columns as integers.
# The dtype guard keeps this cell safe to re-run: pushing an already-encoded
# (numeric) column through a dict keyed by strings would find no matches and
# silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df["Month"]):
    df["Month"] = df["Month"].map(months)
if not pd.api.types.is_numeric_dtype(df["VisitorType"]):
    df["VisitorType"] = df["VisitorType"].map(visitor_types)

# super cool way to convert bools to ints
# https://stackoverflow.com/questions/17383094/how-can-i-map-true-false-to-1-0-in-a-pandas-dataframe
df["Weekend"] = df["Weekend"].astype(int)
df["Revenue"] = df["Revenue"].astype(int)

print(df.head(3))

# visual
# commented out because this takes a while to run, like 90 seconds
# (seaborn would also have to be imported as `sns` first)
# sns.pairplot(df, hue="Revenue")

   Administrative  Administrative_Duration  Informational  \
0               3               142.500000              0   
1               6               437.391304              2   
2               1                41.125000              0   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                    0.00              48              1052.255952   
1                  235.55              83              2503.881781   
2                    0.00             126              4310.004668   

   BounceRates  ExitRates  PageValues  SpecialDay  Month  OperatingSystems  \
0     0.004348   0.013043    0.000000         0.0     11                 1   
1     0.002198   0.004916    2.086218         0.0      3                 2   
2     0.000688   0.012823    3.451072         0.0     11                 2   

   Browser  Region  TrafficType  VisitorType  Weekend  Revenue  
0        8       6           11            1        0        0  
1        2       3            2

### Now our data is formatted, let's get silly

We are here to predict the *Revenue* column: if it is true (1), then the user made a purchase. We predict it using logistic regression.
Let's start by implementing functions we're familiar with, and that are specified in the assignment.

We will start with the building-block functions. Normalization is a good place to start, as some of the data is kinda wacky.


In [177]:
"""
These are three common scaling methods used in machine learning (also the three specified in the assignment, lolz).
Scaling is important because it not only speeds up the training process, but also reduces the model's bias toward features with large numeric ranges.
"""

# pep 3107 i love you
def z_score_scaling(X: pd.Series) -> pd.Series:
    """
    Standardize one feature column with the z-score formula (X - mean) / std.

    The standard deviation comes from np.std with its default ddof=0,
    i.e. the population standard deviation.

    Args:
      X : (pd.Series) numeric feature values

    Returns:
      X_scaled : (pd.Series) values shifted to mean 0 and scaled to standard
                 deviation 1. A constant column (std == 0) comes back as all
                 zeros instead of NaN.
    """
    mean = np.mean(X)
    std = np.std(X)

    # A constant feature has zero spread; dividing by it would give 0/0 = NaN
    # for every row, so fall back to a unit divisor.
    if np.ndim(std) == 0 and std == 0:
        std = 1.0

    return (X - mean) / std

def min_max_scaling(X: pd.Series) -> pd.Series:
    """
    Rescale one feature column to the range [0, 1] with the min-max formula
    (X - min) / (max - min).

    Args:
      X : (pd.Series) numeric feature values

    Returns:
      X_scaled : (pd.Series) the previous lowest value becomes 0 and the
                 previous highest becomes 1. A constant column (max == min)
                 comes back as all zeros instead of NaN.
    """
    low = np.min(X)
    span = np.max(X) - low

    # A constant feature has max == min; dividing by that zero range would
    # give 0/0 = NaN for every row, so fall back to a unit divisor.
    if np.ndim(span) == 0 and span == 0:
        span = 1.0

    return (X - low) / span

def mean_normalization(X: pd.Series) -> pd.Series:
    """
    Scale one feature column with the mean normalization formula
    (X - mean) / (max - min).

    Subtracting the mean alone only centers the feature; dividing by the
    range is what brings differently sized features onto a comparable scale.

    Args:
      X : (pd.Series) numeric feature values

    Returns:
      X_scaled : (pd.Series) values centered on 0 whose total spread
                 (max - min) is 1. A constant column (max == min) comes back
                 as all zeros instead of NaN.
    """
    mean = np.mean(X)
    span = np.max(X) - np.min(X)

    # A constant feature has max == min; dividing by that zero range would
    # give 0/0 = NaN for every row, so fall back to a unit divisor.
    if np.ndim(span) == 0 and span == 0:
        span = 1.0

    return (X - mean) / span


def normalize_data(df: pd.DataFrame, columns: list, normMethod: str) -> pd.DataFrame:
    """
    Normalize the given columns of a dataframe with the chosen method.

    The columns are overwritten on the dataframe that is passed in, and that
    same dataframe is returned.

    Args:
      df         : (pd.DataFrame) data holding the columns to normalize
      columns    : (list of str)  names of the columns to normalize
      normMethod : (str)          "z" (z-score), "mean" (mean normalization)
                                  or "min_max" (min-max scaling)

    Returns:
      df : (pd.DataFrame) the input dataframe with the listed columns scaled

    Raises:
      ValueError : if normMethod is not one of the three supported names.
                   Nothing is modified in that case.
    """
    # Resolve the scaler before touching any column, so an unknown method
    # fails loudly instead of returning None and wiping out the caller's
    # dataframe in `df = normalize_data(...)`.
    if normMethod == "z":
        scaler = z_score_scaling
    elif normMethod == "mean":
        scaler = mean_normalization
    elif normMethod == "min_max":
        scaler = min_max_scaling
    else:
        raise ValueError(
            f"Invalid normalization method {normMethod!r}; "
            "expected 'z', 'mean', or 'min_max'"
        )

    for column in columns:
        df[column] = scaler(df[column])
    return df

In [178]:
# Columns to normalize.
# We normalize everything that is not binary (as stated in class).
# VisitorType, Weekend and Revenue are not in this list.
columns_to_normalize = [
    "Administrative",
    "Administrative_Duration",
    "Informational",
    "Informational_Duration",
    "ProductRelated",
    "ProductRelated_Duration",
    "BounceRates",
    "ExitRates",
    "PageValues",
    "SpecialDay",
    "Browser",
    "Region",
    "TrafficType",
    "Month", # open question: does Month need to be normalized?
    "OperatingSystems"
]

# Print the frame before and after the normalization step so the effect is visible.
print(df.head())
print("-----------------")
## THIS IS THE PART WHERE WE NORMALIZE THE DATA
# The call is currently commented out, so both prints show the same unnormalized data.
# Uncomment it and pick one method per run: "z", "mean", or "min_max".
# df = normalize_data(df, columns_to_normalize, "min_max") # z, mean, or min_max
print(df.head(3))

# https://stackoverflow.com/questions/24761998/pandas-compute-z-score-for-all-columns
# z-score scaling using pandas
# really had to review what scaling vs normalization was
# df_zscore = (df - df.mean())/df.std()


   Administrative  Administrative_Duration  Informational  \
0               3               142.500000              0   
1               6               437.391304              2   
2               1                41.125000              0   
3               2               141.000000              0   
4              18               608.140000              6   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                    0.00              48              1052.255952   
1                  235.55              83              2503.881781   
2                    0.00             126              4310.004668   
3                    0.00              10               606.666667   
4                  733.80             168              4948.398759   

   BounceRates  ExitRates  PageValues  SpecialDay  Month  OperatingSystems  \
0     0.004348   0.013043    0.000000         0.0     11                 1   
1     0.002198   0.004916    2.086218         0.0      3

### Quick code snippets to help build our model

In [179]:

## following taken from logistic regression assignment

def sigmoid(z: np.ndarray) -> np.ndarray:
    """
    Apply the logistic function 1 / (1 + e^(-z)) element-wise.

    Args:
      z : (ndarray or scalar) input value(s)

    Returns:
      g : same shape as z, each entry squashed into the range 0..1
    """
    decay = np.exp(-z)
    return 1 / (1 + decay)

def compute_cost(X, y, w, b, *argv):
    """
    Computes the mean logistic (binary cross-entropy) cost over all examples.

    The cost is evaluated directly from the linear score z = X.w + b, so it
    stays finite even when the sigmoid of z would round to exactly 0 or 1.

    Args:
      X : (ndarray Shape (m,n)) data, m examples by n features
      y : (ndarray Shape (m,))  target value 
      w : (ndarray Shape (n,))  values of parameters of the model      
      b : (scalar)              value of bias parameter of the model
      *argv : unused, for compatibility with regularized version below
    Returns:
      total_cost : (scalar) cost 
    """
    m, _ = X.shape

    # raw linear score for every example
    z = np.dot(X, w) + b

    # With s = sigmoid(z), the per-example cost
    #     -y*log(s) - (1 - y)*log(1 - s)
    # simplifies algebraically to
    #     log(1 + e^z) - y*z
    # np.logaddexp(0, z) computes log(e^0 + e^z) without overflow, which
    # avoids the log(0) -> inf / 0*inf -> nan of the direct formula.
    per_example_cost = np.logaddexp(0, z) - y * z

    total_cost = np.sum(per_example_cost) / m
    return total_cost

    

def predict(X, w, b): 
    """
    Predict whether the label is 0 or 1 using learned logistic
    regression parameters w and b.

    Args:
      X : (ndarray Shape (m,n)) data, m examples by n features
          (a DataFrame is accepted and converted to an array)
      w : (ndarray Shape (n,))  values of parameters of the model      
      b : (scalar)              value of bias parameter of the model

    Returns:
      p : (ndarray (m,)) The predictions for X using a threshold at 0.5,
          stored as floats 0.0 / 1.0
    """
    X = np.asarray(X)
    m, n = X.shape  # also rejects anything that is not 2-D

    # linear score for every example in one matrix-vector product
    z_wb = np.dot(X, w) + b

    # sigmoid(z) >= 0.5 exactly when z >= 0, so the threshold can be applied
    # to the raw score; this also avoids overflow in exp() for extreme scores.
    p = (z_wb >= 0).astype(float)
    return p


def compute_gradient(X, y, w, b, *argv): 
    """
    Computes the gradient for logistic regression 
 
    Args:
      X : (ndarray Shape (m,n)) data, m examples by n features
      y : (ndarray Shape (m,))  target value 
      w : (ndarray Shape (n,))  values of parameters of the model      
      b : (scalar)              value of bias parameter of the model
      *argv : unused, for compatibility with regularized version below
    Returns (in this order)
      dj_db : (scalar)             The gradient of the cost w.r.t. the parameter b. 
      dj_dw : (ndarray Shape (n,)) The gradient of the cost w.r.t. the parameters w. 
    """
    X = np.asarray(X)
    y = np.asarray(y)
    m, n = X.shape

    # prediction error f_wb - y for every example at once
    errors = sigmoid(np.dot(X, w) + b) - y

    # dj_dw[j] is the average over examples of error_i * X[i, j];
    # dj_db is the average error.
    dj_dw = np.dot(errors, X) / m
    dj_db = np.sum(errors) / m

    return dj_db, dj_dw    

# Now our data is ready to be normalized, let's start moving into the ML aspect

We want to predict the revenue as y, so let's start splitting there.

In [180]:
import math

def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters, lambda_): 
    """
    Performs batch gradient descent to learn w and b. Updates them by taking 
    num_iters gradient steps with learning rate alpha, printing the cost at
    roughly ten evenly spaced iterations and at the final one.
    
    Args:
      X :    (ndarray Shape (m, n) data, m examples by n features
      y :    (ndarray Shape (m,))  target value 
      w_in : (ndarray Shape (n,))  Initial values of parameters of the model
      b_in : (scalar)              Initial value of parameter of the model
      cost_function :              function to compute cost, called as
                                   cost_function(X, y, w, b, lambda_)
      gradient_function :          function to compute gradient, called as
                                   gradient_function(X, y, w, b, lambda_) and
                                   returning (dj_db, dj_dw)
      alpha : (float)              Learning rate
      num_iters : (int)            number of iterations to run gradient descent
      lambda_ : (scalar, float)    regularization constant
      
    Returns:
      w : (ndarray Shape (n,)) Updated values of parameters of the model after
          running gradient descent
      b : (scalar)                Updated value of parameter of the model after
          running gradient descent
    """
    # Log about ten times over the whole run (every iteration if there are
    # fewer than ten). Only read inside the loop, so a run with
    # num_iters == 0 never divides by this zero.
    log_every = math.ceil(num_iters / 10)

    for i in range(num_iters):

        # Calculate the gradient and update the parameters
        dj_db, dj_dw = gradient_function(X, y, w_in, b_in, lambda_)   

        # Rebinding (rather than updating in place) leaves the caller's
        # initial arrays untouched.
        w_in = w_in - alpha * dj_dw               
        b_in = b_in - alpha * dj_db              

        # The cost is only needed for the progress line, so evaluate it there
        # with the parameters that were just updated; the printed value is
        # then always the current cost.
        if i % log_every == 0 or i == (num_iters - 1):
            cost = cost_function(X, y, w_in, b_in, lambda_)
            print(f"Iteration {i:4}: Cost {float(cost):8.2f}   ")
        
    return w_in, b_in

### Cool, so all the functions are in order, let's do the actual training part.

I just re-ran the entire notebook three different times, and added a new line to my spreadsheet each time :p no need to reinvent the wheel

In [181]:
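# NOTE(review): this whole cell is commented out, so it does not define
# X_train / X_test / y_train / y_test when the notebook is run top to bottom.
# # make sure everything is an int quickly
# # (careful: astype(int) truncates fractional columns such as BounceRates and ExitRates)
# df = df.astype(int)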

# # split into target and featuresr
# y = df["Revenue"]
# X = df.drop("Revenue", axis=1) # axis = 1 drops column, axis = 0 drops row


# # we can split into training and testing sets here, no cross validation so 80/20 split, where 80% is training and 20% is testing
# split_80 = int(len(X) * 0.8)
# X_train = X[:split_80].values
# X_test = X[split_80:].values

# y_train = y[:split_80].values
# y_test = y[split_80:].values

# print(X_train.shape)
# print(X_test.shape)
# print(y_train.shape)
# print(y_test.shape)


# np.random.seed(1)
# initial_w = 0.01 * (np.random.rand(X_train.shape[1]) - 0.5)
# initial_b = -8

# # Some gradient descent settings
# iterations = 10000
# alpha = 0
# lambda_ = 0 # we'll run 3 times with no lambda, and then thrice with lambda. each time with a different norm 

# # convert to numpy arrays, which allows for the dot product to actually happen (?) 
# # some silly stack overflow answer told me to do this
# n12 = np.squeeze(np.asarray(X_train))
# X12 = np.squeeze(np.asarray(y_train))

# w, b = gradient_descent(X_train, y_train, initial_w, initial_b, 
#                                       compute_cost, compute_gradient, alpha, iterations, lambda_)

# # figure out the accuracy of the model
# predictions = predict(X_test, w, b)
# correct = 0
# for i in range(len(predictions)):
#     if predictions[i] == y_test[i]:
#         correct += 1
 

# print(correct / len(predictions)) # accuracy of the model
# print(f"amount correct : {correct}")

# # god damn    
# # https://www.youtube.com/watch?v=NV9bGLBuJs8 check out goose theyre a great band

### Everything above this was going roughly; below is the second option VVVVVV

In [182]:


# Train/test split. The model below needs X_train / X_test / y_train / y_test,
# so build them here instead of relying on variables left over in the kernel.
# No cross validation: the first 80% of the rows train, the last 20% test.
y = df["Revenue"]
X = df.drop("Revenue", axis=1)  # axis=1 drops a column, axis=0 drops a row

split_80 = int(len(X) * 0.8)
# keep the features as floats so fractional columns are not truncated
X_train = X[:split_80].values.astype("float32")
X_test = X[split_80:].values.astype("float32")
y_train = y[:split_80].values
y_test = y[split_80:].values

# we'll make a sequential model, notably the last layer has 1 neuron with sigmoid activation
# the input width is taken from the data rather than hard-coded
shopping_model = Sequential()
shopping_model.add(Dense(16, input_dim=X_train.shape[1], activation="relu"))
shopping_model.add(Dense(8, activation="relu"))
shopping_model.add(Dense(1, activation="sigmoid"))


# adam optimizer
adam = keras.optimizers.Adam(learning_rate=0.001)

# compile the model
# binary cross-entropy is the matching loss for a single sigmoid output on a 0/1 target
shopping_model.compile(loss="binary_crossentropy", optimizer=adam, metrics=["accuracy"])

# and then fit the model
shopping_model.fit(X_train, y_train, epochs=100, verbose=1)

# this predicts the values of x test
predicted = shopping_model.predict(X_test)

# get the accuracy of the model
loss_and_metrics = shopping_model.evaluate(X_test, y_test)
print(loss_and_metrics)
print('Loss = ',loss_and_metrics[0])
print('Accuracy = ',loss_and_metrics[1])

# using a threshold of .5, turn the probabilities into 0/1 labels
# (same shape and dtype as the raw predictions), then count the matches
predicted = (predicted >= 0.5).astype(predicted.dtype)
correct = int(np.sum(predicted.ravel() == y_test))
print(f"correct = {correct}")

# this is so much easier than the hard way

# this is so much easier than the hard way



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78