In [1]:
import pandas as pd
import numpy as np
import seaborn as sns # i like seaborn a bit more than matplotlib, easier to me
import tensorflow as tf



In [2]:
## load the raw data and take a first look

# path is relative to the notebook's working directory
csv_path = "shopping.csv"
df = pd.read_csv(csv_path)

# preview the first rows, then list the distinct visitor categories
print(df.head())
print(df['VisitorType'].unique())


   Administrative  Administrative_Duration  Informational  \
0               3               142.500000              0   
1               6               437.391304              2   
2               1                41.125000              0   
3               2               141.000000              0   
4              18               608.140000              6   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                    0.00              48              1052.255952   
1                  235.55              83              2503.881781   
2                    0.00             126              4310.004668   
3                    0.00              10               606.666667   
4                  733.80             168              4948.398759   

   BounceRates  ExitRates  PageValues  SpecialDay Month  OperatingSystems  \
0     0.004348   0.013043    0.000000         0.0   Nov                 1   
1     0.002198   0.004916    2.086218         0.0   Mar   

In [3]:
# lookup tables that turn the string categories into integer codes

# month label -> calendar position (1-12)
# NOTE(review): "June" is spelled out while every other label is a three-letter
# abbreviation -- confirm this matches the labels that actually occur in the CSV.
months = dict(zip(
    ("Jan", "Feb", "Mar", "Apr", "May", "June",
     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
    range(1, 13),
))

# visitor category -> arbitrary integer id
visitor_types = dict(zip(
    ("Returning_Visitor", "New_Visitor", "Other"),
    range(1, 4),
))

In [4]:
# Encode the categorical string columns as integer codes.
# The dtype guard makes this cell safe to re-run: calling .map() on a column that
# is already encoded finds no matching keys and would turn every value into NaN.
for column, mapping in (('Month', months), ('VisitorType', visitor_types)):
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(mapping)
    # .map() yields NaN for any label missing from the mapping; fail here with a
    # clear message instead of letting NaNs flow into the later cells
    assert df[column].notna().all(), f"unmapped values in column {column}"

# cool trick, thanks stackoverflow. this turns bools into 1s and 0s
# https://stackoverflow.com/questions/17383094/how-can-i-map-true-false-to-1-0-in-a-pandas-dataframe
# df["Weekend"] = df["Weekend"].astype(int)
# df["Revenue"] = df["Revenue"].astype(int)

print(df.head(3))

# visual 
# commented bc this takes a while to run, like 90 seconds
# sns.pairplot(df, hue="Revenue")

   Administrative  Administrative_Duration  Informational  \
0               3               142.500000              0   
1               6               437.391304              2   
2               1                41.125000              0   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                    0.00              48              1052.255952   
1                  235.55              83              2503.881781   
2                    0.00             126              4310.004668   

   BounceRates  ExitRates  PageValues  SpecialDay  Month  OperatingSystems  \
0     0.004348   0.013043    0.000000         0.0     11                 1   
1     0.002198   0.004916    2.086218         0.0      3                 2   
2     0.000688   0.012823    3.451072         0.0     11                 2   

   Browser  Region  TrafficType  VisitorType  Weekend  Revenue  
0        8       6           11            1    False    False  
1        2       3            2

### Now our data is formatted, let's get silly

We are here to predict the *Revenue* column: if it is true (1), that means the user made a purchase. We will predict it using linear regression.
Let's start by implementing functions we're familiar with, and that are specified in the assignment.

We will start with the building-block functions. Normalization is a good place to start, since some of the data is kinda wacky.


In [5]:
"""
These are the three common scaling methods used in machine learning (also the three specified in the assignment lolz).
Scaling is important because it not only helps speed up the training process, but also keeps features with large raw values from dominating (biasing) the model.
"""

# pep 3107 i love you
def z_score_scaling(X: np.array) -> np.array:
    """
    Standardize every feature (column) of X with the z-score formula.

    Each column is shifted by its own mean and divided by its own population
    standard deviation, so every non-constant column ends up with mean 0 and
    standard deviation 1.

    Parameters:
        X: 1-D or 2-D array (or DataFrame) of features, one sample per row.

    Returns:
        An object of the same shape as X holding the scaled values.
    """
    # axis=0 -> one statistic per column; without it numpy reduces over the
    # whole array and every feature would share a single mean / std
    means = np.mean(X, axis=0)
    stds = np.std(X, axis=0)    # standard deviation

    # a constant column has std 0; divide by 1 instead so it maps to all zeros
    # rather than NaN / inf
    safe_stds = np.where(stds == 0, 1, stds)

    # apply the z-score scaling formula
    X_scaled = (X - means) / safe_stds
    return X_scaled

def min_max_scaling(X: np.array) -> np.array:
    """
    Rescale every feature (column) of X into the range [0, 1].

    Within each column the smallest value maps to 0 and the largest maps to 1.
    A column whose values are all identical maps to all zeros.

    Parameters:
        X: 1-D or 2-D array (or DataFrame) of features, one sample per row.

    Returns:
        An object of the same shape as X holding the scaled values.
    """
    # axis=0 -> one min / max per column; without it numpy reduces over the
    # whole array and the largest feature would squash all the others
    mins = np.min(X, axis=0)
    maxes = np.max(X, axis=0)

    # a constant column has range 0; divide by 1 instead to avoid 0 / 0 = NaN
    ranges = maxes - mins
    safe_ranges = np.where(ranges == 0, 1, ranges)

    # apply the min-max scaling formula
    X_scaled = (X - mins) / safe_ranges

    return X_scaled

def mean_normalization(X: np.array) -> np.array:
    """
    Center every feature (column) of X on its own mean.

    Each column has its mean subtracted, so every column of the result has
    mean 0. The spread of the values is left untouched: no division by the
    range or standard deviation is applied here.

    Parameters:
        X: 1-D or 2-D array (or DataFrame) of features, one sample per row.

    Returns:
        An object of the same shape as X holding the centered values.
    """
    # axis=0 -> one mean per column; without it numpy averages over the whole
    # array and every feature would be shifted by the same global mean
    means = np.mean(X, axis=0)

    # subtract each column's mean from that column
    X_scaled = X - means
    return X_scaled


# get some info to determine 
# print(df.describe())

In [6]:
# columns to normalize: every page category contributes its base column and the
# matching "_Duration" column, followed by the remaining continuous metrics
columns_to_normalize = [
    f"{page_category}{suffix}"
    for page_category in ("Administrative", "Informational", "ProductRelated")
    for suffix in ("", "_Duration")
] + ["BounceRates", "ExitRates", "PageValues", "SpecialDay"]

# now our data is ready to be normalized, let's start moving into the ML aspect

We want to predict the revenue in y, so let's start by splitting there.

In [7]:
# mean squared error, gives us the loss
# implementing here as to not make the next cell too long 
def mean_squared_error(act: np.array, pred: np.array) -> float:
    """
    Return the mean of the squared element-wise differences between pred and act.

    Parameters:
        act: the actual (target) values.
        pred: the predicted values, broadcastable against act.
    """
    residuals = pred - act
    return (residuals ** 2).mean()


# we want to determine wether the prediction is confident enough, using this threshold
def accuracy(y_pred: np.array, y_test: np.array, threshold=0.5) -> float:
    """
    Return the fraction of thresholded predictions that match the true labels.

    Parameters:
        y_pred: continuous prediction scores, one per sample. May be a column
            vector of shape (n, 1), e.g. the output of a single-unit model.
        y_test: the true 0/1 labels, one per sample.
        threshold: scores strictly greater than this count as class 1.

    Returns:
        correct predictions / total predictions, as a float.

    Raises:
        ValueError: if y_pred and y_test hold a different number of samples.
    """
    # flatten both sides first: comparing an (n, 1) column against an (n,)
    # vector would broadcast to an (n, n) matrix and count n*n comparisons
    scores = np.asarray(y_pred).ravel()
    labels = np.asarray(y_test).ravel()
    if scores.size != labels.size:
        raise ValueError(
            f"y_pred has {scores.size} values but y_test has {labels.size}"
        )

    # if the prediction is greater than the threshold, default .5, it is a 1, else a 0
    # https://stackoverflow.com/questions/43672047/convert-probability-vector-into-target-vector-in-python
    predicted_labels = np.where(scores > threshold, 1, 0)

    correct_prediction = np.sum(predicted_labels == labels)
    total_prediction = labels.size

    return float(correct_prediction / total_prediction)
    
    
    
    


In [10]:
# make the run repeatable: fixes the random weight initialisation
tf.random.set_seed(42)

# make sure everything is numeric.
# float (not int) on purpose: it turns the boolean columns (Weekend, Revenue) into
# 1.0 / 0.0 but keeps the fractional features intact. astype(int) would truncate
# values such as BounceRates = 0.004348 down to 0.
df = df.astype(float)
assert not df.isna().any().any(), "NaN values present - check the category mappings"

# split into target and features
y = df['Revenue']
X = df.drop('Revenue', axis=1) # axis = 1 drops column, axis = 0 drops row

# scale the data using one of the three defined scaling methods
# !!! this is where we can change for later testing
# NOTE(review): the scaling statistics are computed on all rows, so the test rows
# influence them; fit the scaler on the training rows only for a stricter evaluation
X = min_max_scaling(X)


# we can split into training and testing sets here, i dont think we have to cross validate so 80/20 split, where 80% is training and 20% is testing
split_80 = int(len(X) * 0.8)
X_train = X[:split_80]
X_test = X[split_80:]

Y_train = y[:split_80]
Y_test = y[split_80:]

# print(X_train.shape, Y_train.shape) # sanity check: the row counts should match


# define the model
model = tf.keras.Sequential([
    
    # units 1 because we are predicting a binary value (the revenue, binary)
    # input dim gives the number of features
    # and we are using a linear activation function
    
    tf.keras.layers.Dense(1, input_dim=X_train.shape[1], activation='linear')
])
model.summary()

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model; keep the returned History so the loss curves can be plotted below
history = model.fit(X_train, Y_train, epochs=100, batch_size=32, validation_split=0.2)

# Evaluate the model
loss = model.evaluate(X_test, Y_test)
print(f'Test Loss: {loss}')

# Turn the continuous outputs into 0/1 predictions and score them.
# predict() returns one column per output unit, so flatten it to line up with the labels.
y_pred_test = model.predict(X_test).ravel()
test_accuracy = accuracy(y_pred_test, Y_test.to_numpy())
print(f'Test Accuracy: {test_accuracy}')

# import matplotlib.pyplot as plt
# plt.plot(history.history['loss'], label='Training Loss')
# plt.plot(history.history['val_loss'], label='Validation Loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.title('Training and Validation Loss Over Epochs')
# plt.show()



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1 (Dense)             (None, 1)                 18        
                                                                 
Total params: 18 (72.00 Byte)
Trainable params: 18 (72.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/10

In [9]:

# # make sure eveyrhting is an int quickly
# df = df.astype(int)

# # split into target and featuresr
# y = df['Revenue'] 
# X = df.drop('Revenue', axis=1) # axis = 1 drops column, axis = 0 drops row


# # we can split into training and testing sets here, i dont think we have to cross validate so 80/20 split, where 80% is training and 20% is testing
# split_80 = int(len(X) * 0.8)
# X_train = X[:split_80]
# X_test = X[split_80:]

# Y_train = y[:split_80]
# Y_test = y[split_80:]

# def manual_standardize(X_train, X_test):
#     # Calculate mean and standard deviation from the training set
#     mean = np.mean(X_train, axis=0)
#     std = np.std(X_train, axis=0)

#     # Standardize the training and test sets
#     X_train_standardized = (X_train - mean) / std
#     X_test_standardized = (X_test - mean) / std

#     return X_train_standardized, X_test_standardized

# X_train, X_test = manual_standardize(X_train, X_test)


# def linear_regression(X_train: np.array, Y_train: np.array, X_test: np.array,
#                       learning_rate: float, iterations: int, regularization_param: float) -> np.array:  
     
#     m, n = X_train.shape # m samples, n features
#     weights = np.zeros(n) # initalize weights to zeros
#     bias = 0 # the b in wx + b, bias is universal 

#     for iteration in range(iterations):
#         # calculate predictions
#         y_pred = np.dot(X_train, weights) + bias

#         # calculate gradients
#         gradient_weights = (1 / m) * np.dot(X_train.T, (y_pred - Y_train)) + (regularization_param / m) * weights
#         gradient_bias = (1 / m) * np.sum(y_pred - Y_train)

#         # update weights and bias
#         weights -= learning_rate * gradient_weights
#         bias -= learning_rate * gradient_bias

#         # calculate and print the loss
#         loss = (1 / (2 * m)) * np.sum((y_pred - Y_train) ** 2) + (regularization_param / (2 * m)) * np.sum(weights ** 2)
#         print(f"Iteration {iteration}: Loss = {loss}")

#     # Make predictions on the test set
#     y_pred_test = np.dot(X_test, weights) + bias

#     return y_pred_test

# # Adjust learning rate and regularization parameter
# learning_rate = 0.0001
# regularization_param = 0.01
# iteartions = 1000

# y_pred_test = linear_regression(X_train, Y_train, X_test,learning_rate, iteartions, regularization_param)

# # we can interpert the results as the probability of a purchase
# def accuracy(y_pred, y_test, threshold=0.5):
#     # our threshold in 50%, so anything above 50% is a purchase to our model
#     y_pred_binary = (y_pred >= threshold).astype(int)
    
#     # calculate accuracy

#     return np.mean(y_pred_binary == y_test), y_pred_binary

# x, y = accuracy(y_pred_test, Y_test)
# print(f"Accuracy: {x}")
# # visual
# print(y_pred_test)