In [1]:
import pandas as pd
import numpy as np

In [2]:
## quick data clean up
# This cell only loads the data and previews it; the actual encoding of the
# string columns happens in later cells.

# Read the dataset from shopping.csv (path is relative to the notebook's
# working directory) into the DataFrame `df`.
df = pd.read_csv("shopping.csv")
# Show the first five rows to get a feel for the columns and their types.
print(df.head())
# List the distinct VisitorType strings present in the data.
print(df['VisitorType'].unique())

   Administrative  Administrative_Duration  Informational  \
0               3               142.500000              0   
1               6               437.391304              2   
2               1                41.125000              0   
3               2               141.000000              0   
4              18               608.140000              6   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                    0.00              48              1052.255952   
1                  235.55              83              2503.881781   
2                    0.00             126              4310.004668   
3                    0.00              10               606.666667   
4                  733.80             168              4948.398759   

   BounceRates  ExitRates  PageValues  SpecialDay Month  OperatingSystems  \
0     0.004348   0.013043    0.000000         0.0   Nov                 1   
1     0.002198   0.004916    2.086218         0.0   Mar   

In [3]:
# Lookup tables used to turn the string-valued columns into integer codes.

# Month name -> calendar month number (1-12).
# Both "Jun" and "June" are accepted: Series.map() silently produces NaN for
# any key that is missing from the table, so the standard three-letter
# abbreviation is included alongside the longer spelling.
months = {
    "Jan": 1,
    "Feb": 2,
    "Mar": 3,
    "Apr": 4,
    "May": 5,
    "Jun": 6,
    "June": 6,
    "Jul": 7,
    "Aug": 8,
    "Sep": 9,
    "Oct": 10,
    "Nov": 11,
    "Dec": 12
}

# Visitor category -> arbitrary integer label. The numbers are identifiers
# only; their order carries no meaning.
visitor_types = {
    "Returning_Visitor": 1,
    "New_Visitor": 2,
    "Other": 3
}

In [4]:
# Encode the string columns as integers using the lookup tables.
#
# Series.map() returns NaN for every value that is not a key of the table.
# That has two consequences this cell guards against:
#   * re-running the cell on already-encoded (numeric) columns would turn
#     every value into NaN, so numeric columns are left untouched;
#   * an unexpected category in the data would silently become NaN, so the
#     result is checked and the cell fails loudly instead.
for col_name, lookup in (('Month', months), ('VisitorType', visitor_types)):
    if not pd.api.types.is_numeric_dtype(df[col_name]):
        df[col_name] = df[col_name].map(lookup)
    n_unmapped = int(df[col_name].isna().sum())
    assert n_unmapped == 0, f"{n_unmapped} unmapped value(s) in column {col_name!r}"

# Weekend and Revenue are currently left as booleans (the conversion below is
# disabled). To turn them into 1s and 0s, uncomment the two lines:
# https://stackoverflow.com/questions/17383094/how-can-i-map-true-false-to-1-0-in-a-pandas-dataframe
# df["Weekend"] = df["Weekend"].astype(int)
# df["Revenue"] = df["Revenue"].astype(int)

print(df.head(3))

   Administrative  Administrative_Duration  Informational  \
0               3               142.500000              0   
1               6               437.391304              2   
2               1                41.125000              0   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                    0.00              48              1052.255952   
1                  235.55              83              2503.881781   
2                    0.00             126              4310.004668   

   BounceRates  ExitRates  PageValues  SpecialDay  Month  OperatingSystems  \
0     0.004348   0.013043    0.000000         0.0     11                 1   
1     0.002198   0.004916    2.086218         0.0      3                 2   
2     0.000688   0.012823    3.451072         0.0     11                 2   

   Browser  Region  TrafficType  VisitorType  Weekend  Revenue  
0        8       6           11            1    False    False  
1        2       3            2

### Now our data is formatted, let's get silly

We are here to predict the *Revenue* column: if it is true (1), the user made a purchase. We will predict it using linear regression.
Let's start by implementing functions we're familiar with, and that are specified in the assignment.

We will start with the building-block functions. Normalization is a good place to start, as some of the data is kinda wacky (the features sit on very different scales).


In [8]:
"""
These are three common scaling methods used in machine learning (also the three specified in the assignment, lolz).
Scaling is important because it not only speeds up the training process, but also reduces bias toward features that happen to have larger numeric ranges.
"""

# pep 3107 i love you
def z_score_scaling(X: np.ndarray) -> np.ndarray:
    """
    Standardize features with the z-score formula: (x - mean) / std.

    Statistics are computed per feature (along axis 0), so a 2-D input of
    shape (samples, features) is scaled column by column, and a 1-D input
    (e.g. a single DataFrame column) is scaled as one feature.

    Parameters:
        X: array-like of numeric values, 1-D or 2-D with samples along axis 0.

    Returns:
        An object of the same shape as X in which every feature has mean 0
        and (population, ddof=0) standard deviation 1. A constant feature
        has zero standard deviation; it is returned as all zeros instead of
        NaN/inf.
    """
    # per-feature statistics; axis=0 keeps columns independent of each other
    means = np.mean(X, axis=0)
    stds = np.std(X, axis=0)    # population standard deviation (ddof=0)

    # avoid dividing by zero for constant features
    safe_stds = np.where(stds == 0, 1.0, stds)

    # apply the z-score scaling formula
    X_scaled = (X - means) / safe_stds
    return X_scaled

def min_max_scaling(X: np.ndarray) -> np.ndarray:
    """
    Rescale features to the [0, 1] range with the min-max formula:
    (x - min) / (max - min).

    The minimum and maximum are taken per feature (along axis 0), so a 2-D
    input of shape (samples, features) is scaled column by column, and a
    1-D input is scaled as one feature.

    Parameters:
        X: array-like of numeric values, 1-D or 2-D with samples along axis 0.

    Returns:
        An object of the same shape as X where, for each feature, the former
        minimum is 0 and the former maximum is 1. A constant feature
        (max == min) is returned as all zeros instead of NaN/inf.
    """
    # per-feature extremes; axis=0 keeps columns independent of each other
    mins = np.min(X, axis=0)
    maxes = np.max(X, axis=0)

    # avoid dividing by zero for constant features
    ranges = maxes - mins
    safe_ranges = np.where(ranges == 0, 1.0, ranges)

    # apply the min-max scaling formula
    X_scaled = (X - mins) / safe_ranges

    return X_scaled

def mean_normalization(X: np.ndarray) -> np.ndarray:
    """
    Center features by subtracting each feature's mean: x - mean.

    The mean is computed per feature (along axis 0), so a 2-D input of shape
    (samples, features) is centered column by column, and a 1-D input is
    centered as one feature.

    Parameters:
        X: array-like of numeric values, 1-D or 2-D with samples along axis 0.

    Returns:
        An object of the same shape as X in which every feature has mean 0.
        The spread of each feature is left unchanged.

    NOTE(review): this only centers the data. The textbook "mean
    normalization" formula additionally divides by (max - min); confirm
    which variant the assignment expects before relying on this.
    """
    # per-feature means; axis=0 keeps columns independent of each other
    means = np.mean(X, axis=0)

    # subtract the means from each feature
    X_scaled = X - means
    return X_scaled


# Print summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of df.
# NOTE(review): presumably used to decide which columns need scaling — confirm.
print(df.describe())

       Administrative  Administrative_Duration  Informational  \
count     5000.000000              5000.000000    5000.000000   
mean         2.295000                79.828436       0.505600   
std          3.329954               178.029543       1.303652   
min          0.000000                 0.000000       0.000000   
25%          0.000000                 0.000000       0.000000   
50%          1.000000                 6.000000       0.000000   
75%          4.000000                93.700000       0.000000   
max         24.000000              3398.750000      24.000000   

       Informational_Duration  ProductRelated  ProductRelated_Duration  \
count             5000.000000     5000.000000              5000.000000   
mean                35.912869       31.885000              1197.255606   
std                151.439339       45.036099              2100.700466   
min                  0.000000        0.000000                 0.000000   
25%                  0.000000        7.00000

In [18]:
# Continuous features to rescale. Binary and categorical columns are
# deliberately left out of this list.
columns_to_normalize = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates',
    'PageValues', 'SpecialDay',
]

# !!!! Swap the scaling function called here to try a different
# normalization method, as specified in the writeup.
for column in columns_to_normalize:
    scaled_column = z_score_scaling(df[column])
    df[column] = scaled_column

# Quick look at the first rows to confirm the scaling was applied.
print(df.head(3))

   Administrative  Administrative_Duration  Informational  \
0        0.211736                 0.352064      -0.387872   
1        1.112739                 2.008648       1.146433   
2       -0.388933                -0.217421      -0.387872   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0               -0.237167        0.357860                -0.069031   
1                1.318397        1.135092                 0.622058   
2               -0.237167        2.089977                 1.481915   

   BounceRates  ExitRates  PageValues  SpecialDay  Month  OperatingSystems  \
0    -0.376478  -0.619388   -0.314427   -0.313118     11                 1   
1    -0.419455  -0.782413   -0.200199   -0.313118      3                 2   
2    -0.449638  -0.623814   -0.125468   -0.313118     11                 2   

   Browser  Region  TrafficType  VisitorType  Weekend  Revenue  
0        8       6           11            1    False    False  
1        2       3            2

# now our data is normalized and ready to go, let's start moving into the ml aspect

We want to predict the revenue, which will be our y, so let's start by splitting the data there.

In [19]:
# Separate the features (X) from the prediction target (y).
X = df.drop('Revenue', axis=1)  # axis=1 drops a column, axis=0 would drop a row
y = df['Revenue']

# Plain 80/20 hold-out split (no cross-validation): the first 80% of the rows
# are used for training and the remaining 20% for testing.
# The split is purely positional and does not shuffle.
# NOTE(review): this assumes the rows are not ordered in a way that correlates
# with the target — confirm, or shuffle before splitting.
split_80 = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_80], X.iloc[split_80:]
y_train, y_test = y.iloc[:split_80], y.iloc[split_80:]

# Keep the original lowercase name so any code that refers to `x_test`
# continues to work; `X_test` matches the naming of `X_train`.
x_test = X_test

# Sanity check: every row ends up in exactly one of the two sets.
assert len(X_train) + len(X_test) == len(X)
assert len(y_train) + len(y_test) == len(y)