<a href="https://colab.research.google.com/github/nikcook152/NFKPools/blob/main/Analytics_and_Big_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Insights

In [79]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime
Not connected to a GPU


#Setup

### Library Import 

In [4]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import warnings filter
from warnings import simplefilter
# ignore warnings
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

### Data Import

In [48]:
data_ori = pd.read_csv('https://drive.google.com/u/0/uc?id=1KukM45VgI1kLQwtVdGvAuex7DT6TFf12&export=download')
#print(data)

## Preprocessing

### Norminal Features

#### Gender

In [49]:
data_ori = data_ori.replace({'F':0, 'M':1})

#### Occupation

In [51]:
occupation = pd.get_dummies(data_ori['Occupation'])           #create dummies
occupation = occupation.drop(columns = [20])                  #prevent perfect correlation occupation.columns[20], axis=1
data_ori = pd.concat([data_ori, occupation], axis=1)          #merge
data_ori = data_ori.drop(columns = ['Occupation'])            #delete old Occupation

#### City_Category

In [None]:
city_cat = pd.get_dummies(data_ori['City_Category'])        #create dummies
city_cat = city_cat.drop(columns = ['C'])                   #prevent perfect correlation
data_ori = pd.concat([data_ori, city_cat], axis=1)          #merge
data_ori = data_ori.drop(columns = ['City_Category'])       #delete old Occupation

#### Maritial_Status
This Feature is already correctly formatted, changing just the naming to make it more clear

In [None]:
data_ori = data_ori.rename(columns={'Marital_Status': 'Married'})

### Ordinal Features

#### Age

In [57]:
data_ori['Age'] = data_ori['Age'].replace({'0-17':  1, 
                                           '18-25': 2,
                                           '26-35': 3,
                                           '36-45': 4,
                                           '46-50': 5,
                                           '51-55': 6,
                                           '55+': 7})

#### Stay_In_Current_City_Years

In [72]:
data_ori['Stay_In_Current_City_Years'] = data_ori['Stay_In_Current_City_Years'].replace({'4+': 4})
data_ori['Stay_In_Current_City_Years'] = pd.to_numeric(data_ori['Stay_In_Current_City_Years'])      #turn strings into integer

### Other Preprocessing

#### Clean Product_ID

In [69]:
data_ori['Product_ID'] = data_ori['Product_ID'].replace({'P':''}, regex=True)   #Delete leading P's
data_ori['Product_ID'] = pd.to_numeric(data_ori['Product_ID'])                  #turn strings into integer

In [None]:
print(data_ori.head())
print(data_ori.dtypes)

# Other Rossbach stuff

In [78]:

print("---Data ori Shape---")
print(data_ori.shape)
# types
print("---Data ori DTypes---")
print(data_ori.dtypes)
# feature names
print("---Data ori Feature Names---")
print(list(data_ori))
# head
print("---Data ori Heads---")
print(data_ori.head(6))
# descriptions
print("---Data ori Descriptions---")
print(data_ori.describe())

# standardize data = (data_ori-data_ori.mean())/data_ori.std()
data=(data_ori-data_ori.min())/(data_ori.max()-data_ori.min())
print("---Standardizing---")
print(data.head(6))

X = data.drop('Purchase', axis = 1)
Y = data['Purchase']

from sklearn.model_selection import train_test_split
training_set_percentage = 0.01 #@param {type:"slider", min:0, max:1, step:0.01}
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 1-training_set_percentage, random_state=0)
print("---Train Shape---")
print(Y_train.shape)
print("---Test Shape---")
print(Y_test.shape)
print(len(Y_test))

Y_train_mean = Y_train.mean()
print("Y_train_mean =", Y_train_mean)
Y_train_meandev = sum((Y_train-Y_train_mean)**2)
print("Y_train_meandev =", Y_train_meandev)
Y_test_meandev = sum((Y_test-Y_train_mean)**2)
print("Y_test_meandev =", Y_test_meandev)

# create report dataframe
report = pd.DataFrame(columns=['Model','R2.Train','R2.Test','R2_Mean_CV','R2_Std_CV'])

---Data ori Shape---
(550068, 32)
---Data ori DTypes---
User_ID                         int64
Product_ID                      int64
Gender                          int64
Age                             int64
Stay_In_Current_City_Years      int64
Married                         int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
0                               uint8
1                               uint8
2                               uint8
3                               uint8
4                               uint8
5                               uint8
6                               uint8
7                               uint8
8                               uint8
9                               uint8
10                              uint8
11                              uint8
12                              uint8
13                              uint8
14                              

# Models

### OLS

In [None]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, Y_train)
Y_train_pred = lm.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = lm.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['OLS RegressionCV', r2, pseudor2, "", ""]

# OLS with Cross Validation and Grid Search
from sklearn.linear_model import LinearRegression
lmCV = LinearRegression()
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'fit_intercept':[True,False]
}
CV_olsmodel = GridSearchCV(estimator=lmCV, param_grid=param_grid, cv=10)
CV_olsmodel.fit(X_train, Y_train)
print(CV_olsmodel.best_params_)
lmCV = lmCV.set_params(**CV_olsmodel.best_params_)
lmCV.fit(X_train, Y_train)
Y_train_pred = lmCV.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = lmCV.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['OLS RegressionCV', r2, pseudor2,
           CV_olsmodel.cv_results_['mean_test_score'][CV_olsmodel.best_index_],
           CV_olsmodel.cv_results_['std_test_score'][CV_olsmodel.best_index_]]

R2 = 0.15527827172868602
Pseudo-R2 = 0.1551245275492117
{'fit_intercept': True}
R2 = 0.15527827172868602
Pseudo-R2 = 0.1551245275492117


### Ridge Regression

In [None]:
from sklearn.linear_model import Ridge
ridgereg = Ridge(alpha=0.5)
ridgereg.fit(X_train, Y_train)
Y_train_pred = ridgereg.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = ridgereg.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)

# find best lambda (alphas)
from sklearn.linear_model import Ridge
ridgeregCV = Ridge()
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'alpha': [25,10,4,3,2,1.0,0.8,0.5,0.3,0.2,0.1,0.05,0.02,0.01,0]
}
CV_rrmodel = GridSearchCV(estimator=ridgeregCV, param_grid=param_grid, cv=10)
CV_rrmodel.fit(X_train, Y_train)
print(CV_rrmodel.best_params_)
ridgeregCV = ridgeregCV.set_params(**CV_rrmodel.best_params_)
ridgeregCV.fit(X_train, Y_train)
Y_train_pred = ridgeregCV.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = ridgeregCV.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['Ridge RegressionCV', r2, pseudor2,
           CV_rrmodel.cv_results_['mean_test_score'][CV_rrmodel.best_index_],
           CV_rrmodel.cv_results_['std_test_score'][CV_rrmodel.best_index_]]

R2 = 0.15527827167402486
Pseudo-R2 = 0.1551245398235217
{'alpha': 3}
R2 = 0.15527826976211256
Pseudo-R2 = 0.1551245995429037


### Support Vector Regression

In [None]:
# linear kernel
print("linear kernel")
from sklearn.svm import SVR
LinSVRreg = SVR(kernel='linear', C=1.0, epsilon=0.1)
LinSVRreg.fit(X_train, Y_train)
Y_train_pred = LinSVRreg.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = LinSVRreg.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)

# radial kernel
print("radial kernel")
RbfSVRreg = SVR(kernel='rbf', C=1.0, epsilon=0.1)
RbfSVRreg.fit(X_train, Y_train)
Y_train_pred = RbfSVRreg.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = RbfSVRreg.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)

from sklearn.svm import SVR
RbfSVRregCV = SVR()
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'kernel': ["rbf"],                                                          #["linear", "rbf"]
    'C': [1],                                                                   #[1, 3, 5, 8, 10] 
    'epsilon': [0.1],                                                           #[0.0, 0.025, 0.05, 0.075, 0.1]
    'gamma' : [0., 1.]                                                           #[0., 1., 2., 3., 4.]
}
CV_svrmodel = GridSearchCV(estimator=RbfSVRregCV, param_grid=param_grid, cv=10)
CV_svrmodel.fit(X_train, Y_train)
print(CV_svrmodel.best_params_)
RbfSVRregCV = RbfSVRregCV.set_params(**CV_svrmodel.best_params_)
RbfSVRregCV.fit(X_train, Y_train)
Y_train_pred = RbfSVRregCV.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = RbfSVRregCV.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['Support Vector RegressionCV', r2, pseudor2,
           CV_svrmodel.cv_results_['mean_test_score'][CV_svrmodel.best_index_],
           CV_svrmodel.cv_results_['std_test_score'][CV_svrmodel.best_index_]]

linear kernel
R2 = 0.13385207970627333


ValueError: ignored

## Neural Networks

In [None]:
from sklearn.neural_network import MLPRegressor
NNetRregCV = MLPRegressor(solver='lbfgs', random_state=0)
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'hidden_layer_sizes': [(5,), (8,), (10,), (13,)],
    'alpha': [0.0, 0.0025, 0.005, 0.0075, 0.01, 0.1],
    'activation': ["logistic", "tanh"]
}
CV_nnmodel = GridSearchCV(estimator=NNetRregCV, param_grid=param_grid, cv=10)
CV_nnmodel.fit(X_train, Y_train)
print(CV_nnmodel.best_params_)
NNetRregCV = NNetRregCV.set_params(**CV_nnmodel.best_params_)
NNetRregCV.fit(X_train, Y_train)
Y_train_pred = NNetRregCV.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = NNetRregCV.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['Neural NetworkCV', r2, pseudor2,
           CV_nnmodel.cv_results_['mean_test_score'][CV_nnmodel.best_index_],
           CV_nnmodel.cv_results_['std_test_score'][CV_nnmodel.best_index_]]

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
RForregCV = RandomForestRegressor(random_state=0)
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'max_depth': [ 4.,  5.,  6.,  7.,  8.],
    'n_estimators': [ 10,  50,  100, 150, 200]
}
CV_rfmodel = GridSearchCV(estimator=RForregCV, param_grid=param_grid, cv=10)
CV_rfmodel.fit(X_train, Y_train)
print(CV_rfmodel.best_params_)
RForregCV = RForregCV.set_params(**CV_rfmodel.best_params_)
RForregCV.fit(X_train, Y_train)
Y_train_pred = RForregCV.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = RForregCV.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['Random ForestCV', r2, pseudor2,
           CV_rfmodel.cv_results_['mean_test_score'][CV_rfmodel.best_index_],
           CV_rfmodel.cv_results_['std_test_score'][CV_rfmodel.best_index_]]

{'max_depth': 10.0, 'n_estimators': 400}
R2 = 0.7568164674987872
Pseudo-R2 = 0.6740633040793843


## KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor
knnmodel = KNeighborsRegressor(n_neighbors=7)
knnmodel.fit(X_train, Y_train)
Y_train_pred = knnmodel.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = knnmodel.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)

from sklearn.neighbors import KNeighborsRegressor
knnmodelCV = KNeighborsRegressor()
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'n_neighbors': range(3, 22, 2),
}
CV_knnmodel = GridSearchCV(estimator=knnmodelCV, param_grid=param_grid, cv=10)
CV_knnmodel.fit(X_train, Y_train)
print(CV_knnmodel.best_params_)
knnmodelCV = knnmodelCV.set_params(**CV_knnmodel.best_params_)
knnmodelCV.fit(X_train, Y_train)
Y_train_pred = knnmodelCV.predict(X_train)
Y_train_dev = sum((Y_train-Y_train_pred)**2)
r2 = 1 - Y_train_dev/Y_train_meandev
print("R2 =", r2)
Y_test_pred = knnmodelCV.predict(X_test)
Y_test_dev = sum((Y_test-Y_test_pred)**2)
pseudor2 = 1 - Y_test_dev/Y_test_meandev
print("Pseudo-R2 =", pseudor2)
report.loc[len(report)] = ['k-Nearest NeighborsCV', r2, pseudor2,
           CV_knnmodel.cv_results_['mean_test_score'][CV_knnmodel.best_index_],
           CV_knnmodel.cv_results_['std_test_score'][CV_knnmodel.best_index_]]

# Final Report

In [None]:
print(report)

NameError: ignored