# Prediction

In [None]:
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np
from plotnine import *

from sklearn.linear_model import LogisticRegression # Logistic Regression Model
from sklearn.preprocessing import StandardScaler #Z-score variables
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.model_selection import train_test_split # simple TT split cv
from sklearn.model_selection import KFold # k-fold cv
from sklearn.model_selection import LeaveOneOut #LOO cv
from sklearn.model_selection import cross_val_score # cross validation metrics
from sklearn.model_selection import cross_val_predict # cross validation metrics

## Framework
1. Model
2. Fit
3. Predict

In [None]:
# Load the fashion-subscription data (remote CSV) and preview the first rows
FASHION_BIG_URL = (
    "https://raw.githubusercontent.com/cmparlettpelleriti/"
    "CPSC392ParlettPelleriti/master/Data/SKP_fashionBIG.csv"
)
fashionBIG = pd.read_csv(FASHION_BIG_URL)
fashionBIG.head()

Unnamed: 0,age,income,months_subbed,upgrade
0,22,55.89,14,0
1,32,86.03,57,0
2,38,49.22,37,1
3,14,92.71,51,1
4,33,94.06,37,0


In [None]:
# Columns used as model inputs; "upgrade" is the outcome column being predicted
predictors = ["age", "income", "months_subbed"]

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split, and every metric computed from it, is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(
    fashionBIG[predictors],
    fashionBIG["upgrade"],
    test_size=0.2,
    random_state=42,
)

# Learn the z-score parameters from the training rows only, then apply that
# same transformation to both the training and the test rows.
zscore = StandardScaler()
zscore.fit(X_train)
Xz_train = zscore.transform(X_train)
Xz_test = zscore.transform(X_test)

# Last expression of the cell so the preview is actually rendered
X_train.head()

In [None]:
# Unregularized logistic regression. `penalty=None` replaces the string "none",
# which scikit-learn deprecated in 1.2 and rejects from 1.4 onward.
myLogit = LogisticRegression(penalty=None)  # create

In [None]:
# Fit on the z-scored training predictors and the training labels
myLogit.fit(X=Xz_train, y=y_train)

LogisticRegression(penalty='none')

In [None]:
# Predicted class labels for the held-out (z-scored) test rows
predictedVals = myLogit.predict(X=Xz_test)

In [None]:
# Share of test rows whose predicted label equals the true label
accuracy_score(y_true=y_test, y_pred=predictedVals)

0.605

In [None]:
# Counts of true vs. predicted labels on the test rows
# (scikit-learn convention: rows are true classes, columns are predicted classes)
confusion_matrix(y_true=y_test, y_pred=predictedVals)

array([[ 14,  66],
       [ 13, 107]])

## LR Coef interpretation


In [None]:
# Tabulate the fitted parameters: one row per predictor plus a final row for
# the intercept. The table is built in a single construction because
# DataFrame.append was removed in pandas 2.0.
# The model was fit on z-scored predictors, so each coefficient is the change
# in log-odds for a one-standard-deviation increase in that predictor.
coef = pd.DataFrame({
    "Coefs": np.append(myLogit.coef_[0], myLogit.intercept_[0]),
    "Names": [*predictors, "intercept"],
})
coef

Unnamed: 0,Coefs,Names
0,0.433659,age
1,-0.024124,income
2,0.022078,months_subbed
3,0.310342,intercept


In [None]:
# Exponentiate the log-odds coefficients to put them on the odds scale
# (for the intercept row this is the baseline odds rather than an odds ratio)
coef = coef.assign(**{"Odds Coefs": np.exp(coef["Coefs"])})
coef

Unnamed: 0,Coefs,Names,Odds Coefs
0,0.433659,age,1.542893
1,-0.024124,income,0.976164
2,0.022078,months_subbed,1.022324
3,0.310342,intercept,1.363892


## LR different thresholds


In [None]:
# Load the new customers to score with the already-fitted model
fashionNEW = pd.read_csv("https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/SKP_fashionNEW.csv")

# Select the predictors by name (same list and order the scaler was fit on)
# instead of by column position, so a reordered or extended CSV cannot
# silently feed the wrong columns into the model.
Xnew = fashionNEW[predictors]
# Reuse the scaler fitted on the training rows; do not refit on new data
Xnewz = zscore.transform(Xnew)

In [None]:
# Predicted class probabilities for the new rows: one row per observation,
# one column per class
Ypred_prob = myLogit.predict_proba(X=Xnewz)
# Preview rows 1 through 9 (the slice starts at index 1, so row 0 is not shown)
Ypred_prob[1:10, :]

array([[0.33352391, 0.66647609],
       [0.51058513, 0.48941487],
       [0.48671225, 0.51328775],
       [0.38736389, 0.61263611],
       [0.43731095, 0.56268905],
       [0.49350338, 0.50649662],
       [0.42049844, 0.57950156],
       [0.34288072, 0.65711928],
       [0.59666517, 0.40333483]])

In [None]:
# Keep only the second probability column
# (presumably P(upgrade = 1), given 0/1 labels - confirm with myLogit.classes_)
Ypred_prob1 = Ypred_prob[..., 1]
# Preview elements 1 through 99 (element 0 is not shown)
Ypred_prob1[1:100]

array([0.66647609, 0.48941487, 0.51328775, 0.61263611, 0.56268905,
       0.50649662, 0.57950156, 0.65711928, 0.40333483, 0.61200779,
       0.60617051, 0.34096115, 0.53596495, 0.50472389, 0.50366817,
       0.36774573, 0.5639286 , 0.7543641 , 0.59683045, 0.55232896,
       0.5772686 , 0.62792199, 0.61281989, 0.40396542, 0.66434867,
       0.58509306, 0.53827871, 0.61374145, 0.55089943, 0.55508281,
       0.49800652, 0.44133272, 0.5791442 , 0.45438523, 0.3577695 ,
       0.76940625, 0.67256025, 0.60801952, 0.34885152, 0.55530006,
       0.72217162, 0.57143869, 0.7460572 , 0.53706677, 0.60180222,
       0.59056411, 0.69688744, 0.64757569, 0.58100633, 0.54584336,
       0.63367889, 0.66643261, 0.65575346, 0.60943103, 0.4216202 ,
       0.55249512, 0.7102608 , 0.64387191, 0.53848637, 0.5892712 ,
       0.70687109, 0.58040648, 0.63726799, 0.51695323, 0.61262162,
       0.68954718, 0.65447797, 0.75650424, 0.60437214, 0.49803398,
       0.72278565, 0.33622437, 0.4540138 , 0.43607022, 0.73422

In [None]:
# Custom decision threshold in place of the 0.5 cutoff that predict() applies
thresh = 0.3

# Label a row 1 when its probability is strictly above the threshold, else 0
Ypred_prob1_thresh = np.where(Ypred_prob1 > thresh, 1, 0)
# Preview elements 1 through 99 (element 0 is not shown)
Ypred_prob1_thresh[1:100]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
# Accuracy of the thresholded predictions against the true labels in the new data
accuracy_score(y_true=fashionNEW["upgrade"], y_pred=Ypred_prob1_thresh)

0.604

## LR with Cross Validation

In [None]:
# K-fold cross-validation setup

# Same predictor list as the earlier train/test split, so the two analyses
# cannot drift apart
X = fashionBIG[predictors]
y = fashionBIG["upgrade"]

# 5 folds. kf.split(X) is lazy and is consumed by the loop in the next cell;
# calling it here and discarding the result would do nothing.
kf = KFold(n_splits = 5)

lr = LogisticRegression() #create model

acc = [] #create empty list to store accuracy for each fold

In [None]:
# Train and score one model per fold, collecting each fold's accuracy in acc.
# Note: the predictors are used unscaled here, unlike the z-scored fits above.

# Reset in this cell as well, so re-running it does not keep appending to the
# results of a previous run.
acc = []

for train_indices, test_indices in kf.split(X):
    # Fold-local names on purpose: reusing X_train / y_train / X_test / y_test
    # would overwrite the earlier train/test split, and later cells that call
    # fit(Xz_train, y_train) would then pair features and labels that come
    # from different rows.
    X_fold_train = X.iloc[train_indices]
    X_fold_test  = X.iloc[test_indices]
    # kf.split yields positional indices, so index y by position (.iloc);
    # plain y[...] is label-based and only matches on a default RangeIndex.
    y_fold_train = y.iloc[train_indices]
    y_fold_test  = y.iloc[test_indices]

    # model (fit returns the estimator itself, refit from scratch each fold)
    model = lr.fit(X_fold_train, y_fold_train)
    # record accuracy on this fold's held-out rows
    acc.append(accuracy_score(y_fold_test, model.predict(X_fold_test)))

# per-fold accuracies, then their mean as the cell's displayed value
print(acc)
np.mean(acc)

[0.58, 0.605, 0.54, 0.565, 0.645]


0.587

## Regularization

In [None]:
# Default Regularization
# (LogisticRegression applies a penalty unless told otherwise; see the docs)
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

myLogit = LogisticRegression()
myLogit.fit(X=Xz_train, y=y_train)

# Slopes first, then the intercept, for comparison with the unpenalized fit
for fitted_params in (myLogit.coef_, myLogit.intercept_):
    print(fitted_params)

[[-0.06250222 -0.01314886  0.09062657]]
[0.27249525]


In [None]:
# Same fit with regularization switched off, for comparison with the default.
# `penalty=None` replaces the string "none", which scikit-learn deprecated in
# 1.2 and rejects from 1.4 onward.
myLogit2 = LogisticRegression(penalty=None) #create
myLogit2.fit(Xz_train,y_train) #fit

print(myLogit2.coef_)
print(myLogit2.intercept_)

[[-0.06281615 -0.01320505  0.09108865]]
[0.27250372]
