<a href="https://colab.research.google.com/github/patakrob/scrape/blob/main/O'Reilly_Class_Logistic_Regression_and_Regularization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import Libraries

In [None]:
import numpy as np
import pandas as pd
import pandas_datareader.data as pdr
from datetime import datetime

import matplotlib.pyplot as plt
plt.style.use('seaborn')

#Import Data

In [None]:
start = datetime(1982, 1, 1)
end = datetime(2020, 2, 29)

recession = pdr.DataReader('USREC', 'fred', start, end) #NBER business cycle classification
yield_curve = pdr.DataReader('T10Y3MM', 'fred', start, end) #Difference between the 3 month and 10 year treasury yields
unemployment = pdr.DataReader('UNRATE', 'fred', start, end) #Unemployment rate
industrial_capacity = pdr.DataReader('TCU', 'fred', start, end) #Total industrial capacity utilization

# Build and Train Model

In [None]:
#Create target dataframe
target = recession[1:] #Align target and feature rows
target.head()

Unnamed: 0_level_0,USREC
DATE,Unnamed: 1_level_1
1982-02-01,1
1982-03-01,1
1982-04-01,1
1982-05-01,1
1982-06-01,1


In [None]:
#Percentage of time the US economy was in recession since 1982
round(target['USREC'].sum()/target['USREC'].count()*100, 2)

9.63

In [None]:
#Create features dataframe
features = pd.DataFrame()
features['curve'] = yield_curve['T10Y3MM'].diff() #Difference between the yields of the 3 month bill and the 10 year note
features['unemployment'] = unemployment['UNRATE'].diff()
features['industrial'] = industrial_capacity['TCU'].diff()
features = features.dropna()
features.head()

Unnamed: 0_level_0,curve,unemployment,industrial
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1982-02-01,-1.52,0.3,1.3662
1982-03-01,0.4,0.1,-0.6701
1982-04-01,-0.02,0.3,-0.8362
1982-05-01,0.38,0.1,-0.632
1982-06-01,0.31,0.2,-0.3165


In [None]:
#Create logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

#Need to covert column vector into a 1-d Numpy array
target = np.ravel(target)

classifier = LogisticRegression(random_state = 0)

scaler = StandardScaler() #Scales input data so that it has a mean of zero and variance of one
features_standardized = scaler.fit_transform(features) #Every feature now has zero mean and unit variance

#Split dataset into train and test subsets. Test size is 25% of the total dataset
from sklearn.model_selection import train_test_split

features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.25, random_state=0)
classifier.fit(features_train, target_train)
print("Model coefficients:", classifier.coef_)


Model coefficients: [[ 1.1124784   3.12157688 -1.93397252]]


#Evaluate and Predict

In [None]:
print("Training score:", classifier.score(features_train, target_train))
print("Testing score:", classifier.score(features_test, target_test))

Training score: 0.9181286549707602
Testing score: 0.9304347826086956


In [None]:
#Use out-of-sample March data to predict recession
new = [[0.61, 0.9, -4.2351]]
classifier.predict(new)

  "X does not have valid feature names, but"


array([1])

In [None]:
#Quantify probability of recession
classifier.predict_proba(new).round(2)


array([[0., 1.]])

#Ridge Regression/L2 Regularization

In [None]:
 #Regularize logistic regression model with C hyperparameter. Reducing C increases regularization since it is the reciprocal of alpha. 
regularized_classifier2 = LogisticRegression(penalty='l2', C=0.1, random_state = 0) #L1 penalty is Lasso regression and L2 penalty is ridge regression

regularized_classifier2.fit(features_train, target_train)
print("Model coefficients:", regularized_classifier2.coef_)

Model coefficients: [[ 0.23390754  0.5814208  -1.02668254]]


In [None]:
print("Training score:", regularized_classifier2.score(features_train, target_train))
print("Testing score:", regularized_classifier2.score(features_test, target_test))

Training score: 0.9064327485380117
Testing score: 0.9043478260869565


#Lasso Regression/L1 Regularization

In [None]:
#Regularize logistic regression model with C hyperparameter. Reducing C increases regularization since it is the reciprocal of alpha. 
regularized_classifier1 = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, random_state = 0) #L1 penalty is Lasso regression and is not supported by the default solver

regularized_classifier1.fit(features_train, target_train)
print("Model coefficients:", regularized_classifier1.coef_)

Model coefficients: [[ 0.        0.       -0.944083]]


In [None]:
print("Training score:", regularized_classifier1.score(features_train, target_train))
print("Testing score:", regularized_classifier1.score(features_test, target_test))

Training score: 0.9093567251461988
Testing score: 0.9043478260869565


# Model Evaluation

In [None]:
#Use K-fold cross validation (default folds = 5, default scoring metric = accuracy) 
from sklearn.model_selection import cross_val_score

score = cross_val_score(classifier, features_standardized, target)
score.mean(), score.std()

(0.9278786430960345, 0.03551943709272783)

In [None]:
#Use accuracy = (TP + TN)/(TP+TN+FP+FN) as the evaluation metric
cross_val_score(classifier, features_standardized, target, scoring = "accuracy")

array([0.93478261, 0.88043478, 0.92307692, 0.91208791, 0.98901099])

In [None]:
#Use precision = TP/(TP+FP) as the evaluation metric
cross_val_score(classifier, features_standardized, target, scoring = "precision")

array([0.8 , 0.25, 0.75, 0.6 , 1.  ])

In [None]:
#Use Sensitivity = TP/(TP + FN) as the evaluation metric
cross_val_score(classifier, features_standardized, target, scoring = "recall")

array([0.44444444, 0.11111111, 0.33333333, 0.33333333, 0.875     ])

In [None]:
#Use F measure = 2*(precision*recall/(precision+recall)) as the evaluation metric
cross_val_score(classifier, features_standardized, target, scoring = "f1")

array([0.57142857, 0.15384615, 0.46153846, 0.42857143, 0.93333333])