# Logistic Regression

In [15]:
# Load the example data set (New York restaurants and whether the Michelin guide lists them)
import pandas as pd

MICHELIN_CSV_URL = "http://gattonweb.uky.edu/sheather/book/docs/datasets/MichelinNY.csv"
data = pd.read_csv(MICHELIN_CSV_URL, encoding="latin_1")
display(data.head())

# Prepare for the train/test split: the Restaurant Name column is dropped
keep_columns = data.columns != 'Restaurant Name'
data = data.loc[:, keep_columns]

y = data['InMichelin']  # target: whether or not a restaurant is in the Michelin guide
X = data.drop(columns='InMichelin')  # features: every remaining column

Unnamed: 0,InMichelin,Restaurant Name,Food,Decor,Service,Price
0,0,14 Wall Street,19,20,19,50
1,0,212,17,17,16,43
2,0,26 Seats,23,17,21,35
3,1,44,19,23,16,52
4,0,A,23,12,19,24


In [18]:
# Set up training and test data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Fixing random_state makes the split reproducible, so the example gives the same numbers on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Start with a non-penalized logit (penalty=None); 'l1' and 'l2' are the other options
logreg = LogisticRegression(penalty=None)
logreg.fit(X_train, y_train)

print("logreg.coef_: {}".format(logreg.coef_))

# score() reports mean accuracy on the data it is given
train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)
print("Training set score: {:.3f}".format(train_accuracy))
print("Test set score: {:.3f}".format(test_accuracy))

predicted_vals = logreg.predict(X_test)  # predicted class label for each test row
print("logreg.predict: {}".format(predicted_vals))

logreg.coef_: [[ 0.3817785   0.07436958 -0.15689     0.08189899]]
Training set score: 0.797
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]


In [19]:
# The class predictions above come from these raw probabilities:
# each row holds one test observation, with one probability column per class
predicted_prob = logreg.predict_proba(X_test)
proba_report = "logreg.predict_proba: {}".format(predicted_prob)
print(proba_report)

logreg.predict_proba: [[0.85255621 0.14744379]
 [0.8446288  0.1553712 ]
 [0.67683217 0.32316783]
 [0.13743201 0.86256799]
 [0.86699615 0.13300385]
 [0.86173679 0.13826321]
 [0.83068042 0.16931958]
 [0.89753411 0.10246589]
 [0.53906147 0.46093853]
 [0.79534094 0.20465906]
 [0.77504428 0.22495572]
 [0.86070764 0.13929236]
 [0.73355788 0.26644212]
 [0.10933611 0.89066389]
 [0.00766565 0.99233435]
 [0.92517214 0.07482786]
 [0.85073211 0.14926789]
 [0.89511686 0.10488314]
 [0.72394995 0.27605005]
 [0.1360114  0.8639886 ]
 [0.63012902 0.36987098]
 [0.88145938 0.11854062]
 [0.03744012 0.96255988]
 [0.78424495 0.21575505]
 [0.90184835 0.09815165]
 [0.8390162  0.1609838 ]
 [0.82970658 0.17029342]
 [0.80844509 0.19155491]
 [0.86331046 0.13668954]
 [0.11763491 0.88236509]
 [0.55370252 0.44629748]
 [0.08942718 0.91057282]
 [0.31926949 0.68073051]
 [0.52149073 0.47850927]
 [0.73736448 0.26263552]
 [0.78091594 0.21908406]
 [0.58705592 0.41294408]
 [0.0075136  0.9924864 ]
 [0.1653582  0.8346418 ]
 [0

## Logistic Regression in the statsmodels package

In [21]:
import statsmodels.api as sm

# statsmodels does not add an intercept on its own, so add a column of 1's to the features
X_train_new = sm.add_constant(X_train)

# A Generalized Linear Model with the binomial family is logistic regression
# Unlike scikit-learn, statsmodels takes y before X
binomial_glm = sm.GLM(y_train, X_train_new, family=sm.families.Binomial())
model = binomial_glm.fit()

model.summary()

0,1,2,3
Dep. Variable:,InMichelin,No. Observations:,123.0
Model:,GLM,Df Residuals:,118.0
Model Family:,Binomial,Df Model:,4.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-57.266
Date:,"Tue, 01 Oct 2024",Deviance:,114.53
Time:,17:16:40,Pearson chi2:,254.0
No. Iterations:,6,Pseudo R-squ. (CS):,0.3534
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-10.6490,2.588,-4.115,0.000,-15.722,-5.576
Food,0.3818,0.148,2.572,0.010,0.091,0.673
Decor,0.0743,0.103,0.720,0.471,-0.128,0.277
Service,-0.1569,0.147,-1.070,0.285,-0.444,0.131
Price,0.0819,0.036,2.269,0.023,0.011,0.153


## Logistic Regression with constraints on size of coefficients

In [22]:
# C is the inverse of the regularization strength: a SMALLER C constrains the betas MORE
# (the opposite direction from the penalty weight in ridge/lasso regression). It is a tuning parameter we can find with a grid search.
# Note: L2 shrinks coefficients toward 0 without ever reaching it. L1 has the potential to zero coefficients out.

# C=100: compare the coefficients to the regular (unpenalized) model above.
logreg = LogisticRegression(C=100, penalty='l2')
logreg.fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg.coef_))

train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)
print("Training set score: {:.3f}".format(train_accuracy))
print("Test set score: {:.3f}".format(test_accuracy))

predicted_vals = logreg.predict(X_test)  # predicted class labels for the test set
print("logreg.predict: {}".format(predicted_vals))

logreg .coef_: [[ 0.38167141  0.07437487 -0.15680599  0.08189124]]
Training set score: 0.797
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]


In [23]:
# Now use C=1 (a stronger penalty than C=100) and compare the coefficients to the models above.
logreg = LogisticRegression(C=1, penalty='l2')
logreg.fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg.coef_))

train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)
print("Training set score: {:.3f}".format(train_accuracy))
print("Test set score: {:.3f}".format(test_accuracy))

predicted_vals = logreg.predict(X_test)  # predicted class labels for the test set
print("logreg.predict: {}".format(predicted_vals))

logreg .coef_: [[ 0.37185966  0.07491681 -0.14897335  0.08113449]]
Training set score: 0.797
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]


In [24]:
# Make C even smaller: with C=.0001 the penalty is very strong. Compare the coefficients to the models above.
# Does the model's predictive power get better or worse?

logreg = LogisticRegression(C=.0001, penalty='l2')
logreg.fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg.coef_))

train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)
print("Training set score: {:.3f}".format(train_accuracy))
print("Test set score: {:.3f}".format(test_accuracy))

predicted_vals = logreg.predict(X_test)  # predicted class labels for the test set
print("logreg.predict: {}".format(predicted_vals))

logreg .coef_: [[0.00549409 0.00672588 0.00502436 0.02866608]]
Training set score: 0.699
Test set score: 0.732
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 0 0]


In [26]:
# What if we want an L1 penalty instead? Set penalty to 'l1' and solver to 'liblinear'.
# Does the model's predictive power get better or worse? Do any coefficients shrink to 0 and drop out of the model?
# Note: coefficients that are zeroed out can hint at feature importance for additional research.
# Solvers optimize the parameters of the model. Liblinear is a commonly used solver that handles both L1 and L2.

logreg = LogisticRegression(C=.01, penalty='l1', solver='liblinear')
logreg.fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg.coef_))

train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)
print("Training set score: {:.3f}".format(train_accuracy))
print("Test set score: {:.3f}".format(test_accuracy))

predicted_vals = logreg.predict(X_test)  # predicted class labels for the test set
print("logreg.predict: {}".format(predicted_vals))

logreg .coef_: [[-0.02290067  0.          0.          0.00967786]]
Training set score: 0.699
Test set score: 0.732
logreg.predict: [0 0 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 0 1 1 0 0 1 0 1 1 0 0 1 0
 1 1 1 1]


## Multiclass models (Multinomial model)

In [29]:
from sklearn.datasets import load_iris
import numpy as np

# The dependent variable has three categories - the iris flower types setosa, versicolor, virginica
iris = load_iris()
X = iris.data
y = iris.target

print(iris.feature_names)  # X variable names
print(X[:5])  # first five rows of data

print(iris.target_names)  # target categories
print(np.unique(y))  # target values

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
['setosa' 'versicolor' 'virginica']
[0 1 2]


In [30]:
# Multinomial (softmax) logistic regression on the three iris classes.
# Note the argument changes to LogisticRegression(): the solver is set to lbfgs and the iterations are increased.
# multi_class="multinomial" is intentionally not passed: the argument is deprecated since scikit-learn 1.5,
# and with the lbfgs solver a multinomial model is already what gets fitted when y has three or more classes.

logreg = LogisticRegression(solver="lbfgs", max_iter=10000).fit(X, y)
print(logreg.predict(X))  # predictions for the same rows the model was fitted on

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]




In [33]:
# Softmax transforms raw values into predicted probabilities
# It plays the role of the logistic function, but for more than two classes
# Here is the calculation written out for the example values 1, 3 and 2...
from math import exp

# Exponentiate every value once, then share a single normalizing constant
exp_1, exp_3, exp_2 = (exp(value) for value in (1, 3, 2))
normalizer = exp_1 + exp_3 + exp_2

# Each probability is that value's exponential divided by the total
p1 = exp_1 / normalizer
p2 = exp_3 / normalizer
p3 = exp_2 / normalizer

# Report probabilities
print(p1, p2, p3)

# Report sum of probabilities
print(p1 + p2 + p3)

0.09003057317038046 0.6652409557748219 0.24472847105479767
1.0


## Alternative scoring metrics

In [37]:
# The scoring argument of cross_val_score() selects which metric is reported
from sklearn import svm, datasets
from sklearn.model_selection import cross_val_score

iris = datasets.load_iris()
X = iris.data
y = iris.target
clf = svm.SVC(probability=True, random_state=0)

# For now we will use 'accuracy' for classification models and r-squared in regression models
# NOTE(review): clf is a classifier, so the r2 call below treats the integer class labels as numbers;
# it is here to show the scoring argument, not as a recommended classification metric.
accuracy_scores = cross_val_score(clf, X, y, scoring="accuracy")
r2_scores = cross_val_score(clf, X, y, scoring="r2")

# Each call returns one score per cross-validation fold; report the average
print(f'Accuracy: {accuracy_scores.mean()}')
print(f'R-squared: {r2_scores.mean()}')

Accuracy: 0.9666666666666666
R-squared: 0.95


In [39]:
# See all possible options: list the name of every built-in scorer
# (these are the strings accepted by the scoring argument, e.g. 'accuracy' and 'r2')
from sklearn import metrics
metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'd2_absolute_error_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'neg_root_mean_squared_log_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall