In [23]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

#### Logistic Regression
- Slide Deck: 
https://www.canva.com/design/DAEfWtGKNgc/VqwX9iLNaf4TV7YJg6XflQ/view?utm_content=DAEfWtGKNgc&utm_campaign=designshare&utm_medium=link2&utm_source=sharebutton

- logistic regression in sklearn

Pros and Cons

In [2]:
from pydataset import data

df = data('iris')
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [3]:
# columns name change
df.columns = [col.lower().replace('.', '_') for col in df]
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [4]:
# Binary classification - predict if species is non-virginica or virginica
df['species'] = np.where(df.species == 'virginica', 1, 0)

In [5]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,5.1,3.5,1.4,0.2,0
2,4.9,3.0,1.4,0.2,0
3,4.7,3.2,1.3,0.2,0
4,4.6,3.1,1.5,0.2,0
5,5.0,3.6,1.4,0.2,0


## Predict if species is virginica or not

In [7]:
def train_validate_test_split(df, target, seed=1349):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target],)
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [8]:
train, validate, test = train_validate_test_split(df, 'species')

In [9]:
train.shape, validate.shape, test.shape

((84, 5), (36, 5), (30, 5))

In [11]:
# Make new dataframes
X_train = train.drop(columns='species')
y_train = train.species
X_val = validate.drop(columns='species')
y_val = validate.species
X_test = test.drop(columns='species')
y_test = test.species

# Model 1

In [32]:
# Define the logistic regression model
logit = LogisticRegression()

In [33]:
#  fit the model on train data
logit.fit(X_train, y_train)

LogisticRegression()

In [34]:
# now use the model to make predictions
y_pred = logit.predict(X_train)

In [35]:
#take a look at predictions
y_pred[:5]

array([1, 0, 1, 0, 1])

In [36]:
# look at predicted probabilites for first 10 observations
logit.predict_proba(X_train)[:10][:,1] > 0.5

array([ True, False,  True, False,  True, False, False,  True, False,
       False])

In [37]:
logit.classes_

array([0, 1])

In [38]:
# View raw probabilities (output from the model)
y_pred_proba = pd.DataFrame(logit.predict_proba(X_train))
columns = ['non-virginica', 'virginica']
y_pred_proba.head()

Unnamed: 0,0,1
0,0.377272,0.622728
1,0.923081,0.076919
2,0.22102,0.77898
3,0.75716,0.24284
4,0.273458,0.726542


In [39]:
# classification report
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0
support,56.0,28.0,1.0,84.0,84.0


## Model 2

In [26]:
# Change hyperparameter C = 0.01
logit2 = LogisticRegression(C = 0.01)

In [27]:
# fit the model
logit2.fit(X_train, y_train)

LogisticRegression(C=0.01)

In [30]:
# make prediction
y_pred2 = logit2.predict(X_train)

In [31]:
#classification report
pd.DataFrame(classification_report(y_train, y_pred2, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.727273,1.0,0.75,0.863636,0.818182
recall,1.0,0.25,0.75,0.625,0.75
f1-score,0.842105,0.4,0.75,0.621053,0.694737
support,56.0,28.0,0.75,84.0,84.0


## Evaluate Model 1 and 2 performance on 'Validate'

In [40]:
# Make prediction for validate dataset
pd.DataFrame(classification_report(y_train, y_pred, output_dict=True))


Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0
support,56.0,28.0,1.0,84.0,84.0


In [41]:
pd.DataFrame(classification_report(y_train, y_pred2, output_dict=True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.727273,1.0,0.75,0.863636,0.818182
recall,1.0,0.25,0.75,0.625,0.75
f1-score,0.842105,0.4,0.75,0.621053,0.694737
support,56.0,28.0,0.75,84.0,84.0


### Hyperparameters
#### Regularization:
- Keep model simple
- Constraints the coefficients
- Discourages learning more complex model
- Minimizes overfitting
- L1 - Lasso
- L2 - Ridge

#### C = Inverse of regularization strength:

- Lower C - higher regularization
- Lower C discourages learning more complex model
- minimizes overfitting

## Bonus: Interpreting model coefficients

In [46]:
# look at model 1 coefficents
logit.coef_[0]


array([-0.12099641, -0.29704264,  2.43792504,  2.10506578])

#### Logistic Regression basics:

log(odds) = log(p/(1-p)) = $intercept$ + ($\beta_1$ * variable1) + ($\beta_2$ * variable2) + ($\beta_3$ * variable3)

**The coefficients above represents 'log odds'**

In [48]:
# Make a dataframe of coefficients and feature names

log_coeffs = pd.DataFrame(logit.coef_[0], index=X_train.columns, columns=['coeff'])
log_coeffs

Unnamed: 0,coeff
sepal_length,-0.120996
sepal_width,-0.297043
petal_length,2.437925
petal_width,2.105066


**It would be helpful to convert 'log odds' to 'odds'**

In [49]:
# convert from log odds to odds (exponentiate)
odds = np.exp(log_coeffs)
odds

Unnamed: 0,coeff
sepal_length,0.886037
sepal_width,0.743012
petal_length,11.449259
petal_width,8.207643


What is odds?

odds = P(occurring) / P(not occurring)  = p / (1-p)

Toss a fair coin
odds = 0.5 / (1-0.5) = 1   i.e. Odd of landing tails vs heads is 1:1 for fair coin

Rolling 2 or higher on a dice roll  
odd = (5/6) /  (1/6) = 5 i.e. Odd of rolling a 2 or higher on a dice is 5:1 for a fair die

#### Coefficient Interpretation (odds):


- **Example: petal_length: For every one unit increase in petal_length, we expect 10 times increase in odds of being a 'virginica' vs a 'non-virginica'.**


- **If the coefficient (odds) is 1 or close to 1, this means odds of being in class '1' (positive class) is same or close to being in class '0' (negative class). This means the feature with this coefficient is not a big driver for the target variable in this particular model**

- **If the coefficient value is < 1 , that implies that increase in value of that feature will decrease the odds that target variable is in positive class**