# Logistic Regression

In [33]:
#imports
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import logistic_regression_util

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

1. is there a relationship between covered area and home price?

2. How strong is the relationship? ${r^2 value}$

### Why NOT use linear regression?

- Binary data does not have normal distribution which is a condiction for most tyes of linear regression
- predicted target values can be neyond 0 and 1 in linear regression.
    - for probability, values outside of 0 and 1 do not make sense
- Probabilities are often not linear. can be U shaped
    ie. extreme values at the end.

#### Sidenote: Look into:
Logit Function
Sigmoid Function

### Pros:
   - Interpretability
   - we can choose to 'snap' predictions to 0 and 1 via a rule
   - its a fast model
   - very efficient
   - outputs clear predicted probabilites
   
### Cons:
   - assumes all predictors are independent of eachother
   - missing values must be dealth with prior to fitting
   - we cant solve non-linear problems with logistic regression
   - not always accurate as other classification algorithms.

__________________________

## EXERCISES

In [2]:
#grab iris dataset
from pydataset import data
df = data('iris')

#look at data
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [3]:
#change column names/ replace . with _ in column names
df.columns = [col.lower().replace('.', '_') for col in df]

#look at new column names
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [4]:
# we will have 2 different target variables 
dummies = pd.get_dummies(df['species'], drop_first=True)

#take a look at the dummie df
dummies.head()

Unnamed: 0,versicolor,virginica
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0


In [5]:
#concat dummies with original df
df = pd.concat([df, dummies], axis=1)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,versicolor,virginica
1,5.1,3.5,1.4,0.2,setosa,0,0
2,4.9,3.0,1.4,0.2,setosa,0,0
3,4.7,3.2,1.3,0.2,setosa,0,0
4,4.6,3.1,1.5,0.2,setosa,0,0
5,5.0,3.6,1.4,0.2,setosa,0,0


In [43]:
#df = pd.concat([df, dummies], axis=1).drop(columns=['species'])
#df.head()

In [6]:
#create split function
def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [7]:
#split the data
train, validate, test = train_validate_test_split(df,target = 'versicolor', seed=123)

In [8]:
#get sizes of sets
train.shape, validate.shape, test.shape

((84, 7), (36, 7), (30, 7))

In [20]:
# Make new dataframes
X_train = train.drop(columns=['versicolor'])
y_train = train.versicolor

X_validate = validate.drop(columns=['versicolor'])
y_validate = validate.versicolor

X_test = test.drop(columns=['versicolor'])
y_test = test.versicolor

In [21]:
#take a look at X_train data
X_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,virginica
97,5.7,2.9,4.2,1.3,versicolor,0
125,6.7,3.3,5.7,2.1,virginica,1
87,6.7,3.1,4.7,1.5,versicolor,0
13,4.8,3.0,1.4,0.1,setosa,0
122,5.6,2.8,4.9,2.0,virginica,1


In [22]:
#take a look at y_train
y_train.head()

97     1
125    0
87     1
13     0
122    0
Name: versicolor, dtype: uint8

In [23]:
X_train.shape, y_train.shape

((84, 6), (84,))

### Hyperparameters

#### Regularization:
- Keep model simple
- Constraints the coefficients
- Discourages learning more complex model
- Minimizes overfitting
- avoid overfitting
- L1 - Lasso
- L2 - Ridge

#### C = Inverse of regularization strength:
- Lower C - higher regularization
- As C decreases, more coefficients become 0.
- Lower C discourages learning more complex model
- minimizes overfitting

In [27]:
#Define the logistic regression model
logit = LogisticRegression(C=0.1, class_weight={0:1, 1:99}, random_state= 123)

In [28]:
#fit the model with train data
logit.fit(X_train, y_train)

ValueError: could not convert string to float: 'versicolor'

In [None]:
#now use the model to make predictions
#y_pred = logit.predict(X_train)

In [None]:
#y_ped_proba = logit,predict_proba(X_train)

In [None]:
#y_ped_proba = pd.DataFrame(y_pred_probam, columns = ['non-versicolor', 'versicolor']

In [None]:
#print(classification_report(y_train, y_pred))

In [None]:
#df.versicolor.value_counts()

## Model 2

In [29]:
logit2 = LogisticRegression(C=0.1, class_weight={0:1, 1:99}, random_state= 123)

In [30]:
#fit the model
logit2.fit(X_train, y_train)

ValueError: could not convert string to float: 'versicolor'

In [None]:
y_pred2 = logit2.predict(X_train)

In [None]:
#classifcation report
print(classification_report(y_train, y_pred2))

## Evaluate Model 1 and Model 2

In [None]:
# Make prediction for validate dataset
y_pred_validate = logit.predict(X_validate)
y_pred_validate2 = logit2.predict(X_validate)

In [None]:
print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred_validate))

print('-------------------------------')

print(classification_report(y_validate, y_pred_validate))

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print("Model 2: solver = lbfgs, c = .1")

print('Accuracy: {:.2f}'.format(logit2.score(X_validate, y_validate)))

print(confusion_matrix(y_validate, y_pred_validate2))

print(classification_report(y_validate, y_pred_validate2))

## Select which Model to evaluate on 'test' set

In [None]:
# Make prediction on X_test using model 1
#last step to take
y_pred_test = logit.predict(X_test)

In [None]:
# print classification report
print(classification_report(y_test, y_pred_test))

## Interpreting model coefficients

In [34]:
# look at model 1 coefficents and intercept
 
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [None]:
# look at model 1 coefficents only
logit.coef_[0]

#### Logistic Regression basics:¶
log(odds) = log(p/(1-p)) = $intercept$ + ($\beta_1$ variable1) + ($\beta_2$ variable2) + ($\beta_3$ * variable3)

####The coefficients above represents 'log odds'

In [38]:
# Make a dataframe of coefficients and feature names

log_coeffs = pd.DataFrame(logit.coef_[0], index = X_train.columns,
                          columns = ['coeffs']).sort_values(by = 'coeffs', ascending = True)
log_coeffs

AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [39]:
# convert from log odds to odds (exponentiate)
odds = np.exp(log_coeffs)
odds

NameError: name 'log_coeffs' is not defined

In [45]:
#RC curve
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt