# logistic regression

## high-level questions

## imports

In [91]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [92]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

## prep

In [93]:
appointments = pd.read_csv("data/appointments_2.csv",
                          parse_dates=['scheduled_day','appointment_day'],
                          )
appointments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66746 entries, 0 to 66745
Data columns (total 19 columns):
Unnamed: 0             66746 non-null int64
appointment_id         66746 non-null int64
patient_id             66746 non-null float64
scheduled_day          66746 non-null datetime64[ns]
appointment_day        66746 non-null datetime64[ns]
age                    66746 non-null int64
neighborhood           66746 non-null object
scholarship            66746 non-null int64
hypertension           66746 non-null int64
diabetes               66746 non-null int64
alcoholism             66746 non-null int64
handicap               66746 non-null int64
sms_received           66746 non-null int64
no_show                66746 non-null int64
male                   66746 non-null int64
days_to_appointment    66746 non-null int64
scheduled_weekday      66746 non-null int64
appointment_weekday    66746 non-null int64
handicap_binary        66746 non-null int64
dtypes: datetime64[ns](2), float64

### get dummies

In [94]:
# get dummies for categorical data: appointment day, neighborhood
# only do this for categorical features we know have statistical significance
appointments_dummies = pd.get_dummies(appointments, columns=['appointment_weekday','neighborhood'],drop_first=True)

In [95]:
features_dummies = set(appointments_dummies.columns) - set(appointments.columns)

features_neighborhood = []
for column in features_dummies:
    if 'neighborhood' in column:
            features_neighborhood.append(column)
            
features_weekday = []
for column in features_dummies:
    if 'weekday' in column:
            features_weekday.append(column)

# logistic regression

## without dummies: take 1

This model allows me to play with various categorical predictors without managing all the dummy predictors. I assume that toggling anyone of these will have more of an impact than toggling an individual dummy.

In [96]:
# split the data for training and testing
predictors = [
    'age', 
    'scholarship', 
    'hypertension',
    'diabetes', 
    'alcoholism', 
    'sms_received',
    'male',
    'days_to_appointment', 
    'scheduled_weekday', 
    'appointment_weekday',
    'handicap_binary'
]        

predictors_2 = {
    'days_to_appointment', 
    'scholarship', 
#     'hypertension', 
#     'diabetes', 
#     'sms_received',
    'age',
#     'appointment_weekday', 
#     'handicap_binary'
}
    

x = appointments[predictors]
y = appointments['no_show']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# train the model
model = LogisticRegression()
model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [97]:
predictions = model.predict(x_test)

print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.71      1.00      0.83      9466
          1       0.50      0.00      0.01      3884

avg / total       0.65      0.71      0.59     13350



In [98]:
pd.DataFrame(data=model.coef_[0],index=predictors,columns=['coefficient'])\
    .sort_values(by='coefficient', ascending=False)

Unnamed: 0,coefficient
alcoholism,0.400799
scholarship,0.272325
diabetes,0.11508
days_to_appointment,0.007789
scheduled_weekday,-0.005419
age,-0.009857
appointment_weekday,-0.021444
handicap_binary,-0.02156
male,-0.024231
hypertension,-0.05852


## without dummies: take 2

I will toggle the features to try and get the best predictions.

In [99]:
# split the data for training and testing
predictors = [
    'age', 
    'scholarship', 
#     'hypertension',
#     'diabetes', 
    'alcoholism', 
    'sms_received',
#     'male',
    'days_to_appointment', 
#     'scheduled_weekday', 
#     'appointment_weekday',
#     'handicap_binary'
]        
    

x = appointments[predictors]
y = appointments['no_show']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# train the model
model = LogisticRegression()
model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [100]:
predictions = model.predict(x_test)

print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.71      1.00      0.83      9466
          1       0.50      0.00      0.01      3884

avg / total       0.65      0.71      0.59     13350



In [101]:
pd.DataFrame(data=model.coef_[0],index=predictors,columns=['coefficient'])\
    .sort_values(by='coefficient', ascending=False)

Unnamed: 0,coefficient
alcoholism,0.421504
scholarship,0.277441
days_to_appointment,0.00778
age,-0.009942
sms_received,-0.244444


<div class='alert alert-danger'>
I observed that the optimal list has very little to do with the coefficient size. I was able to remove features with higher coefficients without changing my outcomes. yet when I removed features with very little coefficients, I got worse outcomes. it appears that looking at a coefficient is the wrong way to determine if a feature should be part of the model.
<li>why is this the case?
</div>

## with dummies: take 1

take into account the optimizations made in the previous regression model by turning off certain predictors and including all of the dummies

In [102]:
# split the data for training and testing
predictors = [
    'age', 
    'scholarship', 
#     'hypertension',
#     'diabetes', 
    'alcoholism', 
    'sms_received',
#     'male',
    'days_to_appointment', 
#     'scheduled_weekday', 
#     'appointment_weekday',
#     'handicap_binary'
] + list(features_dummies)        
    

x = appointments_dummies[predictors]
y = appointments_dummies['no_show']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# train the model
model = LogisticRegression()
model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [103]:
predictions = model.predict(x_test)

print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.71      1.00      0.83      9466
          1       0.47      0.01      0.01      3884

avg / total       0.64      0.71      0.59     13350



In [104]:
pd.DataFrame(data=model.coef_[0],index=predictors,columns=['coefficient'])\
    .sort_values(by='coefficient', ascending=False)

Unnamed: 0,coefficient
neighborhood_ILHAS OCEÂNICAS DE TRINDADE,0.586143
neighborhood_HORTO,0.522034
neighborhood_GURIGICA,0.424360
neighborhood_ITARARÉ,0.384415
alcoholism,0.380018
neighborhood_ARIOVALDO FAVALESSA,0.347837
neighborhood_ENSEADA DO SUÁ,0.326879
neighborhood_JESUS DE NAZARETH,0.306876
neighborhood_SANTOS DUMONT,0.278623
neighborhood_SANTA CECÍLIA,0.272105


<div class='alert alert-danger'>
<li>why do some of these neighborhoods have such high coefficients, even though their correlations are very low?
<li>these neighborhoods have higher coefficients than my other features, but my guess is that there are so many of them that they decrease the precision by adding a lot of noise. does that mean there is potential to improve this model by removing some of these neighborhoods but not all? if so,which ones?
</div>

It looks like certain neighborhoods are very strong predictors, but there are so many of them that creates a lot of noise. I will keep the top third of neighborhoods with the highest correlations.

### neighborhood correlations

In [105]:
correlations = appointments_dummies.corr()

correlations = correlations.loc[features_neighborhood,'no_show']
correlations.sort_values(ascending=False)

neighborhood_JESUS DE NAZARETH              0.034742
neighborhood_ITARARÉ                        0.034299
neighborhood_GURIGICA                       0.026887
neighborhood_SANTOS DUMONT                  0.016241
neighborhood_CARATOÍRA                      0.014007
neighborhood_ANDORINHAS                     0.012861
neighborhood_ILHA DO PRÍNCIPE               0.012008
neighborhood_RESISTÊNCIA                    0.009998
neighborhood_SANTA CECÍLIA                  0.009560
neighborhood_MARIA ORTIZ                    0.009190
neighborhood_ILHAS OCEÂNICAS DE TRINDADE    0.008549
neighborhood_SANTA CLARA                    0.008108
neighborhood_HORTO                          0.007888
neighborhood_ARIOVALDO FAVALESSA            0.006062
neighborhood_SÃO BENEDITO                   0.005796
neighborhood_BONFIM                         0.005028
neighborhood_ROMÃO                          0.004999
neighborhood_PRAIA DO SUÁ                   0.004365
neighborhood_SANTO ANDRÉ                    0.

### predict only using neighborhood

I am confused as to why the coef for neihborhood is so high. Let me see if this is actually a good predictor on its own.

In [106]:
# split the data for training and testing
predictors = features_neighborhood 
    

x = appointments_dummies[predictors]
y = appointments_dummies['no_show']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# train the model
model = LogisticRegression()
model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [107]:
predictions = model.predict(x_test)

print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.71      1.00      0.83      9466
          1       0.00      0.00      0.00      3884

avg / total       0.50      0.71      0.59     13350



  'precision', 'predicted', average, warn_for)


In [108]:
pd.DataFrame(data=model.coef_[0],index=predictors,columns=['coefficient'])\
    .sort_values(by='coefficient', ascending=False)

Unnamed: 0,coefficient
neighborhood_ILHAS OCEÂNICAS DE TRINDADE,0.971799
neighborhood_HORTO,0.544376
neighborhood_GURIGICA,0.478791
neighborhood_JESUS DE NAZARETH,0.431662
neighborhood_ITARARÉ,0.403134
neighborhood_ARIOVALDO FAVALESSA,0.360927
neighborhood_SANTOS DUMONT,0.354084
neighborhood_SANTA CECÍLIA,0.348775
neighborhood_CARATOÍRA,0.250774
neighborhood_ANDORINHAS,0.248554


the coefs are high but the prediction is not good. 