# logistic regression

## imports

In [259]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [260]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# user generated wrapper function with p values
# https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d
# from p_values_for_logreg import LogisticReg

## prep

In [261]:
# load the prepared appointments table, then list its columns, dtypes and non-null counts
appointments = pd.read_csv(filepath_or_buffer="data/appointments_2.csv")
appointments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66746 entries, 0 to 66745
Data columns (total 20 columns):
Unnamed: 0             66746 non-null int64
appointment_id         66746 non-null int64
patient_id             66746 non-null float64
scheduled_day          66746 non-null object
appointment_day        66746 non-null object
age                    66746 non-null int64
neighborhood           66746 non-null object
scholarship            66746 non-null int64
hypertension           66746 non-null int64
diabetes               66746 non-null int64
alcoholism             66746 non-null int64
handicap               66746 non-null int64
sms_received           66746 non-null int64
no_show                66746 non-null int64
male                   66746 non-null int64
days_to_appointment    66746 non-null int64
scheduled_weekday      66746 non-null int64
appointment_weekday    66746 non-null int64
age_cubed              66746 non-null int64
handicap_binary        66746 non-null int64
dtypes

In [262]:
# drop the useless features
# errors='ignore' keeps this cell re-runnable: it reassigns `appointments` from
# itself, so on a second run the columns are already gone and drop() would
# otherwise raise a KeyError
appointments = appointments.drop(
    ['Unnamed: 0', 'appointment_id', 'patient_id',
     'scheduled_day', 'appointment_day', 'age_cubed'],
    axis=1,
    errors='ignore',
)

### get dummies

In [263]:
# one-hot encode the categorical columns: both weekday columns and neighborhood
# only do this for categorical features we know have statistical significance
# drop_first=True leaves out the first level of every encoded column
appointments_dummies = pd.get_dummies(
    data=appointments,
    drop_first=True,
    columns=['scheduled_weekday', 'appointment_weekday', 'neighborhood'],
)

### export

In [281]:
# write the one-hot encoded frame to disk, leaving out the row index
appointments_dummies.to_csv(path_or_buf='data/appointments_dummies.csv', index=False)

In [265]:
# create feature sets to facilitate trying different features in model
# the dummy columns are the ones present in the encoded frame but not in the source frame
features_dummies = set(appointments_dummies.columns) - set(appointments.columns)

# Build the lists by walking the DataFrame's columns rather than the set:
# iterating a set of strings gives a different order on every kernel restart,
# which would silently reorder any feature matrix selected with these lists.
features_neighborhood = [
    column for column in appointments_dummies.columns
    if column in features_dummies and 'neighborhood' in column
]

features_weekday = [
    column for column in appointments_dummies.columns
    if column in features_dummies and 'weekday' in column
]

In [266]:
# neighborhood is the only text column left in `appointments`; drop it so the
# frame is all-numeric (its encoded form lives in appointments_dummies).
# errors='ignore' makes the cell safe to re-run after the column is already gone.
appointments = appointments.drop('neighborhood', axis=1, errors='ignore')

### p values

In [267]:
# chi-squared test of every remaining feature against no_show,
# to get a p value per feature
from sklearn.feature_selection import chi2

data = appointments.drop(['no_show'], axis='columns')
X = data.loc[:, :]
y = appointments.loc[:, 'no_show']

scores, p_values = chi2(X, y)

# one row per feature, largest p value first
(
    pd.DataFrame({'p_value': p_values}, index=data.columns)
    .sort_values('p_value', ascending=False)
)

Unnamed: 0,p_value
male,0.3378687
appointment_weekday,0.1485553
scheduled_weekday,0.08865173
handicap,0.04695163
handicap_binary,0.02694353
alcoholism,9.2493e-07
diabetes,2.129059e-08
sms_received,4.37892e-10
scholarship,4.867132e-30
hypertension,2.024157e-39


<div class='alert alert-danger'>
<li>source: https://stackoverflow.com/questions/22306341/python-sklearn-how-to-calculate-p-values
<li>These p values do not correspond to the research I did previously on correlations. Appointment weekday has a higher correlation than male,for example. Is this p value the same as what paul recommended?
<li>in retrospect, it looks like this was not to the right test
<li>separately, this library did not use the 'frequency' features of age and days to appointment. I must've done it wrong.
</div>

# logistic regression

## without dummies

This is a test run. a simpler model is easier to play around with.

In [268]:
# target is no_show; every other remaining column is a predictor
y = appointments['no_show']
x = appointments.drop('no_show', axis='columns')

# hold out 20% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

# fit a logistic regression with the library's default settings
model = LogisticRegression()
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [269]:
# predict the held-out rows and print per-class precision, recall and f1
predictions = model.predict(x_test)
print(classification_report(y_true=y_test, y_pred=predictions))

             precision    recall  f1-score   support

          0       0.71      1.00      0.83      9466
          1       0.50      0.00      0.01      3884

avg / total       0.65      0.71      0.59     13350



<div class='alert alert-danger'>
<li>I included age_cubed into my model, and it ruined it. does this mean that I need to use standard scaler to make this play nice with the others? if that is the case,screws how does the standard scaler differentiate between frequency and boolean variables?
</div>

### p values

In [270]:
# look at the p values for all predictors
# this will tell you which ones are most valuable

<div class='alert alert-danger'>
<li>https://datascience.stackexchange.com/questions/15398/how-to-get-p-value-and-confident-interval-in-logisticregression-with-sklearn
<li>according to the link above, there's no built-in way to do this. this seems problematic for scikit learn
</div>

In [271]:
# look at the coefficients just for curiosity sake:
# one row per predictor, largest coefficient first
(
    pd.DataFrame({'coefficient': model.coef_[0]}, index=x.columns)
    .sort_values('coefficient', ascending=False)
)

Unnamed: 0,coefficient
alcoholism,0.401292
scholarship,0.271958
diabetes,0.11489
handicap,0.01292
days_to_appointment,0.007787
scheduled_weekday,-0.005487
age,-0.009865
appointment_weekday,-0.021474
male,-0.024309
handicap_binary,-0.0263


<div class='alert alert-info'>
I observed that the optimal list has very little to do with the coefficient size. I was able to remove features with higher coefficients without changing my outcomes. yet when I removed features with very little coefficients, I got worse outcomes. it appears that looking at a coefficient is the wrong way to determine if a feature should be part of the model.
<li>why is this the case?
<li>Coefficients determine the slope of a function, not how well the function fits the data. High coefficients merely mean a different type of slope.
</div>

## with dummies

Here I use all predictors. With so many dummies we lose explanatory power, but we potentially increase prediction.

In [272]:
# same setup as the simpler model, but on the one-hot encoded frame:
# no_show is the target and all other columns are predictors
y = appointments_dummies['no_show']
x = appointments_dummies.drop(['no_show'], axis='columns')

# 80/20 train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

# fit a logistic regression with the library's default settings
model = LogisticRegression()
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [273]:
# predict the held-out rows with the dummy-feature model and print the per-class report
predictions = model.predict(x_test)
print(classification_report(y_true=y_test, y_pred=predictions))

             precision    recall  f1-score   support

          0       0.71      1.00      0.83      9466
          1       0.52      0.01      0.01      3884

avg / total       0.65      0.71      0.59     13350



### coefficients and correlations

I want to see the connection between correlation and coefficient stats. this will not help the model, but I am curious.

In [280]:
# make coefficients in the data frame to merge with correlations
# (one row per predictor of the fitted model)
coefficients = pd.DataFrame(
    data=model.coef_[0],
    index=x.columns,
    columns=['coefficient']
)

# calculate correlations
# Only the neighborhood-dummy vs. no_show correlations are needed, so correlate
# those columns directly with the target instead of building the full pairwise
# matrix with .corr() and then discarding all but one column of it.
correlations = (
    appointments_dummies[features_neighborhood]
    .corrwith(appointments_dummies['no_show'])
    .to_frame(name='correlation')
)

# join the 2 on the feature name; the default inner join keeps only the
# features that appear in both frames, i.e. the neighborhood dummies
coefficients.merge(correlations, left_index=True, right_index=True)

Unnamed: 0,coefficient,correlation
neighborhood_ANDORINHAS,0.221020,0.012861
neighborhood_ANTÔNIO HONÓRIO,-0.190862,-0.004915
neighborhood_ARIOVALDO FAVALESSA,0.347410,0.006062
neighborhood_BARRO VERMELHO,0.211257,-0.001008
neighborhood_BELA VISTA,0.023389,-0.000075
neighborhood_BENTO FERREIRA,0.047623,-0.003097
neighborhood_BOA VISTA,-0.186964,-0.004543
neighborhood_BONFIM,0.071392,0.005028
neighborhood_CARATOÍRA,0.228883,0.014007
neighborhood_CENTRO,0.110621,0.000045


<div class='alert alert-info'>
why do some of these neighborhoods have such high coefficients, even though their 
correlations are very low?
<li>coefficient does not mean correlation
these neighborhoods have higher coefficients than my other features, but my guess is that there are so many of them that they decrease the precision by adding a lot of noise. does that mean there is potential to improve this model by removing some of these neighborhoods but not all? if so,which ones?
<li>a bad predictor is better than no predictor. you do not need to remove columns unless you are trying to optimize for time constraints, or explanatory power.
</div>

## ideas
https://en.wikipedia.org/wiki/Stepwise_regression