In [130]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from mord import LogisticIT
import matplotlib.pylab as plt
import seaborn as sns
from dmba import classificationSummary, gainsChart, liftChart
from dmba.metric import AIC_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [131]:
flights_df = pd.read_csv('FlightDelays.csv')
flights_df.head()

Unnamed: 0,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,DISTANCE,FL_DATE,FL_NUM,ORIGIN,Weather,DAY_WEEK,DAY_OF_MONTH,TAIL_NUM,Flight Status
0,1455,OH,1455,JFK,184,37987,5935,BWI,0,4,1,N940CA,ontime
1,1640,DH,1640,JFK,213,2004-01-01,6155,DCA,0,4,1,N405FJ,ontime
2,1245,DH,1245,LGA,229,2004-01-01,7208,IAD,0,4,1,N695BR,ontime
3,1715,DH,1709,LGA,229,2004-01-01,7215,IAD,0,4,1,N662BR,ontime
4,1039,DH,1035,LGA,229,2004-01-01,7792,IAD,0,4,1,N698BR,ontime


In [132]:
flights_df.drop(columns=['FL_DATE','DAY_OF_MONTH','TAIL_NUM'], inplace=True)
flights_df.columns = [c.replace(' ', '_') for c in flights_df.columns]

In [133]:
flights_df.head()

Unnamed: 0,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,DISTANCE,FL_NUM,ORIGIN,Weather,DAY_WEEK,Flight_Status
0,1455,OH,1455,JFK,184,5935,BWI,0,4,ontime
1,1640,DH,1640,JFK,213,6155,DCA,0,4,ontime
2,1245,DH,1245,LGA,229,7208,IAD,0,4,ontime
3,1715,DH,1709,LGA,229,7215,IAD,0,4,ontime
4,1039,DH,1035,LGA,229,7792,IAD,0,4,ontime


In [134]:
flights_df['Flight_Status'] = flights_df['Flight_Status'].astype('category')
new_categories = {'ontime': '1', 'delayed': '0'}
flights_df.Flight_Status.cat.rename_categories(new_categories, inplace=True)

  res = method(*args, **kwargs)


In [135]:
x = ['CRS_DEP_TIME', 'CARRIER','DEP_TIME','DEST','DAY_WEEK', 'DISTANCE','FL_NUM','ORIGIN','Weather']
y = ['Flight Status']


train_get = pd.get_dummies(flights_df[x])

train_get.head()

Unnamed: 0,CRS_DEP_TIME,DEP_TIME,DAY_WEEK,DISTANCE,FL_NUM,Weather,CARRIER_CO,CARRIER_DH,CARRIER_DL,CARRIER_MQ,CARRIER_OH,CARRIER_RU,CARRIER_UA,CARRIER_US,DEST_EWR,DEST_JFK,DEST_LGA,ORIGIN_BWI,ORIGIN_DCA,ORIGIN_IAD
0,1455,1455,4,184,5935,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0
1,1640,1640,4,213,6155,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0
2,1245,1245,4,229,7208,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1
3,1715,1709,4,229,7215,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1
4,1039,1035,4,229,7792,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1


In [136]:
flights_df.head()

Unnamed: 0,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,DISTANCE,FL_NUM,ORIGIN,Weather,DAY_WEEK,Flight_Status
0,1455,OH,1455,JFK,184,5935,BWI,0,4,1
1,1640,DH,1640,JFK,213,6155,DCA,0,4,1
2,1245,DH,1245,LGA,229,7208,IAD,0,4,1
3,1715,DH,1709,LGA,229,7215,IAD,0,4,1
4,1039,DH,1035,LGA,229,7792,IAD,0,4,1


In [137]:
Predictors = flights_df[['CRS_DEP_TIME', 'CARRIER','DEP_TIME','DEST','DAY_WEEK', 'DISTANCE','FL_NUM','ORIGIN','Weather']]

outcome = flights_df['Flight_Status']

flights = pd.get_dummies(Predictors, prefix_sep='_', drop_first=True)
flights.head()

Unnamed: 0,CRS_DEP_TIME,DEP_TIME,DAY_WEEK,DISTANCE,FL_NUM,Weather,CARRIER_DH,CARRIER_DL,CARRIER_MQ,CARRIER_OH,CARRIER_RU,CARRIER_UA,CARRIER_US,DEST_JFK,DEST_LGA,ORIGIN_DCA,ORIGIN_IAD
0,1455,1455,4,184,5935,0,0,0,0,1,0,0,0,1,0,0,0
1,1640,1640,4,213,6155,0,1,0,0,0,0,0,0,1,0,1,0
2,1245,1245,4,229,7208,0,1,0,0,0,0,0,0,0,1,0,1
3,1715,1709,4,229,7215,0,1,0,0,0,0,0,0,0,1,0,1
4,1039,1035,4,229,7792,0,1,0,0,0,0,0,0,0,1,0,1


In [138]:
df1 = pd.concat([flights, flights_df['Flight_Status']], axis=1)

In [139]:
df1.head()

Unnamed: 0,CRS_DEP_TIME,DEP_TIME,DAY_WEEK,DISTANCE,FL_NUM,Weather,CARRIER_DH,CARRIER_DL,CARRIER_MQ,CARRIER_OH,CARRIER_RU,CARRIER_UA,CARRIER_US,DEST_JFK,DEST_LGA,ORIGIN_DCA,ORIGIN_IAD,Flight_Status
0,1455,1455,4,184,5935,0,0,0,0,1,0,0,0,1,0,0,0,1
1,1640,1640,4,213,6155,0,1,0,0,0,0,0,0,1,0,1,0,1
2,1245,1245,4,229,7208,0,1,0,0,0,0,0,0,0,1,0,1,1
3,1715,1709,4,229,7215,0,1,0,0,0,0,0,0,0,1,0,1,1
4,1039,1035,4,229,7792,0,1,0,0,0,0,0,0,0,1,0,1,1


In [140]:
y = df1['Flight_Status']
X = df1.drop(columns=['Flight_Status'])

In [141]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

In [142]:
logit_reg = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
logit_reg.fit(train_X, train_y)

LogisticRegression(C=1e+42, solver='liblinear')

In [143]:
print('intercept ', logit_reg.intercept_[0])
print(pd.DataFrame({'coeff': logit_reg.coef_[0]}, index=X.columns).transpose())
print()

intercept  0.008509310998325623
       CRS_DEP_TIME  DEP_TIME  DAY_WEEK  DISTANCE    FL_NUM   Weather  \
coeff      0.031797 -0.032137  0.054899  0.009245 -0.000024 -0.098039   

       CARRIER_DH  CARRIER_DL  CARRIER_MQ  CARRIER_OH  CARRIER_RU  CARRIER_UA  \
coeff    0.185431    0.360711   -0.463409    0.079538   -0.059711    0.017088   

       CARRIER_US  DEST_JFK  DEST_LGA  ORIGIN_DCA  ORIGIN_IAD  
coeff   -0.029231  0.051608   0.06268    -0.12494    0.009464  



In [144]:
logit_reg_pred = logit_reg.predict(valid_X)
logit_reg_proba = logit_reg.predict_proba(valid_X)
logit_result = pd.DataFrame({'actual': valid_y,
'p(0)': [p[0] for p in logit_reg_proba],
'p(1)': [p[1] for p in logit_reg_proba],
'predicted': logit_reg_pred })
print(logit_result)

     actual      p(0)      p(1) predicted
1276      1  0.142007  0.857993         1
1446      1  0.100284  0.899716         1
335       1  0.070346  0.929654         1
1458      1  0.135791  0.864209         1
2038      1  0.057262  0.942738         1
...     ...       ...       ...       ...
460       1  0.200111  0.799889         1
2063      1  0.049348  0.950652         1
159       1  0.250096  0.749904         1
2027      0  0.038904  0.961096         1
1247      1  0.047517  0.952483         1

[881 rows x 4 columns]


In [145]:
interestingCases = [1276,335,460,1458,2027]
print(logit_result.loc[interestingCases])

     actual      p(0)      p(1) predicted
1276      1  0.142007  0.857993         1
335       1  0.070346  0.929654         1
460       1  0.200111  0.799889         1
1458      1  0.135791  0.864209         1
2027      0  0.038904  0.961096         1


In [146]:
classificationSummary(train_y, logit_reg.predict(train_X))

Confusion Matrix (Accuracy 0.8902)

       Prediction
Actual    0    1
     0  127  134
     1   11 1048


In [147]:
classificationSummary(valid_y, logit_reg.predict(valid_X))

Confusion Matrix (Accuracy 0.9024)

       Prediction
Actual   0   1
     0  85  82
     1   4 710


In [155]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,DISTANCE,FL_DATE,FL_NUM,ORIGIN,Weather,DAY_WEEK,DAY_OF_MONTH,TAIL_NUM
0,16,DH,1641,JFK,228,1/19/2004,7810,IAD,0,1,19,N327UE
1,6,DL,630,LGA,214,1/21/2004,1740,DCA,0,3,21,N242DL
2,6,MQ,558,JFK,213,1/6/2004,4760,DCA,0,2,6,N739MQ
3,18,DL,1828,LGA,214,1/21/2004,1764,DCA,0,3,21,N242DL


In [156]:
test.drop(columns=['FL_DATE','DAY_OF_MONTH','TAIL_NUM'], inplace=True)
test.columns = [c.replace(' ', '_') for c in test.columns]

In [157]:
test.head()

Unnamed: 0,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,DISTANCE,FL_NUM,ORIGIN,Weather,DAY_WEEK
0,16,DH,1641,JFK,228,7810,IAD,0,1
1,6,DL,630,LGA,214,1740,DCA,0,3
2,6,MQ,558,JFK,213,4760,DCA,0,2
3,18,DL,1828,LGA,214,1764,DCA,0,3


In [159]:
Predictors = test[['CRS_DEP_TIME', 'CARRIER','DEP_TIME','DEST','DAY_WEEK', 'DISTANCE','FL_NUM','ORIGIN','Weather']]


flights3 = pd.get_dummies(Predictors, prefix_sep='_', drop_first=True)
flights3.head()

Unnamed: 0,CRS_DEP_TIME,DEP_TIME,DAY_WEEK,DISTANCE,FL_NUM,Weather,CARRIER_DL,CARRIER_MQ,DEST_LGA,ORIGIN_IAD
0,16,1641,1,228,7810,0,0,0,0,1
1,6,630,3,214,1740,0,1,0,1,0
2,6,558,2,213,4760,0,0,1,0,0
3,18,1828,3,214,1764,0,1,0,1,0


In [153]:
coloumn_names =['DAY_WEEK','CRS_DEP_TIME','ORIGIN_BWI','ORIGIN_DCA','ORIGIN_IAD','DEST_EWR','DEST_JFK','DEST_LGA','CARRIER_CO','CARRIER_DH','CARRIER_DL','CARRIER_MQ','CARRIER_OH','CARRIER_RU','CARRIER_UA','CARRIER_US','FL_NUM','DISTANCE','Weather']
tes_X = tes_X.reindex(columns = coloumn_names)
a=tes_X.fillna(0)

print(a)

NameError: name 'tes_X' is not defined

In [128]:
logit_reg_pred = logit_reg.predict(a)

NameError: name 'a' is not defined