In [12]:
import pandas as pd
import numpy as np
import time
from sklearn.externals import joblib

In [37]:
df=pd.read_csv('final.csv')


In [38]:
df.shape

(431420, 31)

In [39]:
df.columns

Index(['DAY_OF_WEEK', 'FL_DATE', 'UNIQUE_CARRIER', 'FL_NUM', 'ORIGIN', 'DEST',
       'ARR_DELAY', 'CANCELLED', 'CANCELLATION_CODE', 'DISTANCE', 'DEP_HOUR',
       'ARR_HOUR', 'YEAR', 'MONTH', 'DAY_OF_MONTH', 'DEP_HOURLYVISIBILITY',
       'DEP_HOURLYDRYBULBTEMPC', 'DEP_HOURLYWindSpeed', 'DEP_HOURLYPrecip',
       'ARR_HOURLYVISIBILITY', 'ARR_HOURLYDRYBULBTEMPC', 'ARR_HOURLYWindSpeed',
       'ARR_HOURLYPrecip', 'DEP_AVG_HOURLYVISIBILITY',
       'DEP_AVG_HOURLYDRYBULBTEMPC', 'DEP_AVG_HOURLYWindSpeed',
       'DEP_AVG_HOURLYPrecip', 'ARR_AVG_HOURLYVISIBILITY',
       'ARR_AVG_HOURLYDRYBULBTEMPC', 'ARR_AVG_HOURLYWindSpeed',
       'ARR_AVG_HOURLYPrecip'],
      dtype='object')

In [41]:
#Drop Variables which do not have correlation with arrival delays/cannot be predicted until the flight
df.drop(['YEAR','DAY_OF_MONTH','FL_NUM','DEP_AVG_HOURLYVISIBILITY','DEP_AVG_HOURLYDRYBULBTEMPC','DEP_AVG_HOURLYWindSpeed','DEP_AVG_HOURLYPrecip','ARR_AVG_HOURLYVISIBILITY','ARR_AVG_HOURLYDRYBULBTEMPC','ARR_AVG_HOURLYWindSpeed','ARR_AVG_HOURLYPrecip'],axis=1, inplace=True)
#Remove data redundancy
df['ARR_HOUR'] = df['ARR_HOUR'].apply(lambda x:0 if x == 24 else x)
#Drop rows with Null Values
#df.dropna(inplace=True)

#Convert to Dummy Variables
df = pd.concat([df,pd.get_dummies(df['MONTH'],drop_first=True,prefix="MONTH")],axis=1)
df = pd.concat([df,pd.get_dummies(df['DAY_OF_WEEK'],drop_first=True,prefix="DAY_OF_WEEK")],axis=1)
df = pd.concat([df,pd.get_dummies(df['UNIQUE_CARRIER'],drop_first=True,prefix="UNIQUE_CARRIER")],axis=1)
df = pd.concat([df,pd.get_dummies(df['ORIGIN'],drop_first=True,prefix="ORIGIN")],axis=1)
df = pd.concat([df,pd.get_dummies(df['DEST'],drop_first=True,prefix="DEST")],axis=1)
df = pd.concat([df,pd.get_dummies(df['DEP_HOUR'],drop_first=True,prefix="DEP_HOUR")],axis=1)
df = pd.concat([df,pd.get_dummies(df['ARR_HOUR'],drop_first=True,prefix="ARR_HOUR")],axis=1)

df.drop(['MONTH','DAY_OF_WEEK','UNIQUE_CARRIER','ORIGIN','DEST','DEP_HOUR','ARR_HOUR'],axis=1,inplace=True)
#DELAY_YN -> Delay Yes or No -> 1 if Delay>5 minutes, else 0
df['DELAY_YN'] = df['ARR_DELAY'].apply(lambda x:1 if x>=5 else 0)


In [42]:
df.shape

(431420, 96)

In [43]:
#Create 'n' different Logistic Regression Models

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

n = 10 #Number of models to average over

for i in range(n):
    
    tic = time.time()
    #Create a randomly selected smaller dataset for training purpose
    #Each dataset should have negative and positive classes in the ratio 60:40
    df_split = df.loc[np.random.choice(df[df['DELAY_YN']==1].index, 400000, replace = True)]
    df_split2 = df.loc[np.random.choice(df[df['DELAY_YN']==0].index, 600000, replace = False)]
    df_split = df_split.append(df_split2, ignore_index=True)

    X_train, X_test, y_train, y_test = train_test_split(df_split.drop(['DELAY_YN','ARR_DELAY'],axis=1),
                                                    df_split['DELAY_YN'], test_size=0.10, random_state=101)

    logmodel = LogisticRegression()
    logmodel.fit(X_train,y_train)
    
    predictions = logmodel.predict(X_test)

    truePos = X_test[((predictions == 1) & (y_test == predictions))]
    falsePos = X_test[((predictions == 1) & (y_test != predictions))]
    trueNeg = X_test[((predictions == 0) & (y_test == predictions))]
    falseNeg = X_test[((predictions == 0) & (y_test != predictions))]

    TP = truePos.shape[0]
    FP = falsePos.shape[0]
    TN = trueNeg.shape[0]
    FN = falseNeg.shape[0]

    accuracy = float(TP + TN)/float(TP + TN + FP + FN)
    print('Accuracy: '+str(accuracy))
    
    joblib.dump(logmodel, str(i)+'_logmodel.pkl') 
    
    toc = time.time()
    print(str(i+1)+"th fold took " + str(toc-tic) + " seconds")    

ValueError: Cannot take a larger sample than population when 'replace=False'