In [2]:
# Importing all necessary libraries
# munging imports
import pandas as pd
import numpy as np

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline

# modeling imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, precision_recall_curve,f1_score, fbeta_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Model Exploration

*In this notebook, I will use a smaller subset of the data to explore and compare the models where I need to dummify variables before choosing a final model and training on all the available data points.*

Pull in a sample of the data.

In [3]:
#pull in all data
df=pd.read_csv('/Users/mehikapatel/Flights_Project/Data/FinalFlightsData.csv').drop(columns=['Unnamed: 0',
                                                                                                    'DEP_TIME',
                                                                                                    'DEP_DELAY',
                                                                                                    'ARR_DELAY',
                                                                                                    'CANCELLED',
                                                                                                    'DATE',
                                                                                                    'FLIGHT_NUM',
                                                                                                    'origin_lat',
                                                                                                    'origin_lon',
                                                                                                    'dest_lat',
                                                                                                    'dest_lon',
                                                                                                    'CRS_DEP_TIME',
                                                                                                    'CRS_ARR_TIME',
                                                                                                   'municipality1',
                                                                                                   'municipality2',
                                                                                                   'origin_type','dest_type'])

Fill NA values for weather information.

In [4]:
df.fillna({'origin_weather':'Normal','origin_severity':0,'dest_weather':'Normal','dest_severity':0},inplace=True)

Dummify all features to make them viable for model creation.

In [5]:
#Airline
df = df.join(pd.get_dummies(df['AIRLINE'],drop_first=True))

#drop original
df.drop('AIRLINE',axis=1,inplace=True)

In [6]:
#Origin
df = df.join(pd.get_dummies(df['ORIGIN'],drop_first=True))

#drop original
df.drop('ORIGIN',axis=1,inplace=True)

Because we have matching values in the origin and dest columns and have already made dummy variables for all our origin airports, we must change those in the destination column slightly to be able to make dummy variables pertaining to the destination. So, we coerce the destination column by adding "2" to each data point before making dummies to indicate destination airport code.

In [7]:
df['DEST'] = df['DEST'].astype(str) + '2'

In [8]:
#Dest
df = df.join(pd.get_dummies(df['DEST'],drop_first=True))

#drop original
df.drop('DEST',axis=1,inplace=True)

In [9]:
#Month
    #two features cos and sin:
df['sin_month']=np.sin(2*np.pi*df.MONTH/12)
df['cos_month']=np.cos(2*np.pi*df.MONTH/12)

#drop original
df.drop('MONTH',axis=1,inplace=True)

In [10]:
#Day of Week
    #two features cos and sin:
df['sin_DOW']=np.sin(2*np.pi*df.DAYOFWEEK/7)
df['cos_DOW']=np.cos(2*np.pi*df.DAYOFWEEK/7)

#drop original
df.drop('DAYOFWEEK',axis=1,inplace=True)

In [11]:
# #origin Type
# sample_df = sample_df.join(pd.get_dummies(sample_df['origin_type'],drop_first=True))

# #drop original
# sample_df.drop('origin_type',axis=1,inplace=True)

Similar to above with our airport codes, we have to slightly change the destination type by adding "2" .

In [12]:
# sample_df['dest_type'] = sample_df['dest_type'].astype(str) + '2'

In [13]:
# # dest type
# sample_df = sample_df.join(pd.get_dummies(sample_df['dest_type'],drop_first=True))

# #drop original
# sample_df.drop('dest_type',axis=1,inplace=True)

In [14]:
#Dep hour
df['sin_DEP_HOUR']=np.sin(2*np.pi*df.DEP_HOUR/24)
df['cos_DEP_HOUR']=np.cos(2*np.pi*df.DEP_HOUR/24)

#drop original
df.drop('DEP_HOUR',axis=1,inplace=True)

In [15]:
# Arr hour
df['sin_ARR_HOUR']=np.sin(2*np.pi*df.ARR_HOUR/24)
df['cos_ARR_HOUR']=np.cos(2*np.pi*df.ARR_HOUR/24)

#drop original
df.drop('ARR_HOUR',axis=1,inplace=True)

In [16]:
# origin weather
df = df.join(pd.get_dummies(df['origin_weather'],drop_first=True))

#drop original
df.drop('origin_weather',axis=1,inplace=True)

Again, we will do the same for destination weather.

In [17]:
df['dest_weather'] = df['dest_weather'].astype(str) + '2'

In [18]:
# dest weather
df = df.join(pd.get_dummies(df['dest_weather'],drop_first=True))

#drop original
df.drop('dest_weather',axis=1,inplace=True)

We will be converting the weather severity columns into ordinal data.

Specifically, there are 6 values:
1. Light
2. Moderate
3. Severe
4. UNK (unknown)
5. Heavy
6. Other

For the light,moderate, heavy, and severe, we will make light = 1, moderate = 2, heavy = 3, severe =4. 
The other categories will be removed from the dataset since we do not definetively know, and this could skew our results. 

In [19]:
#Drop unknown and others fully from dataset considering they have weather data but no severity level.
df = df[df.origin_severity != 'UNK']
df = df[df.origin_severity != 'Other']

In [20]:
# origin severity

# lambda x: x*10 if x<2 else (x**2 if x<4 else x+10)

df['origin_sev'] =df['origin_severity'].apply(lambda x: 1 if x == "Light" else(2 if x == "Moderate" else(3 if x == "Heavy" else (4 if x== "Severe" else 0 ))))
df.drop(columns=['origin_severity'],inplace=True)


In [21]:
# dest severity
df['dest_sev'] = df['dest_severity'].apply(lambda x: 1 if x == "Light" else(2 if x == "Moderate" else(3 if x == "Heavy" else (4 if x== "Severe" else 0 ))))

#drop original columns for origin and dest severity
df.drop(columns=['dest_severity'],inplace=True)

In [25]:
# df.to_csv('/Users/mehikapatel/Flights_Project/FlightsDummified')

## Create our Final Full models with the data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12306041 entries, 0 to 12477211
Columns: 761 entries, CRS_ELAPSED_TIME to dest_sev
dtypes: bool(2), float64(10), int64(2), uint8(747)
memory usage: 9.8 GB


In [27]:
df_samp = df.sample(300000)

Create basic KNN & Logistic Models.

In [28]:
#split training and test sets:
X = df_samp.drop(columns=['target'])
y = df_samp.target

#split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)

In [29]:
std_scale = StandardScaler()
X_train_scaled = std_scale.fit_transform(X_train)
X_test_scaled = std_scale.transform(X_test)

#### Logistic Model

In [74]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight={0:1,1:5.3}) # setting C very high essentially removes regularization
lr.fit(X_train_scaled, y_train)

y_pred = lr.predict(X_test_scaled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.5437888888888889
Precision Score: 0.23445269684081338
Recall Score: 0.6942355889724311
F1 Score: 0.35052753128015307


In [32]:
# training data metrics: Shows this is JUST a bad Model!

y_pred = lr.predict(X_train_scaled)

print(f'Accuracy Score: {metrics.accuracy_score(y_train, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_train, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_train, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_train, y_pred)}')

# Accuracy Score: 0.8248143452273139
# Precision Score: 0.5666666666666667
# Recall Score: 0.020987654320987655
# F1 Score: 0.04047619047619047

Accuracy Score: 0.5476095238095238
Precision Score: 0.23858359658571468
Recall Score: 0.7077683198625171
F1 Score: 0.3568691696340324


### The following is a lump of code that removes columns with zeroed out coefficient in the logistic model. ( but on a copy of sample_df so we can readjust things as needed.)

In [60]:
copy_df = df_samp.copy()

In [53]:
colums = pd.Series(copy_df.columns)
coefs = pd.Series(lr.coef_[0]).abs()
frames = [colums,coefs]
coeficients = pd.concat(frames,axis=1)
coeficients.rename({0:'Col',1:'Coef'},axis=1,inplace=True)
zeroed_cos = coeficients[coeficients['Coef']==0]
zeroed_cos.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zeroed_cos.dropna(inplace=True)


In [54]:
#drop all columns with lr coef 0.
copy_df.drop(columns=zeroed_cos.Col,axis=1,inplace=True)

In [61]:
#to see how many columns are left
copy_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 2196431 to 7098487
Columns: 761 entries, CRS_ELAPSED_TIME to dest_sev
dtypes: bool(2), float64(10), int64(2), uint8(747)
memory usage: 244.0 MB


Fit logistic on this copy df

In [56]:
#split training and test sets:
Xcop = copy_df.drop(columns=['target'])
ycop = copy_df.target

#split into train and test
X_traincop, X_testcop, y_traincop, y_testcop = train_test_split(Xcop, ycop, test_size=0.3, random_state=42,stratify=y)

In [57]:
std_scale = StandardScaler()
X_train_scaledcop = std_scale.fit_transform(X_traincop)
X_test_scaledcop = std_scale.transform(X_testcop)

In [69]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight={0:1,1:5.3}) # setting C very high essentially removes regularization
lr.fit(X_train_scaledcop, y_traincop)

y_predcop = lr.predict(X_test_scaledcop)

print(f'Accuracy Score: {metrics.accuracy_score(y_testcop, y_predcop)}')
print(f'Precision Score: {metrics.precision_score(y_testcop, y_predcop)}')
print(f'Recall Score: {metrics.recall_score(y_testcop, y_predcop)}')
print(f'F1 Score: {metrics.f1_score(y_testcop, y_predcop)}')

Accuracy Score: 0.5422555555555556
Precision Score: 0.2338599118385253
Recall Score: 0.6947368421052632
F1 Score: 0.34992820286241777


* Accuracy Score: 0.5422555555555556
* Precision Score: 0.2338599118385253
* Recall Score: 0.6947368421052632
* F1 Score: 0.34992820286241777

In [59]:
lr2 = LogisticRegression(class_weight={0:1,1:5.9}) # setting C very high essentially removes regularization
lr2.fit(X_train_scaledcop, y_traincop)


y_predcop = (lr2.predict_proba(X_test_scaledcop)[:,1] >= 0.51).astype(bool)

print(f'Accuracy Score: {metrics.accuracy_score(y_testcop, y_predcop)}')
print(f'Precision Score: {metrics.precision_score(y_testcop, y_predcop)}')
print(f'Recall Score: {metrics.recall_score(y_testcop, y_predcop)}')
print(f'F1 Score: {metrics.f1_score(y_testcop, y_predcop)}')

Accuracy Score: 0.5115888888888889
Precision Score: 0.22774568722406985
Recall Score: 0.7337092731829574
F1 Score: 0.3475963607759325


2:
* Accuracy Score: 0.5118
* Precision Score: 0.22763911062653325
* Recall Score: 0.7325814536340852
* F1 Score: 0.3473455928225543

3:
* Accuracy Score: 0.5434888888888889
* Precision Score: 0.23395874804116726
* Recall Score: 0.6922305764411028
* F1 Score: 0.34971985692127505

Our above iterations do not improve the model, so we will stick with the original.

In [123]:
# Test it on certain flights!

test_points = df.sample(3)
X = test_points.drop(columns=['target'])
y = test_points.target

In [124]:
std_scale = StandardScaler()
X = std_scale.fit_transform(X)

In [125]:
lr.predict(X)

array([ True, False,  True])

In [126]:
y

11623755    False
5811482     False
9822348      True
Name: target, dtype: bool

In [102]:
df.loc[11623755]
df.loc[5811482]
df.loc[9822348]

CRS_ELAPSED_TIME     80.0
DISTANCE            337.0
holiday_szn         False
target              False
Allegiant Air           0
                    ...  
Rain2                   0
Snow2                   0
Storm2                  0
origin_sev              0
dest_sev                2
Name: 1514261, Length: 761, dtype: object