In [679]:
# Importing all necessary libraries
# munging imports
import pandas as pd
import numpy as np

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline

# modeling imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, precision_recall_curve,f1_score, fbeta_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Model Exploration

*In this notebook, I will use a smaller subset of the data to explore and compare the models where I need to dummify variables before choosing a final model and training on all the available data points.*

Pull in a sample of the data.

In [737]:
#pull in a 10,000 datapoint sample from our dataframe, with unecessary columns dropped.
sample_df=pd.read_csv('/Users/mehikapatel/Flights_Project/Data/FinalFlightsData.csv').drop(columns=['Unnamed: 0',
                                                                                                    'DEP_TIME',
                                                                                                    'DEP_DELAY',
                                                                                                    'ARR_DELAY',
                                                                                                    'CANCELLED',
                                                                                                    'DATE',
                                                                                                    'FLIGHT_NUM',
                                                                                                    'origin_lat',
                                                                                                    'origin_lon',
                                                                                                    'dest_lat',
                                                                                                    'dest_lon',
                                                                                                    'CRS_DEP_TIME',
                                                                                                    'CRS_ARR_TIME',
                                                                                                   'municipality1',
                                                                                                   'municipality2',
                                                                                                   'origin_type','dest_type']).sample(n=10000,
                                                                                                                        random_state=6)

In [683]:
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40000 entries, 6469840 to 1051986
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   AIRLINE           40000 non-null  object 
 1   ORIGIN            40000 non-null  object 
 2   DEST              40000 non-null  object 
 3   CRS_ELAPSED_TIME  40000 non-null  float64
 4   DISTANCE          40000 non-null  float64
 5   MONTH             40000 non-null  int64  
 6   DAYOFWEEK         40000 non-null  int64  
 7   holiday_szn       40000 non-null  bool   
 8   DEP_HOUR          40000 non-null  int64  
 9   ARR_HOUR          40000 non-null  int64  
 10  target            40000 non-null  bool   
 11  origin_weather    19654 non-null  object 
 12  origin_severity   19654 non-null  object 
 13  dest_weather      19656 non-null  object 
 14  dest_severity     19656 non-null  object 
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 4.3+ MB


Fill NA values for weather information.

In [738]:
sample_df.fillna({'origin_weather':'Normal','origin_severity':0,'dest_weather':'Normal','dest_severity':0},inplace=True)

Dummify all features to make them viable for model creation.

In [739]:
#Airline
sample_df = sample_df.join(pd.get_dummies(sample_df['AIRLINE'],drop_first=True))

#drop original
sample_df.drop('AIRLINE',axis=1,inplace=True)

In [740]:
#Origin
sample_df = sample_df.join(pd.get_dummies(sample_df['ORIGIN'],drop_first=True))

#drop original
sample_df.drop('ORIGIN',axis=1,inplace=True)

Because we have matching values in the origin and dest columns and have already made dummy variables for all our origin airports, we must change those in the destination column slightly to be able to make dummy variables pertaining to the destination. So, we coerce the destination column by adding "2" to each data point before making dummies to indicate destination airport code.

In [741]:
sample_df['DEST'] = sample_df['DEST'].astype(str) + '2'

In [742]:
#Dest
sample_df = sample_df.join(pd.get_dummies(sample_df['DEST'],drop_first=True))

#drop original
sample_df.drop('DEST',axis=1,inplace=True)

In [743]:
#Month
    #two features cos and sin:
sample_df['sin_month']=np.sin(2*np.pi*sample_df.MONTH/12)
sample_df['cos_month']=np.cos(2*np.pi*sample_df.MONTH/12)

#drop original
sample_df.drop('MONTH',axis=1,inplace=True)

In [744]:
#Day of Week
    #two features cos and sin:
sample_df['sin_DOW']=np.sin(2*np.pi*sample_df.DAYOFWEEK/7)
sample_df['cos_DOW']=np.cos(2*np.pi*sample_df.DAYOFWEEK/7)

#drop original
sample_df.drop('DAYOFWEEK',axis=1,inplace=True)

In [694]:
# #origin Type
# sample_df = sample_df.join(pd.get_dummies(sample_df['origin_type'],drop_first=True))

# #drop original
# sample_df.drop('origin_type',axis=1,inplace=True)

Similar to above with our airport codes, we have to slightly change the destination type by adding "2" .

In [695]:
# sample_df['dest_type'] = sample_df['dest_type'].astype(str) + '2'

In [696]:
# # dest type
# sample_df = sample_df.join(pd.get_dummies(sample_df['dest_type'],drop_first=True))

# #drop original
# sample_df.drop('dest_type',axis=1,inplace=True)

In [745]:
#Dep hour
sample_df['sin_DEP_HOUR']=np.sin(2*np.pi*sample_df.DEP_HOUR/24)
sample_df['cos_DEP_HOUR']=np.cos(2*np.pi*sample_df.DEP_HOUR/24)

#drop original
sample_df.drop('DEP_HOUR',axis=1,inplace=True)

In [746]:
# Arr hour
sample_df['sin_ARR_HOUR']=np.sin(2*np.pi*sample_df.ARR_HOUR/24)
sample_df['cos_ARR_HOUR']=np.cos(2*np.pi*sample_df.ARR_HOUR/24)

#drop original
sample_df.drop('ARR_HOUR',axis=1,inplace=True)

In [747]:
# origin weather
sample_df = sample_df.join(pd.get_dummies(sample_df['origin_weather'],drop_first=True))

#drop original
sample_df.drop('origin_weather',axis=1,inplace=True)

Again, we will do the same for destination weather.

In [748]:
sample_df['dest_weather'] = sample_df['dest_weather'].astype(str) + '2'

In [749]:
# dest weather
sample_df = sample_df.join(pd.get_dummies(sample_df['dest_weather'],drop_first=True))

#drop original
sample_df.drop('dest_weather',axis=1,inplace=True)

We will be converting the weather severity columns into ordinal data.

Specifically, there are 6 values:
1. Light
2. Moderate
3. Severe
4. UNK (unknown)
5. Heavy
6. Other

For the light,moderate, heavy, and severe, we will make light = 1, moderate = 2, heavy = 3, severe =4. 
The other categories will be removed from the dataset since we do not definetively know, and this could skew our results. 

In [750]:
#Drop unknown and others fully from dataset considering they have weather data but no severity level.
sample_df = sample_df[sample_df.origin_severity != 'UNK']
sample_df = sample_df[sample_df.origin_severity != 'Other']

In [751]:
# origin severity

# lambda x: x*10 if x<2 else (x**2 if x<4 else x+10)

sample_df['origin_sev'] =sample_df['origin_severity'].apply(lambda x: 1 if x == "Light" else(2 if x == "Moderate" else(3 if x == "Heavy" else (4 if x== "Severe" else 0 ))))
sample_df.drop(columns=['origin_severity'],inplace=True)


In [752]:
# dest severity
sample_df['dest_sev'] =sample_df['dest_severity'].apply(lambda x: 1 if x == "Light" else(2 if x == "Moderate" else(3 if x == "Heavy" else (4 if x== "Severe" else 0 ))))

#drop original columns for origin and dest severity
sample_df.drop(columns=['dest_severity'],inplace=True)

## Metric Decision: F1

The metric that will be used to compare models will be the balanced F1 metric to find an optimal balance between precision and recall. We imagine it is equally harmful/beneficial to increase our recall rate as it is to increase our precision rate when classifiying a future flight as "delayed" or not. 

## Dealing with Class Imbalance:

In [643]:
sample_df.target.value_counts()

False    81158
True     17507
Name: target, dtype: int64

In [644]:
Fal = 8148
Tru = 1715
Tot = Fal + Tru

print(f'The percentage of NOT Delayed is {round((Fal/Tot)*100)}%')
print(f'The percentage of Delayed is {round((Tru/Tot)*100)}%')

The percentage of NOT Delayed is 83%
The percentage of Delayed is 17%


Because our target classes are strongly imbalanced (83% vs 17%), we will use class_weights when modeling to make sure our models catch our smaller class.

## Create our first simple model with the sample data with all the features to 

Create basic KNN & Logistic Models.

In [753]:
#split training and test sets:
X = sample_df.drop(columns=['target'])
y = sample_df.target

#split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)

#### KNN Model

In [754]:
std_scale = StandardScaler()
X_train_scaled = std_scale.fit_transform(X_train)
X_test_scaled = std_scale.transform(X_test)

In [755]:
# Train on training set, and Test on testing set
knn = KNeighborsClassifier(n_neighbors=5,weights='distance')
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

# Accuracy Score: 0.7161088573360379
# Precision Score: 0.1987724268177526
# Recall Score: 0.20211233797407585
# F1 Score: 0.20042846941204476

Accuracy Score: 0.777289624873268
Precision Score: 0.18421052631578946
Recall Score: 0.08155339805825243
F1 Score: 0.11305518169582773


#### Logistic Model

In [783]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight={0:1,1:5.3}) # setting C very high essentially removes regularization
lr.fit(X_train_scaled, y_train)

y_pred = lr.predict(X_test_scaled)

print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

# Accuracy Score: 0.8196416497633536
# Precision Score: 0.2616822429906542
# Recall Score: 0.01344215074411906
# F1 Score: 0.02557077625570776

Accuracy Score: 0.5542412977357215
Precision Score: 0.21326676176890158
Recall Score: 0.5805825242718446
F1 Score: 0.3119457485654669


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [784]:
# training data metrics: Shows this is JUST a bad Model!

y_pred = lr.predict(X_train_scaled)

print(f'Accuracy Score: {metrics.accuracy_score(y_train, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_train, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_train, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_train, y_pred)}')

# Accuracy Score: 0.8248143452273139
# Precision Score: 0.5666666666666667
# Recall Score: 0.020987654320987655
# F1 Score: 0.04047619047619047

Accuracy Score: 0.6423812282734647
Precision Score: 0.2955204640670319
Recall Score: 0.7641666666666667
F1 Score: 0.4262142691145712


### The following is a lump of code that removes columns with zeroed out coefficient in the logistic model. ( but on a copy of sample_df so we can readjust things as needed.)

In [940]:
df.info()

NameError: name 'df' is not defined

In [904]:
colums = pd.Series(copy_df.columns)
coefs = pd.Series(lr.coef_[0]).abs()
frames = [colums,coefs]
coeficients = pd.concat(frames,axis=1)
coeficients.rename({0:'Col',1:'Coef'},axis=1,inplace=True)
zeroed_cos = coeficients[coeficients['Coef']==0]
zeroed_cos.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zeroed_cos.dropna(inplace=True)


In [905]:
#drop all columns with lr coef 0.
copy_df.drop(columns=zeroed_cos.Col,axis=1,inplace=True)

In [906]:
#to see how many columns are left
copy_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9863 entries, 6469840 to 6616243
Columns: 543 entries, CRS_ELAPSED_TIME to dest_sev
dtypes: bool(2), float64(9), int64(2), uint8(530)
memory usage: 5.9 MB


Fit logistic on this copy df

In [907]:
#split training and test sets:
Xcop = copy_df.drop(columns=['target'])
ycop = copy_df.target

#split into train and test
X_traincop, X_testcop, y_traincop, y_testcop = train_test_split(Xcop, ycop, test_size=0.3, random_state=42,stratify=y)

In [908]:
std_scale = StandardScaler()
X_train_scaledcop = std_scale.fit_transform(X_traincop)
X_test_scaledcop = std_scale.transform(X_testcop)

In [909]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(class_weight={0:1,1:5.3}) # setting C very high essentially removes regularization
lr.fit(X_train_scaledcop, y_traincop)

y_predcop = lr.predict(X_test_scaledcop)

print(f'Accuracy Score: {metrics.accuracy_score(y_testcop, y_predcop)}')
print(f'Precision Score: {metrics.precision_score(y_testcop, y_predcop)}')
print(f'Recall Score: {metrics.recall_score(y_testcop, y_predcop)}')
print(f'F1 Score: {metrics.f1_score(y_testcop, y_predcop)}')

Accuracy Score: 0.5552551537681649
Precision Score: 0.21855235418130708
Recall Score: 0.6038834951456311
F1 Score: 0.32094943240454077


In [939]:
lr2 = LogisticRegression(class_weight={0:1,1:5.9}) # setting C very high essentially removes regularization
lr2.fit(X_train_scaledcop, y_traincop)


y_predcop = (lr2.predict_proba(X_test_scaledcop)[:,1] >= 0.51).astype(bool)

print(f'Accuracy Score: {metrics.accuracy_score(y_testcop, y_predcop)}')
print(f'Precision Score: {metrics.precision_score(y_testcop, y_predcop)}')
print(f'Recall Score: {metrics.recall_score(y_testcop, y_predcop)}')
print(f'F1 Score: {metrics.f1_score(y_testcop, y_predcop)}')

Accuracy Score: 0.5376816492058127
Precision Score: 0.21736249171636846
Recall Score: 0.6368932038834951
F1 Score: 0.3241106719367589
