In [25]:
import pandas as pd
import numpy as np
import datetime, warnings, scipy
from collections import OrderedDict
from sklearn import metrics, linear_model
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
import joblib

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so any pandas / scikit-learn warnings
# raised by later cells are hidden too — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [3]:
!pip install kaggle
! mkdir ~/.kaggle 
! cp kaggle.json ~/.kaggle/

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# Restrict the staged API token to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
# List datasets through the Kaggle CLI — presumably a smoke test that the CLI
# and the staged token work before downloading anything.
! kaggle datasets list

ref                                                                title                                                size  lastUpdated          downloadCount  voteCount  usabilityRating  
-----------------------------------------------------------------  --------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
victorsoeiro/netflix-tv-shows-and-movies                           Netflix TV Shows and Movies                           2MB  2022-05-15 00:01:23          13571        395  1.0              
zusmani/petrolgas-prices-worldwide                                 Petrol/Gas Prices Worldwide                          10KB  2022-06-24 01:25:33           1430         78  1.0              
ruchi798/data-science-job-salaries                                 Data Science Job Salaries                             7KB  2022-06-15 08:59:12           2433         82  1.0              
devansodariya/student-performance-data       

In [6]:
# Download the `usdot/flight-delays` dataset archive with the Kaggle CLI.
!kaggle datasets download -d usdot/flight-delays

Downloading flight-delays.zip to /content
 92% 175M/191M [00:00<00:00, 224MB/s]
100% 191M/191M [00:00<00:00, 249MB/s]


In [7]:
!unzip flight-delays.zip

Archive:  flight-delays.zip
  inflating: airlines.csv            
  inflating: airports.csv            
  inflating: flights.csv             


In [42]:
# Read the extracted flight records into a DataFrame.
flights = pd.read_csv(filepath_or_buffer="flights.csv")

In [43]:
# (rows, columns) of the full, unfiltered frame.
flights.shape

(5819079, 31)

In [44]:
# Keep only the March rows (MONTH == 3).
# .copy() makes this an independent frame: later cells add a column to it and
# mutate it in place, which on a bare boolean-indexed selection is the
# chained-assignment pattern pandas warns about (SettingWithCopyWarning).
flights_march = flights.loc[flights['MONTH'] == 3].copy()

In [45]:
# Preview the first rows of the March subset.
flights_march.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
899159,2015,3,1,7,AA,2400,N5DEAA,LAX,DFW,5,...,,,0,1,B,,,,,
899160,2015,3,1,7,AS,98,N793AS,ANC,SEA,5,...,434.0,7.0,0,0,,,,,,
899161,2015,3,1,7,DL,2579,N67171,DEN,ATL,15,...,441.0,-30.0,0,0,,,,,,
899162,2015,3,1,7,US,2020,N561UW,PHX,CLT,15,...,537.0,-23.0,0,0,,,,,,
899163,2015,3,1,7,AA,258,N3HYAA,LAX,MIA,20,...,750.0,-14.0,0,0,,,,,,


In [46]:
# Binary target: 1 when ARRIVAL_DELAY is strictly greater than 15, otherwise 0.
# A missing ARRIVAL_DELAY compares False against the threshold, so those rows
# are labelled 0 as well.
flights_march['DELAY'] = flights_march['ARRIVAL_DELAY'].gt(15).astype('int64')

In [47]:
# Columns discarded before modelling. This includes ARRIVAL_DELAY, which the
# DELAY target was derived from.
to_remove = [
    'DAY', 'MONTH', 'YEAR',
    'FLIGHT_NUMBER', 'TAIL_NUMBER',
    'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
    'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME',
    'WHEELS_OFF', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'TAXI_IN',
    'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY',
    'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
    'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY',
]
# Rebind to the reduced frame instead of mutating it in place.
flights_march = flights_march.drop(columns=to_remove)

In [48]:
# Discard every row that still has a missing value in any remaining column.
flights_march = flights_march.dropna(axis=0, how='any')

In [49]:
# One-hot encode AIRLINE (first category dropped as the baseline) and swap the
# original column for the indicator columns, which are appended on the right.
# The right-hand side is evaluated first, so get_dummies still sees AIRLINE.
flights_march = flights_march.drop(columns=['AIRLINE']).join(
    pd.get_dummies(flights_march['AIRLINE'], drop_first=True)
)

In [50]:
# Display the prepared frame (numeric features, DELAY target, airline indicators).
flights_march

Unnamed: 0,DAY_OF_WEEK,DEPARTURE_DELAY,TAXI_OUT,SCHEDULED_TIME,DISTANCE,DELAY,AS,B6,DL,EV,F9,HA,MQ,NK,OO,UA,US,VX,WN
899160,7,27.0,17.0,202.0,1448,0,1,0,0,0,0,0,0,0,0,0,0,0,0
899161,7,2.0,11.0,176.0,1199,0,0,0,1,0,0,0,0,0,0,0,0,0,0
899162,7,-1.0,11.0,225.0,1773,0,0,0,0,0,0,0,0,0,0,0,1,0,0
899163,7,-4.0,19.0,284.0,2342,0,0,0,0,0,0,0,0,0,0,0,0,0,0
899164,7,86.0,7.0,177.0,1299,1,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1403466,2,-5.0,12.0,207.0,1546,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1403467,2,-2.0,21.0,172.0,1242,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1403468,2,-5.0,7.0,243.0,1825,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1403469,2,-8.0,12.0,250.0,1874,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [51]:
# Separate the target vector from the feature matrix.
y = flights_march['DELAY']
X = flights_march.drop(columns='DELAY')

In [52]:
# Display the feature matrix (every column except DELAY).
X

Unnamed: 0,DAY_OF_WEEK,DEPARTURE_DELAY,TAXI_OUT,SCHEDULED_TIME,DISTANCE,AS,B6,DL,EV,F9,HA,MQ,NK,OO,UA,US,VX,WN
899160,7,27.0,17.0,202.0,1448,1,0,0,0,0,0,0,0,0,0,0,0,0
899161,7,2.0,11.0,176.0,1199,0,0,1,0,0,0,0,0,0,0,0,0,0
899162,7,-1.0,11.0,225.0,1773,0,0,0,0,0,0,0,0,0,0,1,0,0
899163,7,-4.0,19.0,284.0,2342,0,0,0,0,0,0,0,0,0,0,0,0,0
899164,7,86.0,7.0,177.0,1299,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1403466,2,-5.0,12.0,207.0,1546,0,0,0,0,1,0,0,0,0,0,0,0,0
1403467,2,-2.0,21.0,172.0,1242,0,0,0,0,0,0,0,0,1,0,0,0,0
1403468,2,-5.0,7.0,243.0,1825,0,0,0,0,0,0,0,0,0,1,0,0,0
1403469,2,-8.0,12.0,250.0,1874,0,0,0,0,0,0,0,0,0,1,0,0,0


In [53]:
# Column labels of the feature matrix, kept for later reference.
features = X.columns

In [54]:
# 80/20 hold-out split with a fixed seed, stratified on the target so both
# parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [55]:
# Relative class frequencies of the target in the training split.
y_train.value_counts(normalize=True)

0    0.814054
1    0.185946
Name: DELAY, dtype: float64

In [56]:
# Relative class frequencies of the target in the test split.
y_test.value_counts(normalize=True)

0    0.814057
1    0.185943
Name: DELAY, dtype: float64

In [57]:
# Standardise the features: fit the scaler on the training split only, then
# apply that same fitted transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaler = scaler.transform(X_train)
X_test_scaler = scaler.transform(X_test)

In [58]:
# Load a previously persisted classifier from disk; the bare name on the last
# line displays its repr.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load 'knn_model.pkl' from a trusted source.
# NOTE(review): nothing in this notebook creates 'knn_model.pkl'; presumably it was
# trained elsewhere on identically prepared and scaled features — confirm.
model_opened = joblib.load('knn_model.pkl')
model_opened

KNeighborsClassifier(metric='euclidean')

In [59]:
# Macro-averaged F1 of the loaded model on the scaled test split.
f1_score(
    y_true=y_test,
    y_pred=model_opened.predict(X_test_scaler),
    average='macro',
)

0.8875667035457351

In [60]:
# F1 of the loaded model for the positive class only, on the scaled test split.
f1_score(
    y_true=y_test,
    y_pred=model_opened.predict(X_test_scaler),
    average='binary',
)

0.8148233200332979

`'macro'`:
Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.

`'binary'`:
Only report results for the class specified by `pos_label`. This is applicable only if targets (`y_true` and `y_pred`) are binary.