# Flight delay prediction - machine learning models (regression)

Katarzyna Mocio 429956 
Marcin Miszkiel 432418

# 1. Prepare necessary libraries


In [2]:
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, so deprecation and similar diagnostics will not be shown.
warnings.filterwarnings("ignore")
import pandas as pd
# Let wide DataFrames display up to 100 columns before pandas truncates them.
pd.set_option("display.max_columns",100)

# Data wrangling
import numpy as np
from datetime import datetime as dt 

# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

# Modelling with scikit-learn
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score, make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
import optuna
import sklearn



Delays for regression
Your task is to apply various ML algorithms (see the rules below) to build a model explaining the delays of the flights based on the training sample and generate predictions for all observations from the test sample.

The dataset includes 1631327 observations in the training sample and 407832 in the test sample and the following columns:

Weekday – Day of the week when the flight occurred (1 for Sunday, 7 for Saturday).
Month_of_Year – The numerical month (1-12) when the flight took place.
Day_of_Month – The day of the month (1-31) when the flight occurred.
Scheduled_Departure_Time – The scheduled local time of flight departure.
Scheduled_Arrival_Time – The scheduled local time of flight arrival.
Marketing_Airline – The airline code under which the flight was marketed.
Marketing_Airline_DOT_ID – Department of Transportation identifier for the marketing airline.
Flight_Number – The number assigned to the flight by the operating airline.
Origin_Airport_ID – Unique identifier for the departure airport.
Destination_Airport_ID – Unique identifier for the arrival airport.
Flight_Cancelled – Indicator of whether the flight was cancelled (1 = Yes, 0 = No).
Departure_State – The state code of the departure location.
Arrival_State – The state code of the arrival location.
Departure_Delay – Total delay in minutes at departure.
Diverted_Airport_Landings – Count of unplanned landings at other airports.
Taxi_Out_Time – Time in minutes from gate departure until takeoff.
Taxi_In_Time – Time in minutes from landing to gate arrival.
Flight_Diverted – Indicator of whether the flight was diverted (1 = Yes, 0 = No).
Actual_Departure_Time – The actual local time the flight departed.
Flight_Duration – The duration of the flight in minutes from takeoff to landing.
Flight_Distance – The total distance covered by the flight in miles.
Origin_Temperature – The temperature at the origin airport at the time of the flight’s departure.
Destination_Temperature – The temperature at the destination airport at the time of the flight’s arrival.
Origin_Wind_Speed – The wind speed at the origin airport during the departure of the flight.
Destination_Wind_Speed – The wind speed at the destination airport during the departure of the flight.
Origin_Precipitation – The amount of precipitation, such as rain or snow, at the origin airport around the flight’s departure time.
Destination_Precipitation – The amount of precipitation, such as rain or snow, at the destination airport around the flight’s arrival time.
Arrival_Delay – Total delay in minutes at arrival (outcome variable, only in the training sample)

Let's load the training data and look at the first five observations.

In [4]:
# Read the training sample from the notebook's working directory.
delays = pd.read_csv(filepath_or_buffer='delays_train.csv')

# Preview the first five rows.
delays.head(n=5)

Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Scheduled_Departure_Time,Scheduled_Arrival_Time,Marketing_Airline,Marketing_Airline_DOT_ID,Flight_Number,Origin_Airport_ID,Destination_Airport_ID,Flight_Cancelled,Departure_State,Arrival_State,Departure_Delay,Arrival_Delay,Diverted_Airport_Landings,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Actual_Departure_Time,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation
0,6,6,25,1222,1444,B6,20409,520,10397,12478.0,False,GA,NY,,,0,16.0,10.0,False,1224.0,107.0,760.0,25.14022,19.357739,21.019808,15.452723,0.0,0.0
1,2,6,21,1216,1304,,19805,6297,14107,15376.0,False,AZ,AZ,0.088687,-4.178483,0,16.0,5.0,False,1216.0,23.0,110.0,13.279939,20.47069,18.045064,12.910265,0.0,0.0
2,3,1,5,1945,2055,DL,19790,4124,13487,13076.0,False,MN,WI,,5.042185,0,34.0,5.0,False,1945.0,36.0,119.0,17.816202,13.967273,21.606228,17.976362,0.0,0.1
3,2,3,22,700,924,AA,19805,1538,15624,11298.0,False,FL,TX,-1.802698,-0.206932,0,10.0,11.0,False,658.0,125.0,641.0,24.562566,14.509228,24.946489,22.630553,0.0,0.0
4,4,7,14,2130,2359,,19930,1116,14747,12889.0,False,WA,NV,,14.006092,0,23.0,7.0,False,2133.0,130.0,867.0,8.817992,10.866812,17.426336,17.401007,0.0,0.0


In [10]:
# Keep only flights that were not cancelled. Flight_Cancelled is a boolean
# column, so negating it selects exactly the rows where it is False.
delays = delays.loc[~delays['Flight_Cancelled']]
delays

Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Scheduled_Departure_Time,Scheduled_Arrival_Time,Marketing_Airline,Marketing_Airline_DOT_ID,Flight_Number,Origin_Airport_ID,Destination_Airport_ID,Flight_Cancelled,Departure_State,Arrival_State,Departure_Delay,Arrival_Delay,Diverted_Airport_Landings,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Actual_Departure_Time,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation
0,6,6,25,1222,1444,B6,20409,520,10397,12478.0,False,GA,NY,,,0,16.0,10.0,False,1224.0,107.0,760.0,25.140220,19.357739,21.019808,15.452723,0.0,0.0
1,2,6,21,1216,1304,,19805,6297,14107,15376.0,False,AZ,AZ,0.088687,-4.178483,0,16.0,5.0,False,1216.0,23.0,110.0,13.279939,20.470690,18.045064,12.910265,0.0,0.0
2,3,1,5,1945,2055,DL,19790,4124,13487,13076.0,False,MN,WI,,5.042185,0,34.0,5.0,False,1945.0,36.0,119.0,17.816202,13.967273,21.606228,17.976362,0.0,0.1
3,2,3,22,700,924,AA,19805,1538,15624,11298.0,False,FL,TX,-1.802698,-0.206932,0,10.0,11.0,False,658.0,125.0,641.0,24.562566,14.509228,24.946489,22.630553,0.0,0.0
4,4,7,14,2130,2359,,19930,1116,14747,12889.0,False,WA,NV,,14.006092,0,23.0,7.0,False,2133.0,130.0,867.0,8.817992,10.866812,17.426336,17.401007,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1631322,7,2,27,905,1115,AS,19930,1366,14057,12889.0,False,OR,NV,-9.009391,,0,14.0,5.0,False,856.0,,763.0,11.703410,11.850033,,,0.1,0.0
1631323,2,4,19,2239,720,AA,19805,276,14771,12478.0,False,CA,NY,-8.042209,-10.015618,0,24.0,30.0,False,2231.0,285.0,2586.0,33.962022,15.471513,25.784110,24.902274,0.1,0.0
1631324,5,6,10,734,751,UA,19977,5279,11721,13930.0,False,MI,IL,-1.045646,11.909120,0,40.0,9.0,False,733.0,41.0,223.0,12.915957,25.351905,11.846814,14.315929,0.0,0.0
1631325,3,3,30,1250,1345,AS,19930,421,14747,14057.0,False,WA,OR,-5.099172,-1.992090,0,18.0,11.0,False,1245.0,29.0,129.0,13.731863,13.932608,16.616156,18.730498,0.0,0.1


In [16]:
# Print each column's dtype and non-null count to see which columns have gaps.
delays.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1581877 entries, 0 to 1631326
Data columns (total 28 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Weekday                    1581877 non-null  int64  
 1   Month_of_Year              1581877 non-null  int64  
 2   Day_of_Month               1581877 non-null  int64  
 3   Scheduled_Departure_Time   1581877 non-null  int64  
 4   Scheduled_Arrival_Time     1581877 non-null  int64  
 5   Marketing_Airline          1423267 non-null  object 
 6   Marketing_Airline_DOT_ID   1581877 non-null  int64  
 7   Flight_Number              1581877 non-null  int64  
 8   Origin_Airport_ID          1581877 non-null  int64  
 9   Destination_Airport_ID     1423761 non-null  float64
 10  Flight_Cancelled           1581877 non-null  bool   
 11  Departure_State            1581877 non-null  object 
 12  Arrival_State              1581877 non-null  object 
 13  Departure_Delay  

In [46]:
# Narrow the frame to the two airline identifiers (carrier code and DOT ID).
# Despite the variable's name, nothing is grouped at this point.
grouped_delays = delays.loc[:, ['Marketing_Airline', 'Marketing_Airline_DOT_ID']]
grouped_delays

Unnamed: 0,Marketing_Airline,Marketing_Airline_DOT_ID
0,B6,20409
1,,19805
2,DL,19790
3,AA,19805
4,,19930
...,...,...
1631322,AS,19930
1631323,AA,19805
1631324,UA,19977
1631325,AS,19930


In [48]:
# Number of flights per DOT airline ID, most frequent first.
value_counts_kolumna1 = (
    delays
    .loc[:, 'Marketing_Airline_DOT_ID']
    .value_counts()
)
value_counts_kolumna1

Marketing_Airline_DOT_ID
19805    402837
19790    330943
19977    286896
19393    285979
19930     87166
20409     59820
20416     49783
20436     33553
20368     28214
19690     16686
Name: count, dtype: int64

In [None]:
# Group the two airline-identifier columns by (airline code, DOT ID).
# The GroupBy is built from `delays` instead of from `grouped_delays` itself:
# rebinding a name to its own GroupBy makes the cell fail when it is run a
# second time, because a GroupBy object has no .groupby() method.
grouped_delays = (
    delays[['Marketing_Airline', 'Marketing_Airline_DOT_ID']]
    .groupby(['Marketing_Airline', 'Marketing_Airline_DOT_ID'])
)


In [43]:
# For every (airline code, DOT ID) group, count the rows carrying an airline
# code. Note that this rebinds value_counts_kolumna1.
value_counts_kolumna1 = (
    grouped_delays['Marketing_Airline']
    .value_counts()
)
value_counts_kolumna1

Marketing_Airline  Marketing_Airline_DOT_ID
AA                 19805                       362622
AS                 19930                        78497
B6                 20409                        53971
DL                 19790                       297585
F9                 20436                        30322
G4                 20368                        25360
HA                 19690                        14967
NK                 20416                        44663
UA                 19977                       257921
WN                 19393                       257359
Name: count, dtype: int64

In [45]:
# Count flights per DOT airline ID over all rows of `delays`.
# This is computed from `delays` rather than through `grouped_delays`: an
# earlier cell rebinds `grouped_delays` to a GroupBy keyed on the airline code,
# and groupby drops rows whose key is missing, so going through it would
# leave out every flight with no Marketing_Airline value. Reading from
# `delays` gives the per-ID totals regardless of cell execution order.
value_counts_kolumna2 = delays['Marketing_Airline_DOT_ID'].value_counts()
value_counts_kolumna2


Marketing_Airline_DOT_ID
19805    402837
19790    330943
19977    286896
19393    285979
19930     87166
20409     59820
20416     49783
20436     33553
20368     28214
19690     16686
Name: count, dtype: int64