# Delays prediction - machine learning models (regression)

Katarzyna Mocio 429956 
Marcin Miszkiel 432418

# 1. Prepare necessary libraries


In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
pd.set_option("display.max_columns",100)

# Data wrangling
import numpy as np
from datetime import datetime as dt 

# Visualizations
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

# Modelling with scikit-learn
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, r2_score, make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
import optuna
import sklearn



Delays for regression
Your task is to apply various ML algorithms (see the rules below) to build a model explaining the delays of the flights based on the training sample and generate predictions for all observations from the test sample.

The dataset includes 1631327 observations in the training sample and 407832 in the test sample and the following columns:

Weekday – Day of the week when the flight occurred (1 for Sunday, 7 for Saturday).
Month_of_Year – The numerical month (1-12) when the flight took place.
Day_of_Month – The day of the month (1-31) when the flight occurred.
Scheduled_Departure_Time – The scheduled local time of flight departure.
Scheduled_Arrival_Time – The scheduled local time of flight arrival.
Marketing_Airline – The airline code under which the flight was marketed.
Marketing_Airline_DOT_ID – Department of Transportation identifier for the marketing airline.
Flight_Number – The number assigned to the flight by the operating airline.
Origin_Airport_ID – Unique identifier for the departure airport.
Destination_Airport_ID – Unique identifier for the arrival airport.
Flight_Cancelled – Indicator of whether the flight was cancelled (1 = Yes, 0 = No).
Departure_State – The state code of the departure location.
Arrival_State – The state code of the arrival location.
Departure_Delay – Total delay in minutes at departure.
Diverted_Airport_Landings – Count of unplanned landings at other airports.
Taxi_Out_Time – Time in minutes from gate departure until takeoff.
Taxi_In_Time – Time in minutes from landing to gate arrival.
Flight_Diverted – Indicator of whether the flight was diverted (1 = Yes, 0 = No).
Actual_Departure_Time – The actual local time the flight departed.
Flight_Duration – The duration of the flight in minutes from takeoff to landing.
Flight_Distance – The total distance covered by the flight in miles.
Origin_Temperature – The temperature at the origin airport at the time of the flight’s departure.
Destination_Temperature – The temperature at the destination airport at the time of the flight’s arrival.
Origin_Wind_Speed – The wind speed at the origin airport during the departure of the flight.
Destination_Wind_Speed – The wind speed at the destination airport during the departure of the flight.
Origin_Precipitation – The amount of precipitation, such as rain or snow, at the origin airport around the flight’s departure time.
Destination_Precipitation – The amount of precipitation, such as rain or snow, at the destination airport around the flight’s arrival time.
Arrival_Delay – Total delay in minutes at arrival (outcome variable, only in the training sample)

Let's load the data and look at the first five observations

In [18]:
delays = pd.read_csv('delays_train.csv')

delays.head()

Unnamed: 0,Weekday,Month_of_Year,Day_of_Month,Scheduled_Departure_Time,Scheduled_Arrival_Time,Marketing_Airline,Marketing_Airline_DOT_ID,Flight_Number,Origin_Airport_ID,Destination_Airport_ID,Flight_Cancelled,Departure_State,Arrival_State,Departure_Delay,Arrival_Delay,Diverted_Airport_Landings,Taxi_Out_Time,Taxi_In_Time,Flight_Diverted,Actual_Departure_Time,Flight_Duration,Flight_Distance,Origin_Temperature,Destination_Temperature,Origin_Wind_Speed,Destination_Wind_Speed,Origin_Precipitation,Destination_Precipitation
0,6,6,25,1222,1444,B6,20409,520,10397,12478.0,False,GA,NY,,,0,16.0,10.0,False,1224.0,107.0,760.0,25.14022,19.357739,21.019808,15.452723,0.0,0.0
1,2,6,21,1216,1304,,19805,6297,14107,15376.0,False,AZ,AZ,0.088687,-4.178483,0,16.0,5.0,False,1216.0,23.0,110.0,13.279939,20.47069,18.045064,12.910265,0.0,0.0
2,3,1,5,1945,2055,DL,19790,4124,13487,13076.0,False,MN,WI,,5.042185,0,34.0,5.0,False,1945.0,36.0,119.0,17.816202,13.967273,21.606228,17.976362,0.0,0.1
3,2,3,22,700,924,AA,19805,1538,15624,11298.0,False,FL,TX,-1.802698,-0.206932,0,10.0,11.0,False,658.0,125.0,641.0,24.562566,14.509228,24.946489,22.630553,0.0,0.0
4,4,7,14,2130,2359,,19930,1116,14747,12889.0,False,WA,NV,,14.006092,0,23.0,7.0,False,2133.0,130.0,867.0,8.817992,10.866812,17.426336,17.401007,0.0,0.0


In [3]:
delays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1631327 entries, 0 to 1631326
Data columns (total 28 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Weekday                    1631327 non-null  int64  
 1   Month_of_Year              1631327 non-null  int64  
 2   Day_of_Month               1631327 non-null  int64  
 3   Scheduled_Departure_Time   1631327 non-null  int64  
 4   Scheduled_Arrival_Time     1631327 non-null  int64  
 5   Marketing_Airline          1467776 non-null  object 
 6   Marketing_Airline_DOT_ID   1631327 non-null  int64  
 7   Flight_Number              1631327 non-null  int64  
 8   Origin_Airport_ID          1631327 non-null  int64  
 9   Destination_Airport_ID     1468233 non-null  float64
 10  Flight_Cancelled           1631327 non-null  bool   
 11  Departure_State            1631327 non-null  object 
 12  Arrival_State              1631327 non-null  object 
 13  Departure_De

In [4]:
an = pd.read_csv('airline_name.csv', index_col=False)
an = an.set_index('id')['name'].to_dict()
an

{19805: 'AA',
 19930: 'AS',
 20409: 'B6',
 19790: 'DL',
 20436: 'F9',
 20368: 'G4',
 19690: 'HA',
 20416: 'NK',
 19977: 'UA',
 19393: 'WN'}

In [5]:
delays['Marketing_Airline'] = delays['Marketing_Airline_DOT_ID'].map(an)

In [6]:
delays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1631327 entries, 0 to 1631326
Data columns (total 28 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Weekday                    1631327 non-null  int64  
 1   Month_of_Year              1631327 non-null  int64  
 2   Day_of_Month               1631327 non-null  int64  
 3   Scheduled_Departure_Time   1631327 non-null  int64  
 4   Scheduled_Arrival_Time     1631327 non-null  int64  
 5   Marketing_Airline          1631327 non-null  object 
 6   Marketing_Airline_DOT_ID   1631327 non-null  int64  
 7   Flight_Number              1631327 non-null  int64  
 8   Origin_Airport_ID          1631327 non-null  int64  
 9   Destination_Airport_ID     1468233 non-null  float64
 10  Flight_Cancelled           1631327 non-null  bool   
 11  Departure_State            1631327 non-null  object 
 12  Arrival_State              1631327 non-null  object 
 13  Departure_De

In [7]:
delays['Scheduled_Departure_Time'] = delays['Scheduled_Departure_Time'].astype(str).str.zfill(4)
delays['Actual_Departure_Time'] = delays['Actual_Departure_Time'].astype(int).astype(str).str.zfill(4)

delays['sh_h'] = delays['Scheduled_Departure_Time'].str.slice(start=0, stop=2)
delays['sh_min'] = delays['Scheduled_Departure_Time'].str.slice(start=2, stop=4)
delays['loc_h'] = delays['Actual_Departure_Time'].str.slice(start=0, stop=2)
delays['loc_min'] = delays['Actual_Departure_Time'].str.slice(start=2, stop=4)

delays

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
delays[['sh_h', 'sh_min', 'loc_h', 'loc_min']] = delays[['sh_h', 'sh_min', 'loc_h', 'loc_min']].astype(int)

delays['calculated_delay'] = (delays['loc_h'] - delays['sh_h']) * 60 + (delays['loc_min'] - delays['sh_min'])

delays['Departure_Delay'] = delays['Departure_Delay'].fillna(delays['calculated_delay'])

delays

In [None]:
states = pd.read_csv('timechange.csv', index_col=False)
states

In [None]:
delays = pd.merge(delays, states, how='left', left_on='Departure_State', right_on='Stan')
delays.rename(columns={'Strefa': 'StrefaD'}, inplace=True)
delays.drop(columns=['Stan'], inplace=True)

delays = pd.merge(delays, states, how='left', left_on='Arrival_State', right_on='Stan')
delays.rename(columns={'Strefa': 'StrefaA'}, inplace=True)
delays.drop(columns=['Stan'], inplace=True)

delays

In [None]:
delays['Scheduled_Arrival_Time'] = delays['Scheduled_Arrival_Time'].astype(str).str.zfill(4)

delays['arr_h'] = delays['Scheduled_Arrival_Time'].str.slice(start=0, stop=2)
delays['arr_min'] = delays['Scheduled_Arrival_Time'].str.slice(start=2, stop=4)

delays

In [None]:
delays.info()

In [None]:
delays2 = delays[delays['Destination_Airport_ID'].isnull()]
delays2.info()

In [None]:
delays3 = delays[delays['Taxi_In_Time'].isnull()]
delays3.info()

In [8]:
grouped_df = delays.groupby(['Marketing_Airline', 'Flight_Number', 'Origin_Airport_ID', 'Destination_Airport_ID'])

In [9]:
value_counts = grouped_df['Marketing_Airline'].value_counts()
value_counts

Marketing_Airline  Flight_Number  Origin_Airport_ID  Destination_Airport_ID
AA                 1              12478              12892.0                   77
                   2              12892              12478.0                   76
                   3              12478              12892.0                   83
                   4              12892              12478.0                   74
                   5              11298              12173.0                    8
                                                                               ..
WN                 6930           12889              13495.0                    2
                   6931           13495              12889.0                    2
                   6948           10821              10529.0                    1
                   6951           13204              10397.0                    1
                   6985           13204              10821.0                    1
Name: count, Length: 1

In [10]:
df = value_counts.reset_index()
df.columns = ['Marketing_Airline', 'Flight_Number', 'Origin_Airport_ID', 'Destination_Airport_ID', 'Count']

# Zapis do pliku Excel
df.to_excel('flights_count.xlsx', index=False)


In [11]:
airportD = pd.read_excel('flights_count.xlsx', index_col=False)
airportD

Unnamed: 0,Marketing_Airline,Flight_Number,Origin_Airport_ID,Destination_Airport_ID,Count
0,AA,1,12478,12892,77
1,AA,2,12892,12478,76
2,AA,3,12478,12892,83
3,AA,4,12892,12478,74
4,AA,5,11298,12173,8
...,...,...,...,...,...
105490,WN,6930,12889,13495,2
105491,WN,6931,13495,12889,2
105492,WN,6948,10821,10529,1
105493,WN,6951,13204,10397,1


In [12]:
delays['key'] = delays['Marketing_Airline'] + '_' + delays['Flight_Number'].astype(str) + '_' + delays['Origin_Airport_ID'].astype(str)
airportD['key'] = airportD['Marketing_Airline'] + '_' + airportD['Flight_Number'].astype(str) + '_' + airportD['Origin_Airport_ID'].astype(str)

In [13]:
airportD.to_excel('flights_count.xlsx', index=False)

In [14]:
# Stworzenie słownika mapowania
map_dict = airportD.set_index('key')['Destination_Airport_ID'].to_dict()

# Uzupełnianie brakujących wartości w 'delays'
delays['Destination_Airport_ID'] = delays['Destination_Airport_ID'].fillna(delays['key'].map(map_dict))
delays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1631327 entries, 0 to 1631326
Data columns (total 29 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Weekday                    1631327 non-null  int64  
 1   Month_of_Year              1631327 non-null  int64  
 2   Day_of_Month               1631327 non-null  int64  
 3   Scheduled_Departure_Time   1631327 non-null  object 
 4   Scheduled_Arrival_Time     1631327 non-null  int64  
 5   Marketing_Airline          1631327 non-null  object 
 6   Marketing_Airline_DOT_ID   1631327 non-null  int64  
 7   Flight_Number              1631327 non-null  int64  
 8   Origin_Airport_ID          1631327 non-null  int64  
 9   Destination_Airport_ID     1629511 non-null  float64
 10  Flight_Cancelled           1631327 non-null  bool   
 11  Departure_State            1631327 non-null  object 
 12  Arrival_State              1631327 non-null  object 
 13  Departure_De

In [16]:
delays = delays[delays['Flight_Cancelled'] == False]
delays.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1581877 entries, 0 to 1631326
Data columns (total 29 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Weekday                    1581877 non-null  int64  
 1   Month_of_Year              1581877 non-null  int64  
 2   Day_of_Month               1581877 non-null  int64  
 3   Scheduled_Departure_Time   1581877 non-null  object 
 4   Scheduled_Arrival_Time     1581877 non-null  int64  
 5   Marketing_Airline          1581877 non-null  object 
 6   Marketing_Airline_DOT_ID   1581877 non-null  int64  
 7   Flight_Number              1581877 non-null  int64  
 8   Origin_Airport_ID          1581877 non-null  int64  
 9   Destination_Airport_ID     1580202 non-null  float64
 10  Flight_Cancelled           1581877 non-null  bool   
 11  Departure_State            1581877 non-null  object 
 12  Arrival_State              1581877 non-null  object 
 13  Departure_Delay  

In [19]:
cancel = delays[delays['Flight_Cancelled'] == True]
cancel.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49450 entries, 30 to 1631308
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Weekday                    49450 non-null  int64  
 1   Month_of_Year              49450 non-null  int64  
 2   Day_of_Month               49450 non-null  int64  
 3   Scheduled_Departure_Time   49450 non-null  int64  
 4   Scheduled_Arrival_Time     49450 non-null  int64  
 5   Marketing_Airline          44509 non-null  object 
 6   Marketing_Airline_DOT_ID   49450 non-null  int64  
 7   Flight_Number              49450 non-null  int64  
 8   Origin_Airport_ID          49450 non-null  int64  
 9   Destination_Airport_ID     44472 non-null  float64
 10  Flight_Cancelled           49450 non-null  bool   
 11  Departure_State            49450 non-null  object 
 12  Arrival_State              49450 non-null  object 
 13  Departure_Delay            983 non-null    float

In [20]:
cancel2 = cancel[cancel['Destination_Airport_ID'].isnull()]
cancel2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4978 entries, 284 to 1631073
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Weekday                    4978 non-null   int64  
 1   Month_of_Year              4978 non-null   int64  
 2   Day_of_Month               4978 non-null   int64  
 3   Scheduled_Departure_Time   4978 non-null   int64  
 4   Scheduled_Arrival_Time     4978 non-null   int64  
 5   Marketing_Airline          4466 non-null   object 
 6   Marketing_Airline_DOT_ID   4978 non-null   int64  
 7   Flight_Number              4978 non-null   int64  
 8   Origin_Airport_ID          4978 non-null   int64  
 9   Destination_Airport_ID     0 non-null      float64
 10  Flight_Cancelled           4978 non-null   bool   
 11  Departure_State            4978 non-null   object 
 12  Arrival_State              4978 non-null   object 
 13  Departure_Delay            101 non-null    float

In [None]:
delays2