<a href="https://colab.research.google.com/github/pdrvieira/FlighDelaysProject/blob/main/On_Time_Flight_Arrivals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict flight delays by creating a Machine Learning Model  



In [None]:
# importing libraries
%matplotlib inline
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'darkgrid' style and 'Accent' colour palette to the plots drawn below.
sns.set_style('darkgrid')
sns.set_palette('Accent')

In [None]:
# importing a dataset
# Downloads FlightData.csv with curl and writes it to flightdata.csv in the working directory.
# NOTE(review): the saved output of this cell shows curl failing to resolve the host,
# so confirm the URL is still reachable before relying on this cell.
!curl https://topcs.blob.core.windows.net/public/FlightData.csv -o flightdata.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0curl: (6) Could not resolve host: topcs.blob.core.windows.net


In [None]:
# loading the dataset
# Reads flightdata.csv (the file written by the curl cell) into a DataFrame and previews the first rows.
df = pd.read_csv('flightdata.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'flightdata.csv'

> ## **Column Description**

<ul>
<li><b>YEAR</b> - Year that the flight took place</li>
<li><b>QUARTER</b>	- Quarter that the flight took place (1-4)</li>
<li><b>MONTH</b> - Month that the flight took place (1-12)</li>
<li><b>DAY_OF_MONTH</b> - Day of the month that the flight took place (1-31)</li>
<li><b>DAY_OF_WEEK</b>	- Day of the week that the flight took place (1=Monday, 2=Tuesday, etc.)</li>
<li><b>UNIQUE_CARRIER</b> - Airline carrier code (e.g., DL)</li>
<li><b>TAIL_NUM</b> - Aircraft tail number</li>
<li><b>FL_NUM</b> - Flight number</li>
<li><b>ORIGIN_AIRPORT_ID</b> - ID of the airport of origin</li>
<li><b>ORIGIN</b> - Origin airport code (ATL, DFW, SEA, etc.)</li>
<li><b>DEST_AIRPORT_ID</b> - ID of the destination airport</li>
<li><b>DEST</b> - Destination airport code (ATL, DFW, SEA, etc.)</li>
<li><b>CRS_DEP_TIME</b> - Scheduled departure time</li>
<li><b>DEP_TIME</b> - Actual departure time</li>
<li><b>DEP_DELAY</b> - Number of minutes departure was delayed</li>
<li><b>DEP_DEL15</b> - 0=Departure delayed less than 15 minutes, 1=Departure delayed 15 minutes or more</li>
<li><b>CRS_ARR_TIME</b> - Scheduled arrival time</li>
<li><b>ARR_TIME</b> - Actual arrival time</li>
<li><b>ARR_DELAY</b> - Number of minutes flight arrived late</li>
<li><b>ARR_DEL15</b> -  0=Arrived less than 15 minutes late, 1=Arrived 15 minutes or more late</li>
<li><b>CANCELLED</b> - 0=Flight was not cancelled, 1=Flight was cancelled</li>
<li><b>DIVERTED</b> - 0=Flight was not diverted, 1=Flight was diverted</li>
<li><b>CRS_ELAPSED_TIME</b> - Scheduled flight time in minutes</li>
<li><b>ACTUAL_ELAPSED_TIME</b> - Actual flight time in minutes</li>
<li><b>DISTANCE</b> - Distance traveled in miles</li> </ul>

## Clean and prepare data

In [None]:
# Report the number of rows and columns in the DataFrame.
n_rows, n_cols = df.shape
print('Dimensionality of the DataFrame:')
print(f'Linhas:  {n_rows}')
print(f'Colunas:  {n_cols}')

In [None]:
# Summarise each column: name, non-null count and dtype.
df.info()

In [None]:
# checking for missing values
# Evaluates to True if at least one cell anywhere in the DataFrame is null.
df.isnull().values.any()

In [None]:
# number of missing values in each column
# isnull() marks null cells; sum() then counts them column by column.
df.isnull().sum()

In [None]:
# removing column 26 ('Unnamed: 25')
# errors='ignore' keeps this cell re-runnable once the column is already gone, and
# rebinding df (instead of inplace=True) avoids mutating a frame an earlier cell displayed.
df = df.drop(columns=['Unnamed: 25'], errors='ignore')
df.isnull().sum()

In [None]:
# the filtered DataFrame
# Keep only the columns used as model inputs plus the ARR_DEL15 target.
# .copy() makes the subset an independent frame, so the fills and assignments in
# later cells modify it directly instead of a slice of the full table
# (which would raise SettingWithCopyWarning).
df = df[['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'ARR_DEL15']].copy()
df.isnull().sum()

In [None]:
# rows with missing values
# Boolean mask is True for rows with at least one null cell; show the first few such rows.
df[df.isnull().values.any(axis=1)].head()

In [None]:
# NaNs replaced with 1s
# Missing ARR_DEL15 values become 1, i.e. they are counted as "arrived 15 minutes or more late".
# Rebinding df instead of using inplace=True avoids writing into a slice of another frame.
df = df.fillna({'ARR_DEL15': 1})
# NOTE(review): rows 177-184 are presumably where some NaNs were located — confirm against the data.
df.iloc[177:185]

In [None]:
# observe that the CRS_DEP_TIME column contains values from 0 to 2359 representing military times
# (HHMM, e.g. 1430 = 14:30).
df.head()

In [None]:
# the DataFrame with binned departure times
# Floor-dividing the HHMM value by 100 keeps only the hour (e.g. 1430 -> 14).
# This is done for the whole column at once rather than row by row with iterrows(),
# and assign() returns a new frame so nothing is written into a slice.
df = df.assign(CRS_DEP_TIME=df['CRS_DEP_TIME'] // 100)
df.head()

In [None]:
# descriptive statistics
# describe() summarises the numeric columns (count, mean, std, min, quartiles, max).
df.describe()

## Graphical Analysis

In [None]:
# Share of flights per arrival class, followed by a bar chart of the raw counts.
# ARR_DEL15: 0 = arrived less than 15 minutes late, 1 = arrived 15 minutes or more late
arrival_flags = df['ARR_DEL15']
print('Flights Arrival:')
print(arrival_flags.value_counts() / len(arrival_flags))
ax = sns.countplot(x='ARR_DEL15', data=df)
# trailing semicolon suppresses the Text repr that set_title returns
ax.set_title('Flights Arrival');

In [None]:
# correlation of columns
# ORIGIN and DEST hold airport-code strings. pandas >= 2.0 raises on non-numeric
# columns in DataFrame.corr() instead of silently skipping them, so restrict the
# frame to numeric columns first (this also works on older pandas).
corr = df.select_dtypes(include='number').corr()
corr

## Data Preparation

In [None]:
# the DataFrame with indicator columns
# One-hot encode the ORIGIN and DEST airport codes into ORIGIN_<code> / DEST_<code> columns.
df = pd.get_dummies(df, columns=['ORIGIN', 'DEST'])
df.head()

In [None]:
from sklearn.model_selection import train_test_split
SEED = 42
np.random.seed(SEED)  # seed NumPy's global random generator

# Features are every column except the ARR_DEL15 target.
X = df.drop('ARR_DEL15', axis=1)
y = df['ARR_DEL15']
# Hold out 20% of the rows for testing. Passing random_state pins the split itself,
# so it no longer depends on the state of the global generator when this line runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [None]:
# Show the number of rows in each part of the train/test split.
for label, part in (('X_train', X_train), ('X_test', X_test),
                    ('y_train', y_train), ('y_test', y_test)):
    print(f'{label}: {part.shape[0]}')

## Build Machine Learning Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# training the model
# random_state=SEED (defined in the train/test split cell) makes the forest reproducible
# even when this cell is re-run on its own; without it each fit draws fresh randomness.
model = RandomForestClassifier(n_estimators=10, random_state=SEED)
model.fit(X_train, y_train)

## Measure the accuracy

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

# testing the model
y_predict = model.predict(X_test)

# mean accuracy on the held-out test set
accuracy = model.score(X_test, y_test)
print(f'[Accuracy] Random Forest: {accuracy}')

# generating an AUC score
# Column 1 of predict_proba is the probability of class 1 (arrived 15+ minutes late).
# The score is computed once and reused instead of being evaluated twice.
probabilities = model.predict_proba(X_test)
auc = roc_auc_score(y_test, probabilities[:, 1])
print(f'[Roc_Auc]: {auc}')

# generating a confusion matrix (rows = actual class, columns = predicted class)
pd.DataFrame(confusion_matrix(y_test, y_predict),
             index=['neg', 'pos'], columns=['pred_neg', 'pred_pos'])

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Precision and recall are measured on the held-out test set, like the accuracy and
# AUC above. Scoring on the training data the model was fitted on would overstate
# how well it generalises.
test_predictions = model.predict(X_test)

# measuring precision
print(f'[Precision Score]: {precision_score(y_test, test_predictions)}')

# measuring recall
print(f'[Recall]: {recall_score(y_test, test_predictions)}')

## Visualize Output of Model

In [None]:
from sklearn.metrics import roc_curve

# ROC curve for the class-1 probabilities, with the diagonal drawn as a dashed grey reference line.
fpr, tpr, _ = roc_curve(y_test, probabilities[:, 1])
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], color='grey', lw=1, linestyle='--')
ax.set_title('ROC Curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.show()

In [None]:
# function that calls the machine-learning model to compute the likelihood that a flight will be on time

def predict_delay(departure_date_time, origin, destination):
    """Print the model's estimated probability that a flight arrives on time.

    Uses the notebook-level ``model`` (the fitted classifier) and ``pd``.

    Parameters
    ----------
    departure_date_time : str
        Scheduled departure in the form 'DD/MM/YYYY HH:MM:SS'.
    origin : str
        Origin airport code (case-insensitive). Only ATL, DTW, JFK, MSP and SEA
        set an indicator; any other code leaves every ORIGIN_* indicator at 0.
    destination : str
        Destination airport code, handled the same way as ``origin``.

    Returns
    -------
    str or None
        An error message when the date/time cannot be parsed; otherwise None,
        after printing the on-time probability as a percentage.
    """
    from datetime import datetime

    try:
        departure = datetime.strptime(departure_date_time, '%d/%m/%Y %H:%M:%S')
    except ValueError as e:
        return 'Error parsing date/time - {}'.format(e)

    origin = origin.upper()
    destination = destination.upper()
    airports = ('ATL', 'DTW', 'JFK', 'MSP', 'SEA')

    # Keys must match the training column names exactly. The day column is
    # 'DAY_OF_MONTH' (not 'DAY'); scikit-learn checks feature names at predict time.
    features = {'MONTH': departure.month,
                'DAY_OF_MONTH': departure.day,
                'DAY_OF_WEEK': departure.isoweekday(),  # 1=Monday, same coding as the dataset
                'CRS_DEP_TIME': departure.hour}
    features.update({'ORIGIN_' + code: int(origin == code) for code in airports})
    features.update({'DEST_' + code: int(destination == code) for code in airports})
    frame = pd.DataFrame([features])

    # Put the columns in the order the model was fitted with; indicator columns
    # the model knows but that are not listed above are filled with 0.
    trained_columns = getattr(model, 'feature_names_in_', None)
    if trained_columns is not None:
        frame = frame.reindex(columns=trained_columns, fill_value=0)

    # Column 0 of predict_proba is class 0: arrived less than 15 minutes late.
    on_time = model.predict_proba(frame)[0][0]
    print('Probability of On-Time Arrival: {}%'.format(on_time * 100))

In [None]:
# creating simulations
# Dates are parsed as day/month/year, so this is 1 October 2018 at 21:45, JFK -> ATL.

predict_delay('1/10/2018 21:45:00', 'JFK', 'ATL')

In [None]:
# Same route and time one day later: 2 October 2018 (day/month/year) at 21:45, JFK -> ATL.
predict_delay('2/10/2018 21:45:00', 'JFK', 'ATL')

In [None]:
# 2 October 2018 (day/month/year) at 10:00, ATL -> SEA.
predict_delay('2/10/2018 10:00:00', 'ATL', 'SEA')