In [12]:
# importing the required libraries
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
# reading the .csv file
flights = pd.read_csv('flights.csv', low_memory = False)

In [5]:
# sample of data having 100000 rows using Pandas Library
sample = flights.iloc[:100000,:]

In [6]:
# getting the info of the sample data
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 31 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   YEAR                 100000 non-null  int64  
 1   MONTH                100000 non-null  int64  
 2   DAY                  100000 non-null  int64  
 3   DAY_OF_WEEK          100000 non-null  int64  
 4   AIRLINE              100000 non-null  object 
 5   FLIGHT_NUMBER        100000 non-null  int64  
 6   TAIL_NUMBER          99833 non-null   object 
 7   ORIGIN_AIRPORT       100000 non-null  object 
 8   DESTINATION_AIRPORT  100000 non-null  object 
 9   SCHEDULED_DEPARTURE  100000 non-null  int64  
 10  DEPARTURE_TIME       97702 non-null   float64
 11  DEPARTURE_DELAY      97702 non-null   float64
 12  TAXI_OUT             97629 non-null   float64
 13  WHEELS_OFF           97629 non-null   float64
 14  SCHEDULED_TIME       100000 non-null  float64
 15  ELAPSED_TIME      

In [7]:
# checking the number of 'DIVERTED' flights in the sample
sample['DIVERTED'].value_counts()

0    99776
1      224
Name: DIVERTED, dtype: int64

In [11]:
# correlation of other features with resepct to 'ARRIVAL_DELAY'
sample.corr()['ARRIVAL_DELAY'][1:].sort_values(ascending = False)

ARRIVAL_DELAY          1.000000
DEPARTURE_DELAY        0.950838
AIRLINE_DELAY          0.592718
LATE_AIRCRAFT_DELAY    0.572956
AIR_SYSTEM_DELAY       0.259700
TAXI_OUT               0.245363
WEATHER_DELAY          0.235906
DEPARTURE_TIME         0.223654
WHEELS_OFF             0.217344
TAXI_IN                0.170073
SCHEDULED_DEPARTURE    0.154951
SCHEDULED_ARRIVAL      0.140565
WHEELS_ON              0.088131
ARRIVAL_TIME           0.076791
DAY                    0.070770
DAY_OF_WEEK            0.067520
FLIGHT_NUMBER          0.056163
ELAPSED_TIME           0.048448
SECURITY_DELAY         0.006070
AIR_TIME              -0.002742
SCHEDULED_TIME        -0.022043
DISTANCE              -0.023821
MONTH                       NaN
DIVERTED                    NaN
CANCELLED                   NaN
Name: ARRIVAL_DELAY, dtype: float64

In [13]:
# checking for null values
sample.isnull().sum().sort_values(ascending = False)

CANCELLATION_REASON    97611
WEATHER_DELAY          65375
AIRLINE_DELAY          65375
SECURITY_DELAY         65375
AIR_SYSTEM_DELAY       65375
LATE_AIRCRAFT_DELAY    65375
ARRIVAL_DELAY           2613
AIR_TIME                2613
ELAPSED_TIME            2613
ARRIVAL_TIME            2440
TAXI_IN                 2440
WHEELS_ON               2440
WHEELS_OFF              2371
TAXI_OUT                2371
DEPARTURE_DELAY         2298
DEPARTURE_TIME          2298
TAIL_NUMBER              167
SCHEDULED_TIME             0
CANCELLED                  0
DIVERTED                   0
DISTANCE                   0
SCHEDULED_ARRIVAL          0
SCHEDULED_DEPARTURE        0
DESTINATION_AIRPORT        0
ORIGIN_AIRPORT             0
FLIGHT_NUMBER              0
AIRLINE                    0
DAY_OF_WEEK                0
DAY                        0
MONTH                      0
YEAR                       0
dtype: int64

In [14]:
# dropping various columns
drp_cols = ['YEAR', 'FLIGHT_NUMBER', 'AIRLINE', 'DISTANCE', 'TAIL_NUMBER', 'TAXI_OUT', 'SCHEDULED_TIME', 
            'DEPARTURE_TIME','WHEELS_OFF', 'ELAPSED_TIME', 'AIR_TIME', 'WHEELS_ON', 'DAY_OF_WEEK', 
            'TAXI_IN', 'ARRIVAL_TIME', 'CANCELLATION_REASON']

sample1 = sample.drop(drp_cols, axis = 1)
sample1.shape

(100000, 15)

In [16]:
sample1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   MONTH                100000 non-null  int64  
 1   DAY                  100000 non-null  int64  
 2   ORIGIN_AIRPORT       100000 non-null  object 
 3   DESTINATION_AIRPORT  100000 non-null  object 
 4   SCHEDULED_DEPARTURE  100000 non-null  int64  
 5   DEPARTURE_DELAY      97702 non-null   float64
 6   SCHEDULED_ARRIVAL    100000 non-null  int64  
 7   ARRIVAL_DELAY        97387 non-null   float64
 8   DIVERTED             100000 non-null  int64  
 9   CANCELLED            100000 non-null  int64  
 10  AIR_SYSTEM_DELAY     34625 non-null   float64
 11  SECURITY_DELAY       34625 non-null   float64
 12  AIRLINE_DELAY        34625 non-null   float64
 13  LATE_AIRCRAFT_DELAY  34625 non-null   float64
 14  WEATHER_DELAY        34625 non-null   float64
dtypes: float64(7), int

In [17]:
# filling the missing values
sample1 = sample1.fillna(sample1.mean())

In [19]:
sample1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   MONTH                100000 non-null  int64  
 1   DAY                  100000 non-null  int64  
 2   ORIGIN_AIRPORT       100000 non-null  object 
 3   DESTINATION_AIRPORT  100000 non-null  object 
 4   SCHEDULED_DEPARTURE  100000 non-null  int64  
 5   DEPARTURE_DELAY      100000 non-null  float64
 6   SCHEDULED_ARRIVAL    100000 non-null  int64  
 7   ARRIVAL_DELAY        100000 non-null  float64
 8   DIVERTED             100000 non-null  int64  
 9   CANCELLED            100000 non-null  int64  
 10  AIR_SYSTEM_DELAY     100000 non-null  float64
 11  SECURITY_DELAY       100000 non-null  float64
 12  AIRLINE_DELAY        100000 non-null  float64
 13  LATE_AIRCRAFT_DELAY  100000 non-null  float64
 14  WEATHER_DELAY        100000 non-null  float64
dtypes: float64(7), int

In [20]:
# creating a new feature 'RESULT'
RESULT = []

for r in sample1['ARRIVAL_DELAY']:
    if r > 15 :
        RESULT.append(1)
    else :
        RESULT.append(0)

In [22]:
sample1['RESULT'] = RESULT

In [23]:
sample1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   MONTH                100000 non-null  int64  
 1   DAY                  100000 non-null  int64  
 2   ORIGIN_AIRPORT       100000 non-null  object 
 3   DESTINATION_AIRPORT  100000 non-null  object 
 4   SCHEDULED_DEPARTURE  100000 non-null  int64  
 5   DEPARTURE_DELAY      100000 non-null  float64
 6   SCHEDULED_ARRIVAL    100000 non-null  int64  
 7   ARRIVAL_DELAY        100000 non-null  float64
 8   DIVERTED             100000 non-null  int64  
 9   CANCELLED            100000 non-null  int64  
 10  AIR_SYSTEM_DELAY     100000 non-null  float64
 11  SECURITY_DELAY       100000 non-null  float64
 12  AIRLINE_DELAY        100000 non-null  float64
 13  LATE_AIRCRAFT_DELAY  100000 non-null  float64
 14  WEATHER_DELAY        100000 non-null  float64
 15  RESULT            

In [24]:
# to check number of flights delayed by more than 15 mins
sample1['RESULT'].value_counts()

0    63779
1    36221
Name: RESULT, dtype: int64

In [26]:
# Splitting the data into X and Y

x = sample1[['MONTH', 'DAY',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY', 'SCHEDULED_ARRIVAL',
        'DIVERTED', 'CANCELLED', 'AIR_SYSTEM_DELAY',
       'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
       'WEATHER_DELAY']].values
y = sample1['RESULT'].values
y = y.astype('int')

In [27]:
# splitting the data into train and test

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

In [30]:
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
scalar.fit(x_train)
x_train = scalar.transform(x_train)
x_test = scalar.transform(x_test)

In [31]:
# Model building

from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion = 'gini')
clf.fit(x_train, y_train)

DecisionTreeClassifier()

In [32]:
predicted = clf.predict(x_test)

In [38]:
# calculating auc and roc score

from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1]))

0.9981684823083186
