In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

In [3]:
#  Import and read the flights_data.csv.
flights_df = pd.read_csv("flights_data_df.csv")
flights_df.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Airline,Origin,OriginCityName,Dest,DestCityName,...,Cancelled,CancellationCode,CRSElapsedTime,ActualElapsedTime,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime
0,0,2021,1,2,6,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,114.0,,,,,,
1,1,2021,1,3,7,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,117.0,,,,,,
2,2,2021,1,4,1,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,121.0,,,,,,
3,3,2021,1,7,4,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,117.0,121.0,,,,,,
4,4,2021,1,8,5,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,117.0,135.0,,,,,,


### Data Preprocessing

In [4]:
# Creating 'Is_Delayed' column that will tell us if a flight is delayed or not. 
flights_df["Is_Delayed"] = None
flights_df.loc[flights_df["DepDelayMinutes"] == 0, "Is_Delayed"] = "1"
flights_df.loc[flights_df["DepDelayMinutes"] > 0, "Is_Delayed"] = "0"


In [5]:
flights_df.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Airline,Origin,OriginCityName,Dest,DestCityName,...,CancellationCode,CRSElapsedTime,ActualElapsedTime,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime,Is_Delayed
0,0,2021,1,2,6,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,129.0,114.0,,,,,,,0
1,1,2021,1,3,7,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,129.0,117.0,,,,,,,1
2,2,2021,1,4,1,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,129.0,121.0,,,,,,,0
3,3,2021,1,7,4,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,117.0,121.0,,,,,,,1
4,4,2021,1,8,5,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,117.0,135.0,,,,,,,1


In [16]:
# Converting the categorical variables into indicator variables. 
flights = pd.get_dummies(flights_df, columns=['Airline'])

In [17]:
flights.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Origin,OriginCityName,Dest,DestCityName,CRSDepTime,...,Airline_Alaska Airlines Inc.,Airline_Allegiant Air,Airline_American Airlines Inc.,Airline_Delta Air Lines Inc.,Airline_Frontier Airlines Inc.,Airline_Hawaiian Airlines Inc.,Airline_JetBlue Airways,Airline_Southwest Airlines Co.,Airline_Spirit Air Lines,Airline_United Air Lines Inc.
0,0,2021,1,2,6,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
1,1,2021,1,3,7,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
2,2,2021,1,4,1,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
3,3,2021,1,7,4,CVG,"Cincinnati, OH",BOS,"Boston, MA",1715,...,0,0,0,1,0,0,0,0,0,0
4,4,2021,1,8,5,CVG,"Cincinnati, OH",BOS,"Boston, MA",1715,...,0,0,0,1,0,0,0,0,0,0


In [23]:
# cutting out a segment for the model
flights_data_segment = flights[0:100000]

In [27]:
flights_data_segment.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Origin,OriginCityName,Dest,DestCityName,CRSDepTime,...,Airline_Alaska Airlines Inc.,Airline_Allegiant Air,Airline_American Airlines Inc.,Airline_Delta Air Lines Inc.,Airline_Frontier Airlines Inc.,Airline_Hawaiian Airlines Inc.,Airline_JetBlue Airways,Airline_Southwest Airlines Co.,Airline_Spirit Air Lines,Airline_United Air Lines Inc.
0,0,2021,1,2,6,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
1,1,2021,1,3,7,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
2,2,2021,1,4,1,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
3,3,2021,1,7,4,CVG,"Cincinnati, OH",BOS,"Boston, MA",1715,...,0,0,0,1,0,0,0,0,0,0
4,4,2021,1,8,5,CVG,"Cincinnati, OH",BOS,"Boston, MA",1715,...,0,0,0,1,0,0,0,0,0,0


In [32]:
#Drop unnecessary columns
flights_df_segment = flights_data_segment.drop(['Origin', 'OriginCityName', 'DestCityName', 'Dest', 'Unnamed: 0', 'Cancelled', 'CarrierDelay', 'WeatherDelay', "NASDelay", 'SecurityDelay', 'LateAircraftDelay', 'FirstDepTime', 'CancellationCode'], axis=1)

In [37]:
# replacing all NaN values with the mean of the attribute in which they are present
flights_df_segment=flights_df_segment.fillna(flights_df_segment.mean())

  


### Splitting Data

In [38]:
# Features and target arrays
y = flights_df_segment.Is_Delayed
X = flights_df_segment.drop(columns=["Is_Delayed"])

In [39]:
# Splitting data for training and testing (splitting in the ratio 70:30)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42) 

In [40]:
scaled_features = StandardScaler().fit_transform(X_train, X_test)


### Applying Decision Tree Classifier on Training Data

In [41]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)

TypeError: '<' not supported between instances of 'str' and 'NoneType'

### Making Predictions and Checking Accuracy

In [None]:
pred_prob = clf.predict_proba(X_test)
auc_score = roc_auc_score(y_test, pred_prob[:,1])
auc_score