# 1. Bibliotecas

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest
from sklearn import preprocessing 

# 2. Dataset

In [3]:
data = pd.read_csv('airlines_delay.csv')

In [4]:
data.head()

Unnamed: 0,Flight,Time,Length,Airline,AirportFrom,AirportTo,DayOfWeek,Class
0,2313.0,1296.0,141.0,DL,ATL,HOU,1,0
1,6948.0,360.0,146.0,OO,COS,ORD,4,0
2,1247.0,1170.0,143.0,B6,BOS,CLT,3,0
3,31.0,1410.0,344.0,US,OGG,PHX,6,0
4,563.0,692.0,98.0,FL,BMI,ATL,4,0


In [5]:
data_backup = data # O data_backup vai ser utilizado para trabalhar os gráficos

# 3. Feature Engineering 

In [6]:
labels = data['Class']
data = data.drop('Class', axis=1)
data = data.drop('Flight', axis=1) 

In [7]:
# Transformando os dados categóricos em numéricos

print("Airline: ", data['Airline'].unique())
print("Aiport from: ", data['AirportFrom'].unique())
print("Aiport to: ", data['AirportTo'].unique())

Airline:  ['DL' 'OO' 'B6' 'US' 'FL' 'WN' 'CO' 'AA' 'YV' 'EV' 'XE' '9E' 'OH' 'UA'
 'MQ' 'AS' 'F9' 'HA']
Aiport from:  ['ATL' 'COS' 'BOS' 'OGG' 'BMI' 'MSY' 'EWR' 'DFW' 'BWI' 'CRW' 'LGB' 'BIS'
 'CLT' 'IAH' 'LAX' 'JAX' 'SAV' 'CLE' 'FLL' 'SAN' 'BHM' 'ROC' 'DTW' 'STT'
 'AUS' 'DCA' 'PHX' 'EYW' 'IND' 'JFK' 'ORD' 'PBI' 'SFO' 'MIA' 'DSM' 'SLC'
 'PHL' 'BZN' 'GRB' 'MBS' 'SBA' 'TYS' 'MSP' 'DEN' 'SAT' 'BUF' 'RIC' 'SEA'
 'PDX' 'LAS' 'IAD' 'HNL' 'BDL' 'MOT' 'PSE' 'CPR' 'SNA' 'STL' 'CVG' 'PIT'
 'HSV' 'SGF' 'RDU' 'MEM' 'KOA' 'ELP' 'SJU' 'JAN' 'AEX' 'LGA' 'RSW' 'MDT'
 'GUC' 'MKE' 'CAE' 'GRR' 'FAR' 'LIT' 'OMA' 'BNA' 'EVV' 'RDD' 'OKC' 'ITO'
 'SJC' 'MCO' 'LBB' 'CSG' 'OAK' 'PHF' 'ABQ' 'SMF' 'FAY' 'ABI' 'MSO' 'MFE'
 'GEG' 'MSN' 'TPA' 'DAY' 'RNO' 'PVD' 'ALB' 'CHO' 'ONT' 'LIH' 'PSP' 'LAN'
 'LEX' 'XNA' 'GJT' 'CMH' 'GSO' 'PSC' 'SYR' 'AVL' 'MHT' 'GRK' 'MCI' 'TXK'
 'LRD' 'ABE' 'LWB' 'ERI' 'DAL' 'ANC' 'TUS' 'ROA' 'MOD' 'JNU' 'SBP' 'CDV'
 'TUL' 'FSD' 'FNT' 'BTV' 'FCA' 'GNV' 'RAP' 'MDW' 'FWA' 'BUR' 'PNS' 'RST'
 'HOU' 

In [8]:
# transformando os valores:

label_encoder = preprocessing.LabelEncoder()

data['Airline'] = label_encoder.fit_transform(data['Airline'])
data['AirportTo'] = label_encoder.fit_transform(data['AirportTo'])
data['AirportFrom'] = label_encoder.fit_transform(data['AirportFrom'])

In [9]:
data.head()

Unnamed: 0,Time,Length,Airline,AirportFrom,AirportTo,DayOfWeek
0,1296.0,141.0,5,16,129,1
1,360.0,146.0,12,65,208,4
2,1170.0,143.0,3,35,60,3
3,1410.0,344.0,14,203,217,6
4,692.0,98.0,8,32,16,4


In [10]:
data.columns

Index(['Time', 'Length', 'Airline', 'AirportFrom', 'AirportTo', 'DayOfWeek'], dtype='object')

In [11]:
# Qual feature é mais importante para o estudo de ML? 

features_list = ('Time', 'Length', 'Airline', 'AirportFrom', 'AirportTo', 'DayOfWeek')
features = (data)

k_best_features = SelectKBest(k='all')
k_best_features.fit_transform(features, labels)
k_best_features_scores = k_best_features.scores_
raw_pairs = zip(features_list[1:], k_best_features_scores)
ordered_pairs = list(reversed(sorted(raw_pairs, key=lambda x: x[1])))

k_best_features_final = dict(ordered_pairs[:15])
best_features = k_best_features_final.keys()
print ('')
print ("Melhores features:")
print (k_best_features_final)


Melhores features:
{'Length': 12492.88502628086, 'AirportFrom': 2427.518370673628, 'DayOfWeek': 1244.9779779098708, 'Airline': 885.6581756790663, 'AirportTo': 183.81975943262412}


# 4. ML - Aplicação 

In [12]:
X = data
y = labels

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [14]:
clf = DecisionTreeClassifier()

In [15]:
clf = clf.fit(X_train, y_train)

In [16]:
y_pred = clf.predict(X_test)

In [17]:
# Verificando os resultados 

print("Acurácia: ", metrics.accuracy_score(y_test, y_pred))
print("ROC/AUC: ", metrics.roc_auc_score(y_test, y_pred))
print("Precision: ", metrics.precision_score(y_test, y_pred))
print("Recall: ", metrics.recall_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred))

Acurácia:  0.6100855915706208
ROC/AUC:  0.5975126954677742
Precision:  0.5758224612422401
Recall:  0.48028520005537867
F1-Score:  0.5237326006219994


Os resultados poderiam ser melhores, acredito com um estudo de feature engineering possa melhorar esse valores. 