In [1]:
import pandas as pd

# Read the raw CSV, then remove the "Unnamed: 0" and "origin_city" columns
# in a single chained call so the cell yields a fresh frame on every run.
data = pd.read_csv('data.csv').drop(columns=["Unnamed: 0", "origin_city"])

In [16]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the
# shuffled split identical between runs.
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    data,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [3]:
# Display the training split (the rendered output is abbreviated for long frames).
train

Unnamed: 0,city,has_delay,time_of_day,avg_temperature_2m,avg_relative_humidity_2m,total_precipitation,avg_cloud_cover
163,Paris,0,afternoon,13.895458,71.208333,0.0,60.083333
427,Berlin,1,afternoon,10.429521,80.000000,4.6,96.541667
720,Rome,1,evening,12.854333,89.125000,0.0,82.208333
266,Budapest,1,afternoon,9.587250,55.583333,0.0,75.375000
148,Paris,1,afternoon,13.895458,71.208333,0.0,60.083333
...,...,...,...,...,...,...,...
1130,Paris,0,afternoon,13.785146,73.583333,0.0,52.375000
1294,Athens,1,afternoon,9.587250,55.583333,0.0,75.375000
860,Barcelona,0,afternoon,14.584229,80.166667,0.0,88.583333
1459,London,1,afternoon,12.276812,75.250000,0.0,91.416667


In [4]:
# Display the held-out test split (the rendered output is abbreviated for long frames).
test

Unnamed: 0,city,has_delay,time_of_day,avg_temperature_2m,avg_relative_humidity_2m,total_precipitation,avg_cloud_cover
1603,Paris,1,afternoon,13.152000,73.333333,0.7,50.875000
482,Paris,0,afternoon,13.895458,71.208333,0.0,60.083333
203,Paris,0,evening,13.895458,71.208333,0.0,60.083333
49,Paris,1,evening,13.895458,71.208333,0.0,60.083333
937,Paris,1,evening,13.785146,73.583333,0.0,52.375000
...,...,...,...,...,...,...,...
226,Budapest,0,morning,9.587250,55.583333,0.0,75.375000
231,Budapest,0,afternoon,9.587250,55.583333,0.0,75.375000
650,London,0,morning,10.070500,73.625000,5.9,72.750000
1511,Paris,0,afternoon,13.785146,73.583333,0.0,52.375000


In [26]:
# Create a SVM that predicts has_delay
#
# This cell also defines `preprocessor` and imports GridSearchCV,
# make_pipeline, classification_report and joblib, all of which the later
# model cells reuse.
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import joblib

# Separate features and target once instead of re-dropping the column for
# every fit / score / predict call.
X_train, y_train = train.drop('has_delay', axis=1), train['has_delay']
X_test, y_test = test.drop('has_delay', axis=1), test['has_delay']

# Preprocess the data: scale the non-object columns, one-hot encode the
# object (string) columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), selector(dtype_exclude="object")),
        # handle_unknown='ignore' encodes a category that was absent from the
        # fitting data as all zeros instead of failing at transform time
        # (possible when a rare value lands only in a validation fold or in
        # the test split).
        ('cat', OneHotEncoder(handle_unknown='ignore'), selector(dtype_include="object"))
    ])

# Grid search for the best hyperparameters.
# gamma is only used by the RBF kernel; it is ignored by the linear kernel.
# Giving each kernel its own grid avoids refitting the same linear model
# once per gamma value (20 candidates instead of 32).
param_grid = [
    {
        'svc__kernel': ['rbf'],
        'svc__C': [0.1, 1, 10, 100],
        'svc__gamma': [0.1, 1, 10, 100],
    },
    {
        'svc__kernel': ['linear'],
        'svc__C': [0.1, 1, 10, 100],
    },
]

grid_search = GridSearchCV(make_pipeline(preprocessor, SVC()), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the model: cross-validated score on the training split, then
# accuracy and per-class metrics on the held-out test split.
print("Best parameters found: ", grid_search.best_params_)
print("Best score found: ", grid_search.best_score_)
print("Test score: ", grid_search.score(X_test, y_test))
print(classification_report(y_test, grid_search.predict(X_test)))

# Save the model (only the refitted best pipeline, not the whole search)
joblib.dump(grid_search.best_estimator_, 'best_model_svm.pkl')

# Load the model back and confirm it reproduces the report printed above
model = joblib.load('best_model_svm.pkl')
print(classification_report(y_test, model.predict(X_test)))

Best parameters found:  {'svc__C': 1, 'svc__gamma': 10, 'svc__kernel': 'rbf'}
Best score found:  0.6071294626758024
Test score:  0.5992063492063492
              precision    recall  f1-score   support

           0       0.61      0.61      0.61       257
           1       0.59      0.58      0.59       247

    accuracy                           0.60       504
   macro avg       0.60      0.60      0.60       504
weighted avg       0.60      0.60      0.60       504

              precision    recall  f1-score   support

           0       0.61      0.61      0.61       257
           1       0.59      0.58      0.59       247

    accuracy                           0.60       504
   macro avg       0.60      0.60      0.60       504
weighted avg       0.60      0.60      0.60       504



In [25]:
# Create a NN that predicts has_delay
#
# Relies on `preprocessor`, GridSearchCV, make_pipeline and
# classification_report from the SVM cell.

from sklearn.neural_network import MLPClassifier

# Separate features and target once instead of re-dropping the column for
# every fit / score / predict call.
X_train, y_train = train.drop('has_delay', axis=1), train['has_delay']
X_test, y_test = test.drop('has_delay', axis=1), test['has_delay']

# Grid search for the best hyperparameters
param_grid = {
    'mlpclassifier__hidden_layer_sizes': [(4,), (8,), (2,2), (4,4), (5,5), (10,), (10,10), (20,), (20,20)],
    'mlpclassifier__activation': ['logistic', 'tanh', 'relu'],
    'mlpclassifier__alpha': [0.0001, 0.001, 0.01]
}

# random_state pins the MLP's weight initialisation and batch shuffling so
# the selected hyperparameters and scores are the same on every run; 42
# matches the seed used for the train/test split.
grid_search = GridSearchCV(
    make_pipeline(preprocessor, MLPClassifier(max_iter=1000, random_state=42)),
    param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Evaluate the model: cross-validated score on the training split, then
# accuracy and per-class metrics on the held-out test split.
print("Best parameters found: ", grid_search.best_params_)
print("Best score found: ", grid_search.best_score_)
print("Test score: ", grid_search.score(X_test, y_test))
print(classification_report(y_test, grid_search.predict(X_test)))

# Save the best model (only the refitted best pipeline, not the whole search)
import joblib
joblib.dump(grid_search.best_estimator_, 'best_model_mlp.pkl')

# Load the best model back and print its test-set predictions
model = joblib.load('best_model_mlp.pkl')
print(model.predict(X_test))

Best parameters found:  {'mlpclassifier__activation': 'relu', 'mlpclassifier__alpha': 0.0001, 'mlpclassifier__hidden_layer_sizes': (20, 20)}
Best score found:  0.5875694194013704
Test score:  0.5734126984126984
              precision    recall  f1-score   support

           0       0.59      0.54      0.56       257
           1       0.56      0.61      0.58       247

    accuracy                           0.57       504
   macro avg       0.57      0.57      0.57       504
weighted avg       0.57      0.57      0.57       504

[0 1 1 1 1 1 0 1 0 1 1 0 0 1 1 1 1 0 0 0 0 1 1 0 1 0 0 1 1 0 1 0 1 1 0 1 1
 1 1 1 0 1 1 0 1 1 1 0 1 0 1 1 1 0 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 1 1 1 0 0
 0 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1
 0 0 0 1 0 1 1 1 0 1 0 0 1 0 0 1 0 0 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 1 1 1 1
 1 0 0 1 0 1 0 0 0 1 1 0 0 1 1 0 0 1 0 1 1 1 0 1 0 1 1 0 0 0 0 1 1 0 0 0 0
 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0
 1 0 0 1 0 1

In [24]:
# Create a decision tree that predicts has_delay
#
# Relies on `preprocessor`, GridSearchCV, make_pipeline and
# classification_report from the SVM cell.
from sklearn.tree import DecisionTreeClassifier

# Separate features and target once instead of re-dropping the column for
# every fit / score / predict call.
X_train, y_train = train.drop('has_delay', axis=1), train['has_delay']
X_test, y_test = test.drop('has_delay', axis=1), test['has_delay']

# Grid search for the best hyperparameters
param_grid = {
    'decisiontreeclassifier__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'decisiontreeclassifier__criterion': ['gini', 'entropy']
}
# random_state makes tie-breaking between equally good splits deterministic,
# so the fitted tree and its scores are the same on every run; 42 matches the
# seed used for the train/test split.
grid_search = GridSearchCV(
    make_pipeline(preprocessor, DecisionTreeClassifier(random_state=42)),
    param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Evaluate the model: cross-validated score on the training split, then
# accuracy and per-class metrics on the held-out test split.
print("Best parameters found: ", grid_search.best_params_)
print("Best score found: ", grid_search.best_score_)
print("Test score: ", grid_search.score(X_test, y_test))
print(classification_report(y_test, grid_search.predict(X_test)))

# Save the best model: persist only the refitted best pipeline, as the SVM
# and NN cells do, rather than the whole GridSearchCV object.
import joblib
joblib.dump(grid_search.best_estimator_, 'best_model_dt.pkl')

# Load the best model back and confirm it reproduces the test accuracy
best_model = joblib.load('best_model_dt.pkl')
print(best_model.score(X_test, y_test))

Best parameters found:  {'decisiontreeclassifier__criterion': 'entropy', 'decisiontreeclassifier__max_depth': 9}
Best score found:  0.5994590695997115
Test score:  0.5972222222222222
              precision    recall  f1-score   support

           0       0.63      0.52      0.57       257
           1       0.58      0.68      0.62       247

    accuracy                           0.60       504
   macro avg       0.60      0.60      0.60       504
weighted avg       0.60      0.60      0.59       504

0.5972222222222222
