In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as r2
from sklearn.linear_model import LinearRegression

In [2]:
dades = pd.read_csv("../Sprint 11/NouFitxer.csv")

In [3]:
scaler = MinMaxScaler()

scaled = scaler.fit_transform(dades[["ArrDelay", "DepDelay", "AvSpeed", "DayOfWeek"]])

scaled_df = pd.DataFrame(scaled, columns = ["ArrDelay", "DepDelay", "AvSpeed", "DayOfWeek"])

In [4]:
y = scaled_df["ArrDelay"]
X = scaled_df[["DepDelay", "AvSpeed", "DayOfWeek"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 50)

_________________________________________________________________________________________________________

Entrena’ls utilitzant els diferents paràmetres que admeten.

In [5]:
# First model: Polynomial regression instead of OLS (since not much change in hyperparameters is allowed)
# No hyperparameter tuning baseline: 
    # R^2 score: 0,907
    # RMSE score: 2,48%
    # Execution time: 1s

In [6]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.fit_transform(X_test)


In [7]:
regression = LinearRegression()

model = regression.fit(X_poly_train, y_train)

y_pred = regression.predict(X_poly_test)

# Execute time: under 1 second

In [8]:
rmse = np.sqrt(mse(y_test, y_pred))
r2 = r2(y_test, y_pred)

print(rmse)
print(r2)

0.007495683537139274
0.9087676883046029


In [9]:
# Using polynomial regression we get slightly better results:
    # R^2 score: 0,908 (vs 0,907)
    # RMSE score: 0,75% (vs 2,48%)
    # Execution time: under a second (vs 1 second)

_________________________________________________________________________________________________________

In [10]:
# Second model: Neural Networks; changes in number of neurons and maximum iterations required
# No hyperparameter tuning baseline: 
    # R^2 score: 0,913
    # RMSE score: 0,73%
    # Execution time: 2m 30s

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 4)

neural = MLPRegressor(hidden_layer_sizes = 75, max_iter = 100, random_state=0)

In [12]:
neural.fit(X_train, y_train)

# Execution time: 1min 20s

MLPRegressor(hidden_layer_sizes=75, max_iter=100, random_state=0)

In [13]:
Y_preds = neural.predict(X_test)

In [14]:
# R^2 score:

print(neural.score(X_train, y_train))
print(neural.score(X_test, y_test))

0.8906037656924561
0.8922146478018207


In [15]:
# RSME score:

print(np.sqrt(mse(y_test, Y_preds)))


0.00817139484898527


In [16]:
# This hyperparameter tuning has reduced hidden layers and maximum iterations, which has made the model more 
# efficient yet slightly less effective

    # R^2 score: 0,892 (vs 0,913)
    # RMSE score: 0,82% (vs 0,73%)
    # Execution time: 1m 20s (vs 2m 30s)

________________________________________________________________________________________________________

In [17]:
# Third model: Random Forest; changes in number of estimators and bootstrap settings
# No hyperparameter tuning baseline: 
    # R^2 score: 0,898
    # RMSE score: 3,44%
    # Execution time: 8m

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 58)

rf = RandomForestRegressor(n_estimators = 75, bootstrap = False, random_state = 42)

rf.fit(X_train, y_train)

# Execution time: 8m 15s

RandomForestRegressor(bootstrap=False, n_estimators=75, random_state=42)

In [21]:
Y_pred = rf.predict(X_test)

# Execution time: 1m

In [22]:
rf.feature_importances_

# Once again, Departure Delay is the most important variable to predict Arrival Delay

array([0.92381012, 0.0650429 , 0.01114698])

In [23]:
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

# Execution time: 3 min

0.9946491647313597
0.8353662126794538


In [24]:
print(np.sqrt(mse(y_test, Y_preds)))

0.0342027985022044


In [27]:
# After tuning, random forest keeps being the worst performing model, and has seen no substantial improvements
    # R^2 score: 0,835 (vs 0,898)
    # RMSE score: 3,42 (vs 3,44%)
    # Execution time: 8m 15s (vs 8m)

________________________________________________________________________________________________________

Compara el seu rendiment utilitzant l’aproximació traint/test o utilitzant totes les dades (validació interna)

In [25]:
# L'aproximació train/test ha estat utilitzada en tots els models, així com totes les dades del dataset

________________________________________________________________________________________________________

In [26]:
# This exercise continues, due to size concerns, at S12 T01 P3