In [6]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [7]:
import pandas as pd
import numpy as np

In [8]:
#Generate the dataset
df = pd.read_csv('data/flights_train.csv.bz2')
df.head()

Unnamed: 0,flight_date,from,to,avg_weeks,target,std_weeks
0,2012-06-19,ORD,DFW,12.875,12.331296,9.812647
1,2012-09-10,LAS,DEN,14.285714,10.775182,9.466734
2,2012-10-05,DEN,LAX,10.863636,11.083177,9.035883
3,2011-10-09,ATL,ORD,11.48,11.169268,7.990202
4,2012-02-21,DEN,SFO,11.45,11.269364,9.517159


In [9]:
#Type of the features
df.dtypes

flight_date     object
from            object
to              object
avg_weeks      float64
target         float64
std_weeks      float64
dtype: object

In [10]:
#Encode categorical variables
df = pd.get_dummies(data=df, columns=['from', 'to'])
df.head()

Unnamed: 0,flight_date,avg_weeks,target,std_weeks,from_ATL,from_BOS,from_CLT,from_DEN,from_DFW,from_DTW,...,to_LAX,to_LGA,to_MCO,to_MIA,to_MSP,to_ORD,to_PHL,to_PHX,to_SEA,to_SFO
0,2012-06-19,12.875,12.331296,9.812647,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2012-09-10,14.285714,10.775182,9.466734,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2012-10-05,10.863636,11.083177,9.035883,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
3,2011-10-09,11.48,11.169268,7.990202,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2012-02-21,11.45,11.269364,9.517159,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [11]:
#Convert flight_date type to datetime
df['flight_date'] = df['flight_date'].apply(pd.to_datetime)

In [12]:
#Convert flight_date to new columns: year, month and day
df['year'] = pd.DatetimeIndex(df['flight_date']).year
df['month'] = pd.DatetimeIndex(df['flight_date']).month
df['day'] = pd.DatetimeIndex(df['flight_date']).day

In [13]:
#Encode categorical variables
df = pd.get_dummies(data=df, columns=['year', 'month', 'day'])
df.head()

Unnamed: 0,flight_date,avg_weeks,target,std_weeks,from_ATL,from_BOS,from_CLT,from_DEN,from_DFW,from_DTW,...,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_30,day_31
0,2012-06-19,12.875,12.331296,9.812647,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2012-09-10,14.285714,10.775182,9.466734,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2012-10-05,10.863636,11.083177,9.035883,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2011-10-09,11.48,11.169268,7.990202,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2012-02-21,11.45,11.269364,9.517159,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
#Divide the trainig set into training and test sets to validate our model before we fit it on the whole train
X = df.drop(['flight_date', 'target'], axis=1).values
y = df['target']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [15]:
# Build our baseline random forest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as MSE
rf = RandomForestRegressor(random_state=0)
# Fit the model
rf.fit(X_train, y_train)
# Measure model performance
y_pred = rf.predict(X_test)
#Compute rmse on train
mse = MSE(y_test, y_pred)
# Compute test-set RMSE
rmse = mse**(1/2)
print(rmse)

0.7314846321478717


In [16]:
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [17]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(random_state=0),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [18]:
#Best parameters
rf_random.best_params_

{'n_estimators': 1600,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': False}

In [19]:
best_random = rf_random.best_estimator_
# Fit the model
best_random.fit(X_train, y_train)
# Measure model performance
y_pred = best_random.predict(X_test)
#Compute rmse on train
mse = MSE(y_test, y_pred)
# Compute test-set RMSE
rmse = mse**(1/2)
print(rmse)

0.6974470809946727


In [20]:
rf_r = RandomForestRegressor(random_state=0)
# Fit the model
rf_r.fit(X_train, y_train)
# Measure model performance
y_pred = rf_r.predict(X_test)
#Compute rmse on train
mse = MSE(y_test, y_pred)
# Compute test-set RMSE
rmse = mse**(1/2)
print(rmse)

0.7314846321478717


In [21]:
#Convert flight date type to datetime type
#df['flight_date'] = df['flight_date'].apply(pd.to_datetime)

In [None]:
# """ the Metadata API"""

# from joblib import Memory
# import requests
# location = './cachedir'
# memory = Memory(location, verbose=0)

# headers = {'x-rapidapi-host': "airport-info.p.rapidapi.com",
#            'x-rapidapi-key': "670c14cef2mshc0f539c8f496dbcp19f4cejsn7e4b16db61a9"}

# def metadata_airport(code):
#     """
#     we get meta data for each airport
#     """
#     url = "https://airport-info.p.rapidapi.com/airport"
#     querystring = {"iata" : code, "icao" : code}
#     response = requests.request("GET", url, headers=headers, params=querystring)
#     # print(response.text)
#     response = dict(eval(response.text))
    
#     return response["city"]

# metadata_airport = memory.cache(metadata_airport)

# metadata_airport('CMN')

In [None]:
# #Add two new columns with cities
# df["destination"] = ""
# for i in df.index:
#   df["destination"][i] = metadata_airport(df["to"][i])
# df.head()

In [None]:
# #Replace DFW Airport by Texas
# df['destination'].replace('DFW Airport', 'Texas', inplace=True)

In [None]:
# start = df['flight_date'].min()

In [None]:
# end = df['flight_date'].max()

In [None]:
# unique_dest = df['destination'].unique().tolist()
# unique_dest = list(filter(None, unique_dest))
# unique_dest

In [None]:
#!pip install WorldWeatherPy

In [None]:
# from WorldWeatherPy import DetermineListOfAttributes
# from WorldWeatherPy import HistoricalLocationWeather
# from WorldWeatherPy import RetrieveByAttribute

In [None]:
# data = pd.DataFrame(columns = ['totalSnow_cm', 'precipMM', 'tempC', 'windspeedKmph'])
# for x in unique_dest:
#   dataset = HistoricalLocationWeather('dd141467b93a4455ab7100637220501', x, start, end, 12).retrieve_hist_data()
#   dataset = dataset[['totalSnow_cm', 'precipMM', 'tempC', 'windspeedKmph']]
#   dataset = pd.DataFrame(dataset.values.reshape(-1,2,df.shape[1]).mean(1))
#   data.append(dataset)
# data.head()

In [None]:
# dataset = HistoricalLocationWeather('dd141467b93a4455ab7100637220501', 'Texas', start, end, 12).retrieve_hist_data()
# weather = dataset[['totalSnow_cm', 'precipMM', 'tempC', 'windspeedKmph']]
# weather.head()
# #weather = weather.apply(pd.to_numeric)