In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR

In [2]:
def replace_nan(df):
    """Fill NaNs in ``df`` with the mean of the column they occur in, in place.

    Only numeric columns are imputed. A mean is undefined for text columns,
    and asking pandas for one raises ``TypeError`` on pandas >= 2.0, so the
    means are computed with ``numeric_only=True`` and non-numeric columns are
    left untouched. A column consisting entirely of NaN also stays as it is,
    because its mean is NaN too.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    None
    """
    # get the labels of all the columns containing NaN
    nan_cols = df.columns[df.isnull().any()]
    if nan_cols.empty:
        # nothing to impute
        return

    # one mean per numeric column; NaNs are skipped when averaging
    col_means = df[nan_cols].mean(numeric_only=True)
    # fillna with a Series fills column-by-column, matching on column label,
    # so columns without an entry in col_means pass through unchanged
    df[nan_cols] = df[nan_cols].fillna(value=col_means)

    
def show_nans(df):
    """Print a short NaN report for ``df``.

    Writes three lines to stdout: the distinct values of the ``station``
    column, the labels of the columns that contain at least one NaN, and an
    empty separator line. Returns None.
    """
    stations = np.unique(df['station'])
    print(stations)

    has_nan = df.isnull().any()
    columns_with_nan = df[df.columns[has_nan]].columns
    print(columns_with_nan)

    # blank line so consecutive reports stay visually separated
    print()
    

# converting weekdays into integers [1-7]
def convert_weekdays(df):
    """Replace English weekday names with integers 1 (Monday) to 7 (Sunday).

    The replacement is applied in place to every cell of ``df`` whose value is
    one of the seven day names, whichever column it sits in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    None
    """
    weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekday_numbers = list(range(1, len(weekday_names) + 1))
    # With inplace=True, replace() returns None, so its result must not be
    # assigned back to ``df``.
    df.replace(weekday_names, weekday_numbers, inplace=True)
    
def score_abs_error(model, data):
    """Return the mean absolute error of ``model`` on ``data``.

    Every column of ``data`` except the last is passed to ``model.predict``
    as the feature matrix; the ``bikes`` column supplies the true values the
    predictions are compared against.
    """
    features = data.iloc[:, :-1].to_numpy()
    predicted = model.predict(features)
    actual = data["bikes"].to_numpy()
    return mean_absolute_error(actual, predicted)

In [3]:
# Adding all files into one DataFrame.
# rglob yields paths in arbitrary (filesystem) order, so they are sorted to
# give the concatenated frame a stable row order; otherwise the seeded
# train/test split further down selects different rows on different machines.
file_frames = []
for path in sorted(Path('./Train/Train').rglob('*.csv')):
    file_df = pd.read_csv(path)
    # uncomment the next line to print which columns of this file hold NaNs
    # show_nans(file_df)
    # comment the next line out if not averaging NaNs
    replace_nan(file_df)
    file_frames.append(file_df)

if not file_frames:
    # fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError("no CSV files found under ./Train/Train")

df = pd.concat(file_frames, ignore_index=True)

convert_weekdays(df)

# deleting unneeded columns
df = df.drop(columns=["month", "year"])

# uncomment the next line to drop rows that still contain NaNs
# df = df.dropna(axis='rows')

# See all Rows/Cols
# pd.set_option('display.max_columns', 23)
pd.set_option('display.max_rows', 23)


# Standardise every column except the last one (the "bikes" target).
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split performed in a later cell, so held-out rows contribute to the scaling
# statistics - confirm this is acceptable for the reported scores.
scaler = StandardScaler()
feature_cols = df.columns[:-1]
df[feature_cols] = scaler.fit_transform(df[feature_cols])

In [6]:
# Rank the features with a random forest fitted on the whole frame, then drop
# the ten least important ones from `df`.
# random_state pins the forest so the same ten columns are removed on every run.
# NOTE(review): this cell rebinds `df`; re-running it without re-running the
# loading cell first removes ten more columns.
forest = RandomForestRegressor(n_estimators=500, n_jobs=6, random_state=42)
print("initialised")
forest.fit(df.iloc[:,:-1].to_numpy(), df["bikes"].to_numpy())

# order the features from most to least important
importances = forest.feature_importances_
imp_indixes = np.argsort(importances)[::-1]
feature_order = df.columns[:-1][imp_indixes]
importances = importances[imp_indixes]

imp_df = pd.DataFrame(data = importances, index = feature_order, columns=["relative_importance"])

# the tail of the descending ranking holds the ten weakest features
lowest_ranked_10 = feature_order[-10:]
df = df.drop(columns=lowest_ranked_10)





initialised


In [7]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split stable.
train, test = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
)

In [8]:
# Show the scaled frame after the ten lowest-ranked features were removed;
# the display is truncated by the display.max_rows option set earlier.
df

Unnamed: 0,station,longitude,numDocks,timestamp,hour,weekhour,airPressure.mb,bikes_3h_ago,full_profile_3h_diff_bikes,full_profile_bikes,short_profile_3h_diff_bikes,short_profile_bikes,bikes
0,-1.478148,0.085531,0.778671,-1.729727,-1.658502,-0.764152,-3.458244,-0.158975,-0.000507,-0.209238,-0.000507,-0.209238,0.0
1,-1.478148,0.085531,0.778671,-1.725078,-1.514125,-0.742678,-0.038999,-0.158975,-0.000507,-0.209238,-0.000507,-0.209238,0.0
2,-1.478148,0.085531,0.778671,-1.720428,-1.369747,-0.721203,-2.862877,-0.158975,-0.000507,-0.209238,-0.000507,-0.209238,0.0
3,-1.478148,0.085531,0.778671,-1.715778,-1.225369,-0.699728,-3.352349,-1.138946,-0.000507,-0.209238,-0.000507,-0.209238,0.0
4,-1.478148,0.085531,0.778671,-1.711128,-1.080992,-0.678254,-2.451061,-1.138946,-0.000507,-0.209238,-0.000507,-0.209238,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55870,-1.385764,-0.120714,2.532435,1.711128,1.084674,0.674650,0.499891,0.698655,0.060055,0.467717,0.060055,0.467717,6.0
55871,-1.385764,-0.120714,2.532435,1.715778,1.229051,0.696125,0.509304,0.239255,-0.547681,-0.164887,-0.547681,-0.164887,7.0
55872,-1.385764,-0.120714,2.532435,1.720428,1.373429,0.717600,0.511657,-0.220146,-0.426134,-0.255259,-0.426134,-0.255259,6.0
55873,-1.385764,-0.120714,2.532435,1.725078,1.517807,0.739074,0.521070,-0.220146,-1.641607,-0.752305,-1.641607,-0.752305,5.0


In [13]:
# Gradient-boosted trees fitted on the training split and scored by mean
# absolute error on the held-out split.
# subsample < 1 makes every boosting stage draw a random subset of rows, so
# random_state is set to make the fit and the reported score reproducible.
forest_boost = GradientBoostingRegressor(
    n_estimators=5000,
    subsample=0.95,
    loss='squared_error',
    learning_rate=0.1,
    max_depth=3,
    verbose=1,
    random_state=42,
)
print("initialised")
forest_boost.fit(train.iloc[:,:-1].to_numpy(), train["bikes"].to_numpy())

print(score_abs_error(forest_boost, test))

initialised
      Iter       Train Loss      OOB Improve   Remaining Time 
         1          38.1258           4.2877            6.79m
         2          34.4016           3.9681            5.60m
         3          31.5395           2.9750            5.18m
         4          28.8128           2.7753            5.02m
         5          26.8316           2.2094            4.90m
         6          25.1279           1.6014            4.82m
         7          23.5687           1.5719            4.77m
         8          22.2684           1.3384            4.73m
         9          21.2929           1.0294            4.69m
        10          20.3374           0.8797            4.66m
        20          16.0380           0.2227            4.47m
        30          14.8273           0.1169            4.41m
        40          14.2784           0.0452            4.42m
        50          14.0455           0.0334            4.41m
        60          13.7827          -0.0099            4

In [11]:
from time import time

# Fit AdaBoost on a quarter of the training rows and report the fit time and
# the mean absolute error on the held-out split.
# Both the row sample and the estimator are seeded so the score can be
# reproduced from a fresh kernel.
train_sample = train.sample(frac = 0.25, random_state=42)

ada = AdaBoostRegressor(n_estimators=50, random_state=42)
print("initialised")
start = time()
ada.fit(train_sample.iloc[:,:-1].to_numpy(), train_sample["bikes"].to_numpy())
print(f"ada boost fitted, it took {time() - start}s")
print(score_abs_error(ada, test))

initialised
ada boost fitted, it took 220.29566478729248s
4.503117809017855
