In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor

In [None]:
def replace_nan(df):
    """Fill NaNs in numeric columns with that column's mean, modifying ``df`` in place.

    Columns that contain NaN but are not numeric are left untouched: no mean
    can be computed for them, and recent pandas versions raise a TypeError
    when asked to average non-numeric data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; it is mutated.

    Returns
    -------
    None
    """
    # Columns that contain at least one NaN.
    nan_cols = df.columns[df.isnull().any()]
    # numeric_only=True restricts the means to columns that can be averaged.
    column_means = df[nan_cols].mean(numeric_only=True)
    if column_means.empty:
        # Nothing to impute (no NaNs at all, or only in non-numeric columns).
        return
    fill_cols = column_means.index
    df[fill_cols] = df[fill_cols].fillna(value=column_means)

    
def show_nans(df):
    """Print the distinct ``station`` values and the columns that contain NaN.

    Output is three lines: the unique station values, the Index of columns
    with at least one missing value, and an empty separator line.
    """
    stations = np.unique(df['station'])
    has_nan = df.isnull().any()
    nan_columns = df[df.columns[has_nan]].columns
    print(stations)
    print(nan_columns)
    print()
    

# converting weekdays into integers [1-7]
def convert_weekdays(df):
    """Replace weekday names with the integers 1 (Monday) to 7 (Sunday), in place.

    The replacement is applied to every cell of ``df`` that equals one of the
    day names, not only to a single column. ``df`` itself is mutated and
    nothing is returned.
    """
    weekday_names = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
    ]
    weekday_numbers = [1, 2, 3, 4, 5, 6, 7]
    # With inplace=True, replace() returns None, so its result is not assigned.
    df.replace(weekday_names, weekday_numbers, inplace=True)

In [None]:
# Load every training CSV, impute NaNs per file, and combine them into one DataFrame.
# Paths are sorted so the row order does not depend on filesystem enumeration order.
csv_paths = sorted(Path('./Train/Train').rglob('*.csv'))
if not csv_paths:
    raise FileNotFoundError("No CSV files found under ./Train/Train")

frames = []
for path in csv_paths:
    csv_frame = pd.read_csv(path)
    # Optional: uncomment to print the columns of this file that contain NaN.
    # show_nans(csv_frame)
    # Comment out the next line if NaNs should not be replaced by column means
    # (the means are computed separately for each file).
    replace_nan(csv_frame)
    frames.append(csv_frame)

df = pd.concat(frames, ignore_index=True)

# Mutates df in place: weekday names become integers 1-7.
convert_weekdays(df)

# Drop unneeded columns.
df = df.drop(columns=["month", "year"])

# Alternative to mean imputation: uncomment to drop rows that still contain NaN.
# df = df.dropna(axis='rows')

# Limit how many rows/columns are rendered when a DataFrame is displayed.
# pd.set_option('display.max_columns', 23)
pd.set_option('display.max_rows', 23)

In [None]:
# Display the combined DataFrame as this cell's output.
df

In [None]:
# Column-wise check: True for every column that still holds a missing value.
df.isna().any(axis=0)

In [None]:
# Fit a random forest on every column except the last, with "bikes" as the target.
# The positional slice only excludes the target if "bikes" is the final column,
# so make that assumption explicit instead of silently leaking the target.
assert df.columns[-1] == "bikes", "expected 'bikes' to be the last column of df"

# Fixed seed so the fitted forest (and its feature importances) are reproducible.
forest = RandomForestRegressor(n_jobs=6, random_state=42)

forest.fit(df.iloc[:, :-1], df["bikes"])



In [None]:
# Rank the forest's feature importances from most to least important.
importances = forest.feature_importances_

imp_indexes = np.argsort(importances)[::-1]

# One labelled Series keeps each column name next to its importance value.
# df.columns lines up with the features because the model is fit on df.iloc[:, :-1].
ranked_importances = pd.Series(
    importances[imp_indexes], index=df.columns[imp_indexes], name="importance"
)
# to_string() prints every row regardless of the display.max_rows option.
print(ranked_importances.to_string())

In [None]:

# Scatter of the "bikes" column against the "bikes_3h_ago" column.
# matplotlib.pyplot is imported as plt in the notebook's top import cell.
fig, ax = plt.subplots(figsize=(16, 8))
ax.scatter(df["bikes_3h_ago"], df["bikes"], alpha=0.5)
ax.set_xlabel("bikes_3h_ago")
ax.set_ylabel("bikes")
ax.set_title("bikes vs. bikes_3h_ago")
plt.show()

In [None]:
# Baseline idea: predict bikes = bikes_3h_ago; the joint KDE shows how closely
# the two columns track each other.
# seaborn is imported as sns in the notebook's top import cell.
# Assigning the result keeps the JointGrid repr out of the cell output.
joint_grid = sns.jointplot(
    data=df,
    x="bikes_3h_ago", y="bikes",  # hue="weekday",
    kind="kde"
)


In [None]:
# Distinct station values and, in parallel, the number of rows for each one.
station_ids, station_counts = np.unique(df["station"], return_counts=True)
uniques = (station_ids, station_counts)

There are very few data points for each station compared to the number of attributes.

This means that ensemble models would rapidly become over-parameterised.


In [None]:
# Plot one station's "bikes" series together with its "bikes_3h_ago" series.
STATION_ID = 201  # station whose rows are plotted in this cell
df_station = df[df["station"] == STATION_ID]

station_fig, station_ax = plt.subplots()
station_ax.plot(df_station["timestamp"], df_station["bikes"], label="bikes")
# The original line started with '%', which IPython parses as a line magic and
# rejects; plotting the second series was presumably the intent -- TODO confirm.
station_ax.plot(df_station["timestamp"], df_station["bikes_3h_ago"], label="bikes_3h_ago")
station_ax.set_xlabel("timestamp")
station_ax.set_ylabel("bikes")
station_ax.set_title(f"Station {STATION_ID}")
station_ax.legend()
plt.show()