In [5]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR

In [6]:
def replace_nan(df):
    """Fill NaNs in the feature columns of ``df`` with each column's mean, in place.

    The ``bikes`` column is excluded from the imputation, so any NaN it holds
    is still NaN afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.

    Returns
    -------
    None
    """
    # columns holding at least one NaN, minus 'bikes'; errors='ignore' avoids
    # a KeyError when 'bikes' itself has no NaN (or is not a column at all)
    nan_cols = df.columns[df.isnull().any()].drop('bikes', errors='ignore')
    if nan_cols.empty:
        # nothing to impute
        return
    # numeric_only skips non-numeric columns, for which no mean exists
    df[nan_cols] = df[nan_cols].fillna(value=df[nan_cols].mean(numeric_only=True))

    
def show_nans(df):
    """Print the unique ``station`` values of ``df`` and, per column, whether it holds a NaN.

    Output goes to stdout and ends with an empty line; nothing is returned.
    """
    stations = np.unique(df['station'])
    # boolean Series indexed by column name: True where the column has a NaN
    # (to list only the affected column names, index df.columns with it)
    has_nan = df.isnull().any()

    print(stations)
    print(has_nan)
    print()
    

# converting weekdays into integers [1-7]
def convert_weekdays(df):
    """Replace weekday names with the integers 1 (Monday) to 7 (Sunday), in place.

    The replacement applies to every cell of ``df`` equal to one of the day
    names. The frame is modified directly and nothing is returned.
    """
    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    day_numbers = list(range(1, len(day_names) + 1))
    df.replace(day_names, day_numbers, inplace=True)
    
def score_abs_error(model, data, round_ = False):
    """Mean absolute error of ``model`` on ``data``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict``; it is called with a NumPy array.
    data : pandas.DataFrame
        Every column except the last is passed to the model as features;
        the target is read from the ``bikes`` column.
    round_ : bool, default False
        When truthy, predictions are rounded with ``np.around`` before scoring.

    Returns
    -------
    The value returned by ``mean_absolute_error(y_gold, y_pred)``.
    """
    y_pred = model.predict(data.iloc[:, :-1].to_numpy())
    if round_:
        y_pred = np.around(y_pred)
    y_gold = data["bikes"].to_numpy()

    return mean_absolute_error(y_gold, y_pred)

def reasonable_predictions(model, data):
    """Predict with ``model`` on ``data`` and round the result with ``np.around``.

    ``data`` is converted with ``to_numpy()`` before being handed to
    ``model.predict``; the rounded predictions are returned.
    """
    raw_predictions = model.predict(data.to_numpy())
    return np.around(raw_predictions)


## This code is added

In [9]:
# Read every training CSV as-is (no NaN imputation, no row dropping) and
# stack them into one DataFrame
frames = [pd.read_csv(csv_path) for csv_path in Path('./Train/Train').rglob('*.csv')]
df = pd.concat(frames, ignore_index=True)

# number of rows that contain at least one NaN
len(df) - len(df.dropna())

13275

In [None]:
# Build the training frame: for each CSV, report its NaNs, impute them,
# drop the rows that still contain a NaN, then stack all files
frames = []
for csv_path in Path('./Train/Train').rglob('*.csv'):
    station_df = pd.read_csv(csv_path)
    show_nans(station_df)
    replace_nan(station_df)
    frames.append(station_df.dropna(axis='rows'))

df = pd.concat(frames, ignore_index=True)

# weekday names -> integers (modifies df in place)
convert_weekdays(df)

# deleting unneeded columns
df = df.drop(columns=["month", "year"])

# divide the columns at positions -6..-2 (the ones just before the last
# column) by the numDocks column, one column at a time
columns = df.columns[-6:-1].tolist()
print(columns)
for name in columns:
    df[name] = df[name].to_numpy() / df["numDocks"].to_numpy()

print(df.head())

# See all Rows/Cols
# pd.set_option('display.max_columns', 23)
pd.set_option('display.max_rows', 23)


# scaler = StandardScaler()
# df[df.columns[:-1]] = scaler.fit_transform(df[df.columns[:-1]])

print(df.columns)

In [None]:
import matplotlib.pyplot as plt

# histogram of the target column
plt.hist(x=df["bikes"])

In [None]:
# Fit a random forest on all rows to rank the features by importance.
# random_state is fixed so the ranking is the same on every run; without it
# the set of lowest-ranked features can change between executions.
forest = RandomForestRegressor(n_estimators=500, n_jobs=6, random_state=0)
print("initialised")
forest.fit(df.iloc[:,:-1].to_numpy(), df["bikes"].to_numpy())

# order the features from most to least important
importances = forest.feature_importances_
imp_indixes = np.argsort(importances)[::-1]
feature_order = df.columns[:-1][imp_indixes]
importances = importances[imp_indixes]

imp_df = pd.DataFrame(data = importances, index = feature_order, columns=["relative_importance"])

# NOTE: this is the error on the same rows the forest was fitted on,
# not a held-out validation score
print(score_abs_error(forest, df))

In [None]:
# print(imp_df)
# X = df.iloc[:,:-1].to_numpy()
# Y = df["bikes"].to_numpy()

# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt

# pca = PCA(n_components = 2, whiten = True)
# X_pca = pca.fit_transform(X)

# print(X_pca.shape)

# fig, ax = plt.subplots(figsize=(12,12))

# ax.scatter(X_pca[:,0], X_pca[:,1], c = Y, edgecolor = '0', alpha=0.5)

# plt.show()

# print(np.corrcoef(X_pca.transpose(), Y))

In [None]:
# import seaborn as sns

# bins = np.linspace(0, np.max(Y), 8)
# digitized = np.digitize(Y, bins)
# bin_means = [Y[digitized == i].mean() for i in range(1, len(bins))]

# fig, ax = plt.subplots(figsize=(12,12))
# sns.kdeplot(
#     x=X_pca[:,0], y=X_pca[:,1], ax = ax, warn_singular=False, fill = True, hue=digitized
# )#
# # plt.show()

In [None]:
# plot_features = feature_order[:5].to_list()
# plot_features.append("bikes")
# print(plot_features)

# g = sns.PairGrid(df[plot_features], diag_sharey=False, corner=True)
# g.map_upper(sns.scatterplot)
# g.map_lower(sns.kdeplot)
# g.map_diag(sns.kdeplot)


In [None]:


# Remove the lowest-ranked features from df. Despite the variable's name,
# the slice keeps the last 5 entries of the importance ranking.
lowest_ranked_10 = feature_order[-5:]
df = df.drop(columns=lowest_ranked_10)
print(imp_df)





In [None]:
# 80/20 train/validation split; random_state is fixed so the split (and every
# score computed from it) is reproducible across runs
train, val = train_test_split(df, test_size=0.2, random_state=0)

In [None]:
# display the current state of df
df

# Random elimination parameter tuning
## Gradient boosting regressor

This cell uses `HalvingRandomSearchCV` to find near-optimal hyperparameters for a `GradientBoostingRegressor`, which is the estimator the code below passes to the search. It takes a while to run with these parameters.

In [None]:
from time import time
# enable_halving_search_cv has to be imported before the Halving* classes
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV
from scipy.stats import randint


# Estimator whose hyperparameters are searched. Only the gradient boosting
# regressor is created: the grid below contains `learning_rate`, and a
# RandomForestRegressor built here before was overwritten without being used.
searched_boost = GradientBoostingRegressor()

# candidate values the search samples from
param_distributions = {"max_depth":  [2,3,4],   #, 5, 6, None],
                       "min_samples_split": np.around(np.linspace(2,20,10)).astype(np.int32),
                       "learning_rate": np.linspace(0.0001,1,10),
                       "n_estimators": np.linspace(5, 5000, 50).astype(np.int32)
                      }

# features are every column but the last; the target is the 'bikes' column
X_train = train.iloc[:,:-1].to_numpy()
y_train = train["bikes"].to_numpy()

# successive halving over the number of training samples
search = HalvingRandomSearchCV(searched_boost, param_distributions,
                               resource='n_samples', aggressive_elimination=True, min_resources = 1000,
                               factor = 2 ,cv = 2,    # n_candidates =  25
                               random_state=0, verbose=1, n_jobs=6)
search.fit(X_train, y_train)

print(search.best_params_)

In [None]:
from time import time

start = time()
print("initialised")

# Hyperparameters are written out by hand; the alternative is to unpack the
# search result with GradientBoostingRegressor(**search.best_params_)
boost_params = {
    "n_estimators": 5000,
    "min_samples_split": 8,
    "max_depth": 3,
    "learning_rate": 0.11120000000000001,
    "verbose": 1,
}
forest_boost = GradientBoostingRegressor(**boost_params)

# features are every column but the last; the target is the 'bikes' column
forest_boost.fit(train.iloc[:,:-1].to_numpy(), train["bikes"].to_numpy())
print(f'fitted in {time() - start}s')

In [None]:
# mean absolute error of the boosted model on the validation split
val_mae = score_abs_error(forest_boost, val)
print(val_mae)

In [None]:


# cv_results_ as a table, plus a string form of each candidate's parameters
# so candidates can serve as pivot columns
results = pd.DataFrame(search.cv_results_)
results["params_str"] = results["params"].apply(str)
params = search.param_distributions
learning_rates = params["learning_rate"]

# rows: halving iteration, columns: candidate, cells: mean test score
mean_scores = results.pivot(index="iter", columns="params_str", values="mean_test_score")

fig, ax = plt.subplots(figsize=(16,12))
mean_scores.plot(legend=False, alpha=0.6, ax = ax, linewidth=8)

# one tick label per iteration: its sample budget and number of candidates
n_iter = search.n_iterations_
labels = []
for i in range(n_iter):
    labels.append(
        f"iter={i}\nn_samples={search.n_resources_[i]} \nn_candidates={search.n_candidates_[i]}"
    )

ax.set_xticks(range(n_iter))
ax.set_xticklabels(labels, rotation=45, multialignment="left")
ax.set_title("Scores of candidates over iterations")
ax.set_ylabel("mean test score", fontsize=15)
ax.set_xlabel("iterations", fontsize=15)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Number of boosting stages actually fitted. Read from the model instead of
# being hard-coded so the x-axis always matches train_score_ and the number
# of stages staged_predict yields.
n_est = forest_boost.n_estimators_

test_score = np.zeros((n_est,), dtype=np.float64)

y_test = val["bikes"]
y_true = y_test.to_numpy()
# the model was fitted on a NumPy array, so predict on one as well
X_val = val.iloc[:,:-1].to_numpy()

# Validation mean squared error of the rounded predictions after each stage.
# Computed directly with NumPy: the estimator's `loss_` attribute was
# deprecated in scikit-learn 1.1 and removed in 1.3.
for i, y_pred in enumerate(forest_boost.staged_predict(X_val)):
    test_score[i] = np.mean((y_true - np.around(y_pred)) ** 2)


stages = np.arange(n_est) + 1

fig = plt.figure(figsize=(12, 12))
plt.subplot(1, 1, 1)
plt.title("Training and validation error")
plt.plot(
    stages,
    forest_boost.train_score_,
    "b-",
    label="Training Set",
)
plt.plot(
    stages, test_score, "r-", label="Validation Set"
)
plt.legend(loc="upper right")
plt.xlabel("Boosting Iterations")
plt.ylabel("Squared error")
plt.yscale('log')
fig.tight_layout()
plt.show()

In [None]:
# histogram of the validation residuals (actual minus predicted)
residuals = y_test - forest_boost.predict(val.iloc[:,:-1])

plt.figure(figsize=(8,8))
plt.hist(residuals, bins = 30)
plt.show()

In [None]:
test = pd.read_csv('test.csv')

# keep the ids for the submission index; they are not a model feature
ids = test["Id"]

del test["Id"]
del test["month"]
del test["year"]

convert_weekdays(test)
# test[test.columns] = scaler.fit_transform(test[test.columns])

# Apply the same numDocks normalisation the training frame received
# (`columns` holds the names normalised there). Without it the model, fitted
# on values divided by numDocks, would be fed raw values here. This runs
# before the feature removal below, as it did for the training frame.
for c in columns:
    test[c] = test[c].to_numpy() / test["numDocks"].to_numpy()

for feature in lowest_ranked_10:
    del test[feature]

# Put the test features in exactly the training feature order (every train
# column but the last); a missing column raises a KeyError instead of
# silently shifting the inputs.
test = test[list(train.columns[:-1])]

print(test.columns)
print(train.columns)

#y_pred = forest_boost.predict(test)
y_pred = reasonable_predictions(forest_boost, test)

sub_df = pd.DataFrame(data=y_pred, index = ids, columns = ["bikes"])

sub_df.index.name = 'Id'

print(sub_df.head())

sub_df.to_csv("trial.csv")