<a href="https://colab.research.google.com/github/SpoilStick/ML-Projects/blob/main/ML%20Assignment%204.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

from tabulate import tabulate

# Part 1: Data Exploration

In [None]:
# Load bike_share_hour.csv (expected in the current working directory) into a DataFrame.
data = pd.read_csv("bike_share_hour.csv")

In [None]:
# Preview the first rows to check the column layout.
data.head()

In [None]:
# List the distinct season codes present in the data.
print(data["season"].unique())

In [None]:
# Categorical?
# It seems like the categorical variables are already converted
# Remove the "dteday" column, then drop every row with a missing value.
# dropna() returns a new DataFrame rather than modifying in place, so the
# result has to be assigned back; a bare call leaves `data` unchanged.
data = data.drop(columns=["dteday"]).dropna()
# Bare expression so the notebook cell still displays the cleaned frame.
data

In [None]:
def bar_chart(df, x_axis, y_axis):
  """Draw a bar chart of the summed y_axis column per unique x_axis value.

  Args:
    df: DataFrame holding both columns.
    x_axis: Name of the column whose unique values become the bars.
    y_axis: Name of the column summed to give each bar's height.

  Draws onto the current pyplot figure and returns nothing.
  """
  # One plt.bar call per category, so each bar gets its own colour.
  for value in df[x_axis].unique():
    total = df.loc[df[x_axis] == value, y_axis].sum()
    plt.bar(value, total)

  # xlabel/ylabel are functions: they must be called. Assigning to them
  # (plt.xlabel = ...) replaces the function with a string, so no label is
  # drawn and later calls fail until the kernel is restarted.
  plt.xlabel(x_axis)
  plt.ylabel(y_axis)

In [None]:
 # CNT vs season total bar graph
bar_chart(df=data, x_axis="season", y_axis="cnt")

# Season 3 (Fall?) has the highest total ride count
# Season 1 (Spring?) has the lowest total ride count

In [None]:
 # Working day vs count bar graph
 # 1 is a work day, 0 isn't
bar_chart(df=data, x_axis="workingday", y_axis="cnt")

# Working days account for the majority of bike rides

In [None]:
 # Month vs count total bar graph
bar_chart(df=data, x_axis="mnth", y_axis="cnt")

# Total rides are highest in the summer (the middle of the year),
# then drop steeply from October to November

In [None]:
def month_to_season(month):
  """Map a 1-based month number to a 1-based three-month block index.

  Divides the month by three and rounds up, so 1-3 -> 1.0, 4-6 -> 2.0,
  7-9 -> 3.0 and 10-12 -> 4.0. The result is a float because it comes
  from np.ceil.

  NOTE(review): this is a calendar-quarter mapping; whether it matches the
  dataset's own "season" coding has not been verified here.
  """
  months_per_block = 3
  block_position = month / months_per_block
  return np.ceil(block_position)

#for x in range(1, 13):
#  print(month_to_season(x))

In [None]:
 # Weather vs count total bar graph
bar_chart(df=data, x_axis="weathersit", y_axis="cnt")

# NOTE(review): the weathersit codes are not decoded anywhere in this
# notebook; their meaning should be looked up in the dataset's documentation.

In [None]:
# Point plot of cnt per weathersit value, split into one series per season via hue.
ax = sns.pointplot(x="weathersit", y="cnt", hue="season", data=data)

In [None]:
 # Hour vs count bar graph
bar_chart(df=data, x_axis="hr", y_axis="cnt")

# In the morning, rides peak at 8 A.M.; in the afternoon they peak at 5 and 6 P.M.

In [None]:
 # Hour vs count on weekends/holidays bar graph
# Keep only rows flagged as non-working days (workingday == 0).
is_day_off = data["workingday"] == 0
no_work_data = data[is_day_off]
bar_chart(df=no_work_data, x_axis="hr", y_axis="cnt")

# Without workdays, bike rides fluctuate smoothly, peaking in the afternoon

# Part 2: Data Preparation

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
# on a large square figure so the cell annotations stay readable.
plt.figure(figsize=(16, 16))
corr_matrix = data.corr()
ax = sns.heatmap(data=corr_matrix, annot=True)
plt.show()

In [None]:
# Data already scaled?
# Fit a StandardScaler on every column, then show the standardized values.
# NOTE(review): the transformed array is only displayed as cell output; it is
# never assigned, so `data` itself stays unscaled for the rest of the notebook.
scalar = StandardScaler().fit(data)
scalar.transform(data)

In [None]:
# Show the frame as it stands, then remove the columns that are not used
# as model inputs.
print(data)
unused_columns = ["instant", "casual", "registered"]
data = data.drop(unused_columns, axis=1)

In [None]:
# Histogram of the target column.
ax = data["cnt"].hist()
# This distribution is skewed right

In [None]:
# pop() removes "cnt" from `data` in place and returns it as the target.
# X is the same object as `data`, so `data` no longer contains the target
# after this cell (the later one-hot encoding of `data` depends on that).
y = data.pop("cnt")
X = data

# Hold out one third of the rows for testing; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=42
)

In [None]:
# Baseline: ordinary least squares on the un-encoded features.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
# Predictions are made on the training split itself, not on held-out data.
lin_predictions = lin_reg.predict(X_train)

In [None]:
# Training-split scores for the baseline linear model.
r2 = r2_score(y_train, lin_predictions)
MSE = mean_squared_error(y_train, lin_predictions)
RMSE = np.sqrt(MSE)  # root of the MSE

print("r2:", r2)
print("MSE:", MSE)
print("RMSE:", RMSE)

# Part 3: Model Training

In [None]:
# One-hot encode the integer-coded categorical columns. `data` no longer
# holds "cnt" at this point (it was popped into `y` earlier), so the target
# does not leak into the encoded features.
categorical_columns = ["season", "yr", "mnth", "hr", "weekday", "weathersit"]
encoded_data = pd.get_dummies(data, columns=categorical_columns)

# Same split parameters as before, so the same rows land in each split.
X_train, X_test, y_train, y_test = train_test_split(
    encoded_data, y, test_size=1/3, random_state=42
)

In [None]:
# Candidate regressors, fitted and scored in this order.
models = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=0),
    RandomForestRegressor(n_estimators=30, random_state=0),
    SGDRegressor(tol=1e-3, max_iter=1000),
    linear_model.Lasso(alpha=0.1),
    ElasticNet(random_state=0),
    Ridge(alpha=0.5),
    BaggingRegressor(),
]

# Empty results frame; one row per model is appended once it has been scored.
result_columns = ["Model", "r2 coefficient", "MSE", "RMSE"]
table = pd.DataFrame(columns=result_columns)

In [None]:
# Fit every candidate and record its scores on the training split.
for model in models:
  # fit() returns the estimator itself, so predict can be chained.
  predictions = model.fit(X_train, y_train).predict(X_train)

  MSE = mean_squared_error(y_train, predictions)
  RMSE = np.sqrt(MSE)
  r2 = r2_score(y_train, predictions)

  # Append as the next row; the fitted estimator object itself is stored.
  table.loc[len(table)] = [model, r2, MSE, RMSE]

In [None]:
# Render the results table as a psql-style text grid, using the column names as headers.
print(tabulate(table, headers='keys', tablefmt='psql'))

# Part 4: Model Tuning

In [None]:
# Keep the three models with the lowest MSE (ascending sort, first three rows).
table = table.sort_values(by="MSE").head(3)

In [None]:
# One cross-validation score per shortlisted model, in table row order.
cross_val_scores = []

for model in table["Model"]:
  # cross_val_score returns an array with one score per fold.
  cross_val = cross_val_score(model, X_train, y_train)
  # Average over all folds. Indexing a single fold (e.g. [4]) ranks models
  # on one arbitrary split and breaks if the number of folds changes.
  cross_val_scores.append(cross_val.mean())

In [None]:
# Attach the collected scores as a new column. The list was built by iterating
# table["Model"] in row order, so positions line up with the rows.
table["Cross Validation Score"] = cross_val_scores

In [None]:
# Keep only the row with the highest cross-validation score.
table = table.sort_values(by="Cross Validation Score", ascending=False).head(1)

In [None]:
# Show what is left of the results table after the selection steps.
print(table)

In [None]:
# Print the estimator object stored in the first row's "Model" column.
print(table["Model"].iloc[0])

In [None]:
# Hyper-parameter search space.
# NOTE(review): these names are random-forest parameters; if the selected
# model in table["Model"] is a different estimator type, the search is
# expected to reject the unknown parameters — confirm the selection first.
param_distribs = {
    'bootstrap': [True, False],
    'max_depth': list(range(10, 120, 10)),  # 10 - 110
    # 1.0 means "use all features". It replaces the string "auto", which
    # meant the same for regressors but was removed from scikit-learn and
    # now raises a parameter-validation error.
    'max_features': [1.0, "sqrt"],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'n_estimators': list(range(20, 220, 20)),  # 20 - 200
}

# 20 sampled combinations, 3-fold CV each, on 3 worker processes.
# random_state fixes which combinations are sampled so reruns are comparable.
rnd_search = RandomizedSearchCV(
    table["Model"].iloc[0],
    param_distribs,
    n_jobs=3,
    n_iter=20,
    cv=3,
    random_state=42,
)

In [None]:
# Run the randomized search on the training split.
rnd_search.fit(X_train, y_train)

In [None]:
# Display the parameter combination that scored best during the search.
rnd_search.best_params_

In [None]:
# Take the best estimator found by the search.
final_model = rnd_search.best_estimator_
# Predictions here are on the training split, not on held-out data.
final_predictions = final_model.predict(X_train)

In [None]:
# Training-split scores for the tuned model.
final_r2 = r2_score(y_train, final_predictions)
final_MSE = mean_squared_error(y_train, final_predictions)
final_RMSE = np.sqrt(final_MSE)

print("r2:", final_r2)
print("MSE:", final_MSE)
print("RMSE:", final_RMSE)

# cross_val_score returns one score per fold; the whole array is printed.
final_cv_scores = cross_val_score(final_model, X_train, y_train)
print("Cross Validation Score:", final_cv_scores)

In [None]:
# Score the tuned model on the held-out test split; predict once and reuse.
test_predictions = final_model.predict(X_test)
print("Test r2:", r2_score(y_test, test_predictions))
# RMSE is computed as sqrt(MSE), as elsewhere in this notebook. The
# `squared=False` argument of mean_squared_error was deprecated and later
# removed from scikit-learn, so it fails on current versions.
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_predictions)))