# Import, Cleaning, and Split

In [13]:
from pathlib import Path

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor

# Read the training data and the Kaggle test data from ./data (train first, then test).
df, kaggle_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train.csv", "./data/test.csv")
)

In [14]:
def cleanup(df):
    """Add calendar features to ``df`` in place and (re)write its ``season`` column.

    The ``datetime`` column is parsed with ``pd.to_datetime`` and expanded into
    ``year``, ``month``, ``day``, ``hour`` and ``dayofweek`` (Monday = 0).

    ``season`` is then assigned (replacing any existing ``season`` column) as a
    string label derived from the month:

        "1" -> Dec, Jan, Feb
        "2" -> Mar, Apr, May
        "3" -> Jun, Jul, Aug
        "4" -> Sep, Oct, Nov

    Rows whose month matches none of the above (e.g. a missing timestamp)
    keep the empty-string label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``datetime`` column. Modified in place.

    Returns
    -------
    None
    """
    df["datetime"] = pd.to_datetime(df["datetime"])

    stamps = df["datetime"].dt
    df["year"] = stamps.year
    df["month"] = stamps.month
    df["day"] = stamps.day
    df["hour"] = stamps.hour
    df["dayofweek"] = stamps.dayofweek  # Monday = 0

    # Season label -> the calendar months it covers.
    season_months = {
        "1": (12, 1, 2),
        "2": (3, 4, 5),
        "3": (6, 7, 8),
        "4": (9, 10, 11),
    }

    # Start from the empty label so unmatched rows stay "".
    df["season"] = ""
    for label, months in season_months.items():
        df.loc[df["month"].isin(months), "season"] = label

# Run the in-place cleanup on both frames so each carries the derived date columns.
for frame in (df, kaggle_test):
    cleanup(frame)

In [15]:
# Target: the "count" column.
y = df["count"]

# Feature frame: drop the target itself plus the columns not used for modelling.
# "count" is removed explicitly so the label can never reach a model through X,
# regardless of how the downstream column selection is configured.
X = df.drop(
    columns = [
        "count",
        "casual", "registered", "atemp", "datetime", "holiday",
        "workingday", "day", "windspeed", "weather",
    ]
)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 314)

# Feature Engineering

In [16]:
# One-hot encode the categorical columns; handle_unknown="ignore" keeps transform()
# from failing on categories that were not present during fit.
# The keyword that requests dense output is `sparse_output` on newer scikit-learn
# releases and `sparse` on older ones, so use whichever this installation exposes.
_ohe_dense_flag = "sparse_output" if "sparse_output" in OneHotEncoder().get_params() else "sparse"
cat_pipe = make_pipeline(OneHotEncoder(handle_unknown = "ignore", **{_ohe_dense_flag: False}))

# Polynomial terms up to degree 4 (powers and interactions, no constant column).
pol_pipe = make_pipeline(PolynomialFeatures(degree = 4, interaction_only = False, include_bias = False))

# Discretize into 4 bins whose edges come from k-means clustering of each column.
bin_pipe2 = make_pipeline(KBinsDiscretizer(n_bins = 4, strategy = "kmeans"))

# Columns not listed in any transformer are dropped (ColumnTransformer's default remainder).
feature_transform = ColumnTransformer(transformers = [
    ("bin2", bin_pipe2, ["humidity", "temp"]),
    ("poly", pol_pipe, ["temp", "hour", "month", "humidity", "year"]),
    ("cat", cat_pipe, ["year", "month", "hour", "dayofweek", "season"])
])

# Fit on the training split only, then reuse the fitted transformer everywhere else.
X_train_transform = feature_transform.fit_transform(X_train)
X_test_transform = feature_transform.transform(X_test)
kaggle_transform = feature_transform.transform(kaggle_test)



# Fit and Test

In [17]:
# Seed the forest so the scores below are reproducible after a kernel restart
# (same seed value as the train/test split).
m = RandomForestRegressor(random_state = 314)
m = m.fit(X_train_transform, y_train)
ypred = m.predict(X_test_transform)

# R^2 on the training split, then on the held-out split.
print(m.score(X_train_transform, y_train))
print(m.score(X_test_transform, y_test))

0.9874533226917952
0.9161929358846879


In [19]:
# Cross-validation: 5-fold R^2 of the forest on the transformed training split.
# cross_val_score fits clones of `m`, so the model fitted earlier is left untouched.
cross_r2_lin = cross_val_score(m, X_train_transform, y_train, cv = 5, scoring = "r2", verbose = 3)

# Per-fold scores, then their mean.
print(cross_r2_lin)
print(cross_r2_lin.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.900) total time=  32.6s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.6s remaining:    0.0s


[CV] END ................................ score: (test=0.899) total time=  32.1s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.1min remaining:    0.0s


[CV] END ................................ score: (test=0.913) total time=  32.2s
[CV] END ................................ score: (test=0.912) total time=  32.6s
[CV] END ................................ score: (test=0.915) total time=  32.3s
[0.90001727 0.8989614  0.91332778 0.91166962 0.9147386 ]
0.9077429359465784


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.7min finished


In [20]:
# RMSLE on the held-out split.
# Floor the predictions at zero (in place) before scoring, since the
# log-based metric is not defined for negative values.
np.maximum(ypred, 0.0, out = ypred)

rmsle = np.sqrt(mean_squared_log_error(y_test, ypred))
print(rmsle)

0.4135611633047051


# Export for Upload

In [21]:
# Predict on the transformed Kaggle test features and floor the predictions at zero.
kaggle_rf = m.predict(kaggle_transform)
kaggle_rf[kaggle_rf < 0] = 0.0

# Submission frame: one row per test timestamp with its predicted count.
kaggle_pred_rf = pd.DataFrame({"datetime": kaggle_test["datetime"], "count": kaggle_rf})

# Create the output directory if needed so the write does not fail on a fresh checkout.
submission_path = Path("./output/kaggle_random_forest.csv")
submission_path.parent.mkdir(parents = True, exist_ok = True)
kaggle_pred_rf.to_csv(submission_path, index = False)