In [None]:
import glob
import os
import pathlib
import re
import shutil

import ftfy
import numpy as np
import pandas as pd
import sklearn
import sklearn.linear_model
import sklearn.model_selection
import statsmodels.api as sm

import bike_share.eval
import bike_share.utils as ut

# Allow up to 1000 rows to be shown when a pandas object is displayed.
pd.options.display.max_rows = 1000
# Enable IPython's autoreload extension in mode 2 -- presumably so edits to the
# imported bike_share modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Directory holding the input data, resolved by the project helper
# (`ut` is the alias under which bike_share.utils is imported).
# A later cell joins a file name onto it with the `/` operator.
data_path = bike_share.utils.get_data_path()

In [None]:
# Load the consolidated 2019 trip table; both trip timestamps are parsed as datetimes.
df = pd.read_csv(
    data_path / "consolidated_DF_2019.csv",
    parse_dates=["trip_start_time", "trip_stop_time"],
)

# Continue with a seeded random subsample of the table.
SAMPLE_SIZE = 200_000
rng = np.random.default_rng(42)
# Cap the request at the table size: DataFrame.sample draws without replacement
# by default and raises ValueError when n exceeds the number of rows.
sampled_df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=rng)
# From here on `df` refers to the subsample, not the full table.
df = sampled_df

In [None]:
# Minutes elapsed since midnight at trip start, computed with the vectorised
# `.dt` accessor instead of calling a Python lambda once per row.
trip_start = df["trip_start_time"]
df["trip_start_daily_minutes"] = trip_start.dt.hour * 60 + trip_start.dt.minute
# Multiplying by 1 turns a boolean flag into 0/1 and leaves a numeric column
# unchanged -- presumably `is_casual` is loaded as booleans; verify against the CSV.
df["is_casual"] = df["is_casual"] * 1

In [None]:
# Bin the start time into whole hours: divide minutes-since-midnight by the
# bin width, round down, and store the result as integers.
minutes_per_bin = 1 * 60
df["trip_binned_daytime"] = np.floor(
    df["trip_start_daily_minutes"] / minutes_per_bin
).astype(int)

In [None]:
# Histogram of the hourly start-time bins.
binned_start_hours = df["trip_binned_daytime"]
binned_start_hours.hist(bins=100)

In [None]:
# One-hot encode the hourly bin and the origin station id, one indicator
# column per distinct value in each source column.
oh_hours_df, oh_station_id_df = (
    pd.get_dummies(df[source_column])
    for source_column in ("trip_binned_daytime", "from_station_id")
)

In [None]:
# Histogram of trip start times expressed as minutes since midnight.
start_minutes = df["trip_start_daily_minutes"]
start_minutes.hist(bins=100)

In [None]:
# Histogram of the natural log of the trip duration (in seconds).
log_duration = df["trip_duration_seconds"].apply(np.log)
log_duration.hist(bins=100)

In [None]:
# Unfitted linear-regression estimator with scikit-learn's default settings;
# it is fitted on the training split in a later cell.
linear_estimator = sklearn.linear_model.LinearRegression()
regression = linear_estimator

In [None]:
# Leave out the first hour indicator so that hour acts as the baseline level
# (presumably to avoid perfectly collinear dummy columns).
baseline_hour = oh_hours_df.columns[0]
hours_df = oh_hours_df.drop(columns=baseline_hour)

In [None]:
# Leave out the first station indicator so that station acts as the baseline
# level (presumably to avoid perfectly collinear dummy columns).
baseline_station = oh_station_id_df.columns[0]
station_df = oh_station_id_df.drop(columns=baseline_station)

In [None]:
# Design matrix: hour indicators, station indicators and the casual-rider flag,
# joined column-wise on the shared row index.
# The dummy columns are prefixed before joining because hour bins and station
# ids would otherwise both become bare digit strings: the coefficient listing
# could not tell "hour 7" from "station 7", and a station id equal to an hour
# bin would produce duplicate column labels.
x_df = pd.concat(
    [
        hours_df.add_prefix("hour_"),
        station_df.add_prefix("station_"),
        df["is_casual"],
    ],
    axis=1,
)
# Keep every label a plain string (scikit-learn rejects mixed-type column names
# when it records feature names).
x_df.columns = [str(label) for label in x_df.columns]
# Regression target: trip duration in seconds (untransformed).
y = df["trip_duration_seconds"]

Train/validation/test split: 60% / 20% / 20%

In [None]:
# Two-stage 60/20/20 split: hold out 40% of the rows, then halve the hold-out
# into validation and test sets. Fixed seeds keep the partition reproducible.
# Uses sklearn.model_selection, which must be imported explicitly in the
# notebook's import cell rather than relied on as a side effect of other imports.
X_train, X_testval, y_train, y_testval = sklearn.model_selection.train_test_split(
    x_df, y, test_size=0.4, random_state=1337
)
X_val, X_test, y_val, y_test = sklearn.model_selection.train_test_split(
    X_testval, y_testval, test_size=0.5, random_state=7331
)
# Sanity check: the three splits partition the design matrix.
assert len(X_train) + len(X_val) + len(X_test) == len(x_df)

In [None]:
# Fit the estimator on the training split only; validation and test rows stay unseen.
model = regression.fit(X=X_train, y=y_train)

In [None]:
# Per-feature summary of the fitted model from the project helper. A later cell
# reads each entry as entry[0] (compared against 0) and entry[1] (listed).
regression_info = bike_share.eval.get_regression_info(model)
col_coeffs_sorted = regression_info

In [None]:
# Second element of every entry whose first element is positive.
# Iterating the sequence directly replaces the hard-coded length (488), which
# raised IndexError for shorter results and silently ignored any extra entries.
[entry[1] for entry in col_coeffs_sorted if entry[0] > 0]

In [None]:
# Run the project's regression report for the same fitted model on each split,
# in the order train, validation, test.
report_split = bike_share.eval.regression_results
report_split(X_train, y_train, model)
report_split(X_val, y_val, model)
report_split(X_test, y_test, model)