In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction import DictVectorizer

## Preprocessing

In [4]:
# Load the interim reference dataset; the relative path is resolved against
# the notebook's current working directory.
REF_DATA_PATH = '../data/interim/ref_data.csv'
ref_data = pd.read_csv(REF_DATA_PATH)

In [5]:
# Normalise the column titles: lower-case them, remove any parenthesised part
# (the unit of measurement) together with the whitespace around it, then turn
# the remaining spaces into underscores, e.g. "Speed (m/s)" -> "speed".
# The pattern is a raw string so that `\s` and `\(` reach the regex engine
# untouched instead of relying on Python's deprecated handling of unknown
# string escapes; the space -> underscore step is an explicit literal replace.
ref_data.columns = (
    ref_data.columns
    .str.lower()
    .str.replace(r"\s*\(.*\)\s*", "", regex=True)
    .str.replace(' ', '_', regex=False)
)

In [6]:
# From the date column (string, day-first format) create a new column with
# progressive day numbers (int type).
parsed_dates = pd.to_datetime(ref_data['date'], dayfirst=True)

# Dense rank of the parsed dates: the earliest date gets 1, the next distinct
# date 2, and so on, with no gaps even when calendar days are missing.
# Ranking the dates themselves keeps the numbering chronological regardless of
# the order of the rows in the file (enumerating `unique()` values would follow
# order of first appearance instead).
# Assumes every row has a date: a missing value would rank as NaN and make the
# integer cast fail.
ref_data['day_number'] = parsed_dates.rank(method='dense').astype('int64')

In [7]:
# Add a column holding the full weekday name (strftime '%A') of each row's date.
parsed_dates = pd.to_datetime(ref_data['date'], dayfirst=True)
ref_data['weekday'] = parsed_dates.dt.strftime('%A')

In [8]:
# The raw date string is no longer needed once day_number and weekday have
# been derived from it.
ref_data = ref_data.drop(columns=['date'])

In [9]:
# Keep only the rows of functioning days. The relation is deterministic
# (not functioning -> no rented bike for that day), so the other rows are
# filtered out rather than modelled.
is_functioning_day = ref_data['functioning_day'] == 'Yes'
ref_data = ref_data.loc[is_functioning_day]

In [10]:
# After the filter on functioning days the column holds a single value
# ('Yes'), so it is removed from the frame.
ref_data = ref_data.drop(columns=['functioning_day'])

In [11]:
# Separate the regression target (as a NumPy array) from the feature table
# (every remaining column).
target_column = 'rented_bike_count'
y = ref_data[target_column].values
X = ref_data.drop(columns=target_column)

### Baseline model

In [12]:
# Qualitative, manual feature selection
selected_features = [
    'temperature',
    'humidity',
    'hour',
    'day_number',
    'rainfall',
    'seasons',
    'weekday',
]
X_sel = X[selected_features]

In [13]:
# Split the selected feature names by dtype.
# `include='number'` matches every numeric dtype rather than only the
# hard-coded int64/float64 pair, and every remaining column is treated as
# categorical rather than only `object` ones, so no selected feature can be
# silently left out of both lists because of its dtype (e.g. int32, or a
# dedicated string dtype instead of object).
numerical = list(X_sel.select_dtypes(include='number').columns)
categorical = list(X_sel.select_dtypes(exclude='number').columns)

In [14]:
# Hold out a validation set using train_test_split's default split size.
# Stratifying on the season labels keeps the seasonal mix comparable between
# the two parts; the fixed random_state makes the split repeatable.
season_labels = X['seasons']
df2_train, df2_val, y_train, y_val = train_test_split(
    X_sel,
    y,
    stratify=season_labels,
    random_state=42,
)

In [15]:
# Turn the feature frames into dense matrices with a DictVectorizer.
# Column order handed to the vectorizer: categorical features first, then numerical.
feature_columns = categorical + numerical

dv = DictVectorizer(sparse=False)

# The vectorizer is fitted on the training records only ...
train_dicts = df2_train[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# ... and the fitted mapping is then applied unchanged to the validation records.
val_dicts = df2_val[feature_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [16]:
# Baseline: plain linear regression fitted on the vectorised training features.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions for the held-out validation rows.
y_pred = lr.predict(X_val)

# Root mean squared error on the validation set, computed as sqrt(MSE).
# The `squared=False` shortcut of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in later releases; taking the square root
# explicitly gives the same number on every version.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmse

436.1865854091941