In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import urllib.request, json # we will need urllib to communicate with the api and then json to read the data we get

## Read in Data

In [2]:
# Per-city input frames, loaded from the pickles under ../0_data.
# NOTE(review): pickle files execute arbitrary code on load -- only read files
# produced by this project's own cleaning notebooks.
marburg, duisburg = map(
    pd.read_pickle,
    ("../0_data/marburg_clean.pkl", '../0_data/duisburg_clean.pkl'),
)

# Holiday tables; nrw_holidays is joined to Duisburg and he_holidays to Marburg below.
nrw_holidays, he_holidays = map(
    pd.read_pickle,
    ("../0_data/nrw_holidays.pkl", "../0_data/he_holidays.pkl"),
)

# Weather tables, merged onto the hourly demand frames by index below.
marburg_weather, duisburg_weather = map(
    pd.read_pickle,
    ("../0_data/weather/marburg_weather.pkl", "../0_data/weather/duisburg_weather.pkl"),
)

## Setting possible features

As possible features for predicting bicycle demand we calculate the time features _dayofyear_, _week_, _weekday_, _is_weekday_ and _hour_. Holidays may also play a role, so we add the feature _is_holiday_. Furthermore, we merge the weather data to get _temperature_, _precipitation_, _cloud_coverage_ and _wind_velocity_.

In [3]:
# Marburg: hourly demand = number of non-null 'day' entries per hourly bin.
ma = (
    pd.DataFrame(marburg.resample('H').count()["day"])
    .rename(columns={'day': 'demand'})
)

# Calendar features taken from the DatetimeIndex with vectorised accessors.
# Mapping a lambda over the index is fragile: element-wise, `Timestamp.weekday`
# is a method (not a value), so the column would hold bound methods and the
# `< 5` comparison below would fail; `.week` on a DatetimeIndex is also gone
# in pandas 2. `isocalendar().week` is the ISO week number, same as before.
ma['dayofyear'] = ma.index.dayofyear
ma['week'] = ma.index.isocalendar().week.to_numpy(dtype='int64')
ma['weekday'] = ma.index.weekday          # Monday=0 ... Sunday=6
ma['is_weekday'] = ma['weekday'] < 5      # True for Monday-Friday
ma['hour'] = ma.index.hour

# Index-on-index merge; pandas' default inner join keeps only the hours that
# are present in both the demand frame and the weather frame.
ma = ma.merge(marburg_weather, left_index=True, right_index=True)

In [4]:
# Align the holiday flag from he_holidays to the hourly index. Hours missing
# from the holiday table (fill_value) or holding a missing value (fillna) are
# treated as non-holidays, and the column is forced to a real bool dtype.
# The fill is restricted to this column on purpose: a frame-wide
# fillna(False) would also turn missing weather readings into False/0.
ma['is_holiday'] = (
    he_holidays['is_holiday']
    .reindex(ma.index, fill_value=False)
    .fillna(False)
    .astype(bool)
)

In [5]:
# Duisburg: hourly demand = number of non-null 'day' entries per hourly bin.
du = (
    pd.DataFrame(duisburg.resample('H').count()["day"])
    .rename(columns={'day': 'demand'})
)

# Calendar features taken from the DatetimeIndex with vectorised accessors.
# Mapping a lambda over the index is fragile: element-wise, `Timestamp.weekday`
# is a method (not a value), so the column would hold bound methods and the
# `< 5` comparison below would fail; `.week` on a DatetimeIndex is also gone
# in pandas 2. `isocalendar().week` is the ISO week number, same as before.
du['dayofyear'] = du.index.dayofyear
du['week'] = du.index.isocalendar().week.to_numpy(dtype='int64')
du['weekday'] = du.index.weekday          # Monday=0 ... Sunday=6
du['is_weekday'] = du['weekday'] < 5      # True for Monday-Friday
du['hour'] = du.index.hour

# Index-on-index merge; pandas' default inner join keeps only the hours that
# are present in both the demand frame and the weather frame.
du = du.merge(duisburg_weather, left_index=True, right_index=True)

In [6]:
# Align the holiday flag from nrw_holidays to the hourly index. Hours missing
# from the holiday table (fill_value) or holding a missing value (fillna) are
# treated as non-holidays, and the column is forced to a real bool dtype.
# The fill is restricted to this column on purpose: a frame-wide
# fillna(False) would also turn missing weather readings into False/0.
du['is_holiday'] = (
    nrw_holidays['is_holiday']
    .reindex(du.index, fill_value=False)
    .fillna(False)
    .astype(bool)
)

## Feature Selection with Lasso Regularization

In [7]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

In [8]:
# Single StandardScaler instance shared by both cities. Each city's feature
# cell below calls fit_transform on it, so it is re-fitted per city and
# afterwards only holds the statistics of whichever city was fitted last.
scaler = StandardScaler()

In [9]:
# Marburg: target vector and design matrix.
y_ma = ma['demand'].values

# Column order of X_ma: the eight standardised numeric features first,
# then the two unscaled flag columns (is_weekday, is_holiday).
X_ma = np.concatenate(
    (
        scaler.fit_transform(
            ma[['dayofyear','week', 'weekday', 'hour', 'temperature', 'precipitation', 'cloud_coverage', 'wind_velocity']].values
        ),
        ma[['is_weekday', 'is_holiday']].values,
    ),
    axis=1,
)

In [10]:
# Duisburg: target vector and design matrix.
y_du = du['demand'].values

# Column order of X_du: the eight standardised numeric features first,
# then the two unscaled flag columns (is_weekday, is_holiday).
X_du = np.concatenate(
    (
        scaler.fit_transform(
            du[['dayofyear','week', 'weekday', 'hour', 'temperature', 'precipitation', 'cloud_coverage', 'wind_velocity']].values
        ),
        du[['is_weekday', 'is_holiday']].values,
    ),
    axis=1,
)

In [11]:
#creating model
# Lasso = linear regression with an L1 penalty; alpha scales that penalty, so
# larger values push more coefficients to exactly zero.
# NOTE(review): alpha=0.5 is hand-picked, not tuned here -- confirm the choice.
# The same instance is re-fitted for each city in the cells below, so it only
# keeps the coefficients of the most recent fit.
selection_model = Lasso(alpha = 0.5)

In [12]:
#Marburg features
# Fit on the X_ma / y_ma pair that was extracted together from `ma`, so the
# target is guaranteed to match the rows of the design matrix even if `ma`
# is modified in between.
selection_model.fit(X_ma, y_ma)

# Coefficients follow the column order of X_ma: dayofyear, week, weekday,
# hour, temperature, precipitation, cloud_coverage, wind_velocity,
# is_weekday, is_holiday. A value of 0 means Lasso dropped that feature.
selection_model.coef_

array([ 0.        ,  0.54045036, -1.69769162,  3.46357418,  7.42037199,
       -0.8621322 , -0.        , -0.        ,  0.46587421, -0.        ])

In [13]:
#Duisburg features
# Fit on the X_du / y_du pair that was extracted together from `du`, so the
# target is guaranteed to match the rows of the design matrix even if `du`
# is modified in between.
selection_model.fit(X_du, y_du)

# Coefficients follow the column order of X_du: dayofyear, week, weekday,
# hour, temperature, precipitation, cloud_coverage, wind_velocity,
# is_weekday, is_holiday. A value of 0 means Lasso dropped that feature.
selection_model.coef_

array([ 0.        ,  2.40070401, -1.87670188,  1.60823414,  4.97359073,
       -0.47814174, -0.        ,  0.        ,  1.33925323, -0.        ])

Our feature selection shows that in both cities the Lasso coefficients of _dayofyear_, _cloud_coverage_, _wind_velocity_ and _is_holiday_ are shrunk to zero, i.e. these features are less important to our regression than the other features.

Therefore we will use _week_, _weekday_, _hour_, _temperature_, _precipitation_ and _is_weekday_ as features for our regressions.