## Bike Sharing

Luis Garduno

-------------------------------------
    
Dataset [UCI Machine Learning Repository]: __[Bike Sharing Dataset](https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip)__

Question Of Interest : Can you forecast the bike rental demand for the Washington DC Bike Sharing program?
    
-------------------------------------

## 1. Data Overview

### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 1.1 Data Preparation
#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 1.1.1 Data Description

In [None]:
import os
import numpy as np
import pandas as pd

# Read the bike-sharing records from disk into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/bikes/hour.csv')

# Print each column's name, dtype and non-null count.
df.info()

In [None]:
# Display summary statistics for the columns of `df`.
df.describe()


#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 1.1.2 Normalizing the Dataset

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# --- Categorical / discrete feature columns ---
# These are the columns the disabled pipeline draft below would one-hot encode.
cat_attribs = [
    'season',
    'mnth',
    'hr',
    'holiday',
    'weekday',
    'workingday',
    'weathersit',
]

# --- Numeric / continuous feature columns ---
# These are the columns the disabled pipeline draft below would standard-scale.
num_attribs = [
    'temp',
    'atemp',
    'hum',
    'windspeed',
]

# Draft preprocessing pipeline (not executed yet):
#num_pipeline = Pipeline([
#    ('std_scaler', StandardScaler()),
#])
#full_pipeline = ColumnTransformer([
#    ("num", num_pipeline, num_attribs),
#    ("cat", OneHotEncoder(), cat_attribs),
#])
#bike_prepared = full_pipeline.fit_transform(df)



#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 1.1.3 Data Quality

In [None]:
import missingno as mn

# Visualise where values are missing, one column per attribute.
mn.matrix(df, figsize=(12, 5))

# Count unique values in column 'instant' (record index) of the dataframe
print('\nNumber of unique values in column "instant" : ', df['instant'].nunique())

# Treat -1 as a missing-value marker before comparing rows.
dup_df = df.replace(to_replace=-1, value=np.nan)

# Leave the record index out of the comparison: if 'instant' differs on every
# row, including it would make every row unique and the check would always
# report zero duplicates.
dup_df = dup_df.drop(columns=['instant']).duplicated()
print('Duplicates : ', int(dup_df.sum()), "\n")

---------------------------------------------

#### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 1.1.4 Cleaning the Dataset

In [None]:
# Cleaning step, currently disabled: it would remove the 'instant', 'dteday',
# 'casual' and 'registered' columns from `df` and print the resulting schema.
# NOTE(review): while this stays commented out, all four columns remain in
# `df` for the cells that follow — confirm whether that is intended.
#df = df.drop(['instant', 'dteday','casual', 'registered'], axis=1)
#df.info()

--------------------------------------


### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 1.2 Creating Training & Test Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows of `df` as a test set. The whole frame is split
# (target column included), and the fixed random_state keeps the partition
# the same on every run.
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Report how many rows landed in each partition.
print("Train:", train_set.shape[0], "\nTest :", test_set.shape[0])

-----------------------

## 2. Data Visualization

### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 2.1 Correlation Matrix 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Apply the seaborn theme. sns.set() returns None, so its result is not kept.
sns.set(style="darkgrid", palette="colorblind")

# Columns shown in the matrix: every feature plus the target.
attribs = num_attribs + cat_attribs + ['cnt']

# Compute the pairwise correlations once and reuse the result.
corr_matrix = df[attribs].corr()

# Draw a single annotated heatmap on an explicit Axes. seaborn labels the
# ticks from the matrix's row/column names, so no manual tick setup or
# underlying pcolor layer is needed.
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(corr_matrix, annot=True, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

It is observed that the variables <code>atemp</code> & <code>temp</code> are heavily correlated.

Because <code>atemp</code> only represents what the outside temperature _feels_ like, it adds little information beyond <code>temp</code> and is a candidate to be dropped before modeling.

In [None]:
def _unit_bin_edges(values):
    """Return bin edges one unit wide, centred on each integer in the data range.

    Used for integer-valued columns so that every distinct value gets its own
    bin instead of being merged unevenly with its neighbours.
    """
    return np.arange(values.min() - 0.5, values.max() + 1.0)


def _hist2d_row(panels, figsize=(22, 5)):
    """Draw one figure holding a 2-D histogram (with colorbar) per panel.

    Parameters
    ----------
    panels : list of (x, y, xlabel, ylabel, bins) tuples
        Data and labels for each panel, left to right. `bins` is passed
        straight to ``Axes.hist2d``.
    figsize : tuple, optional
        Size of the whole figure in inches.
    """
    # Create all panels up front. Mixing plt.subplots() with later
    # plt.subplot(1, 3, k) calls can leave an extra full-figure Axes behind
    # the panels on Matplotlib versions that no longer auto-remove
    # overlapping Axes.
    fig, axes = plt.subplots(1, len(panels), figsize=figsize, squeeze=False)
    for ax, (x, y, xlabel, ylabel, bins) in zip(axes[0], panels):
        *_, image = ax.hist2d(x, y, bins=bins)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        fig.colorbar(image, ax=ax)
    plt.show()


# Not used by the plots below; retained so the notebook namespace holds the
# same value after this cell as before the rewrite.
attribs = ['hr', 'weekday', 'cnt']

# 'hr' and 'weekday' are plotted with one bin per value: 15 equal-width bins
# over an integer-valued axis would cover unequal numbers of values and
# produce artificial stripes. 'cnt' keeps 15 bins.
hr_edges = _unit_bin_edges(df.hr)
weekday_edges = _unit_bin_edges(df.weekday)
CNT_BINS = 15

# ------ Row 1 ------
_hist2d_row([
    (df.hr, df.cnt, "Hour", "Count", [hr_edges, CNT_BINS]),
    (df.weekday, df.cnt, "Weekday", "Count", [weekday_edges, CNT_BINS]),
    (df.hr, df.weekday, "Hour", "Weekday", [hr_edges, weekday_edges]),
])

# ------ Row 2 ------
# TODO: all three panels repeat the Hour/Count histogram from row 1; replace
# them with the intended variable pairs.
_hist2d_row([(df.hr, df.cnt, "Hour", "Count", [hr_edges, CNT_BINS])] * 3)

In [None]:
# Mean rental count for every (weekday, hour) pair, laid out as a
# weekday x hour grid. Only 'cnt' is aggregated, so non-numeric columns in
# `df` cannot break the mean, and unstack() replaces the positional
# DataFrame.pivot() call that newer pandas versions reject.
df_2 = (
    df.groupby(['weekday', 'hr'])['cnt']
      .mean()
      .unstack('hr')
)

# Tick labels for the weekday axis; also reused by later plotting cells.
wkdays = ['Su', 'Mon', 'Tu', 'Wed', 'Th', 'Fri', 'Sat']

f, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(
    df_2,
    cmap="RdYlGn_r",
    square=True,
    linewidths=0.5,
    cbar_kws={'label': 'Count'},
    ax=ax,
)
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Day of Week')
ax.set_yticklabels(wkdays, rotation=0)
plt.show()

In [None]:
# One strip-plot facet per season: rental count against day of week, with
# points coloured by weather situation.
g = sns.catplot(
    data=df,
    x="weekday",
    y="cnt",
    col="season",
    hue="weathersit",
    kind="strip",
)

# Label and tidy the grid. `wkdays` is the weekday label list defined in an
# earlier cell.
g.set_axis_labels("Day of Week", "Count")
g.set_titles("{col_var} {col_name}")
g.set_xticklabels(wkdays)
g.set(ylim=(0, 1000))
g.despine(left=True)

In [None]:
# Distribution of rental counts per day of week, split by the working-day flag.
f, ax = plt.subplots(figsize=(20, 5))

# Draw onto the Axes created above rather than relying on the implicit
# "current axes".
sns.violinplot(x="weekday", y="cnt", hue="workingday", data=df, ax=ax)

ax.set_xlabel('Day of Week')
ax.set_ylabel('Count')
# `wkdays` is the weekday label list defined in an earlier cell.
ax.set_xticklabels(wkdays, rotation=0)
# Descriptive title in place of the former placeholder text.
ax.set_title('Rental Count Distribution by Day of Week and Working Day')
plt.show()

In [None]:
# Columns shown in the pair plot; 'weathersit' is only used for colouring.
jitter_values = ['hr', 'weekday', 'cnt', 'weathersit']
df_jitter = df[jitter_values].copy()

# Add uniform noise in [0, 0.5) to the plotted columns so that overlapping
# points spread out. A seeded generator makes the figure identical on every
# run.
jitter_cols = ['hr', 'weekday', 'cnt']
rng = np.random.default_rng(42)
jitter = rng.random((len(df_jitter), len(jitter_cols))) / 2
df_jitter[jitter_cols] = df_jitter[jitter_cols] + jitter

# NOTE: despite the name, `ax` holds the grid object returned by pairplot.
ax = sns.pairplot(
    df_jitter,
    hue="weathersit",
    height=2,
    plot_kws=dict(s=20, alpha=0.15, linewidth=0),
    palette=['orange', 'blue', 'green', 'red'],
)

--------------------------

## 3. Modeling

### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 3.1 Training

-------------


### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 3.2 Testing