# Predictive Analytics

## Preparation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Load the raw 2018 ride records and parse both timestamp columns.
Rides = pd.read_csv("chicago_2018.csv", sep=",")
for time_col in ("start_time", "end_time"):
    Rides[time_col] = pd.to_datetime(Rides[time_col])

# Put the rides into chronological order of their start, then preview them.
Rides = Rides.sort_values("start_time")
Rides.head(8)

The following cell is just for double checking the aggregated, hourly rental counts.

In [None]:
# Sanity check for the hourly aggregation: list every ride that started
# between 05:00 and 06:00 on 2018-01-01 so the count can be compared by hand.
date1 = datetime.datetime(year=2018, month=1, day=1, hour=5)
date2 = datetime.datetime(year=2018, month=1, day=1, hour=6)

# Half-open interval [date1, date2): a ride starting at exactly 06:00:00
# falls into the next hourly bin and must not be counted for this hour.
Rides[(Rides["start_time"] >= date1) & (Rides["start_time"] < date2)]

Computing the hourly demand for 2018. The result is stored in a DataFrame with one row per hour, i.e. 24 × 365 = 8760 rows (ignoring daylight-saving time changes).

In [None]:
# Aggregate the individual rides into hourly demand.
#
# The former version of this cell first built a placeholder frame from
# `date1` and a zero-filled "demand" column and then overwrote it on the very
# next line. That dead code is removed, so this cell only depends on `Rides`.
#
# resample("H").count() counts the non-null entries of every column per hourly
# bin of `start_time`; the count of `end_time` is kept as the number of rides
# and the per-column counts of the other ride attributes are dropped.
# NOTE(review): the bins run from the first to the last ride, so the frame has
# 8760 rows only if rides exist in the first and last hour of 2018 - confirm.
Features = (
    Rides.set_index("start_time")
    .resample("H")
    .count()
    .drop(
        columns=[
            "start_station_id",
            "end_station_id",
            "start_station_name",
            "end_station_name",
            "bike_id",
            "user_type",
        ]
    )
    .rename(columns={"end_time": "Rides"})
)

The resulting df can be double checked with one of the cells above or with the sorted Rides df. It seems reasonable though:

In [None]:
# Summary statistics of the hourly aggregated frame.
Features.describe()

In [None]:
# Preview of the first hourly bins.
Features.head()

In [None]:
# Ride count of the very first hourly bin.
x = Features["Rides"].iloc[0]
x

In [None]:
# Lag feature: the demand observed in the previous hourly bin.
# shift(1) moves every count down by one row. The first row has no
# predecessor and is filled with its own count, which is what the former
# row-by-row loop produced as well.
Features["Rides_last_hour"] = Features["Rides"].shift(
    1, fill_value=Features["Rides"].iloc[0]
)

The weather data set appears to contain erroneous rows: several rows share the same timestamp (e.g. at index 1662), which makes pandas crash unless these duplicates are removed.

In [None]:
# Load the hourly weather observations and derive the two weather features.
Weather = pd.read_csv("weather_hourly_chicago.csv", sep=",")

# Mean of the hourly maximum and minimum temperature.
Weather["avg_tmp"] = (Weather["max_temp"] + Weather["min_temp"]) / 2
# True where the precipitation flag equals 1 (missing values compare False).
Weather["is_raining"] = Weather["precip"] == 1
Weather = Weather.drop(columns=["max_temp", "min_temp", "precip"])

Weather["date_time"] = pd.to_datetime(Weather["date_time"])

# Keep only the first observation per timestamp. A timestamp that occurs more
# than once would otherwise match the same demand hour several times in the
# join below and duplicate that row of Features.
Weather = Weather.drop_duplicates(subset="date_time", keep="first")
Weather = Weather.set_index("date_time")

# Left join on the hourly timestamp; hours without a weather record get NaN.
Features = Features.join(Weather, on="start_time")

The weather data has many missing values. Imputation methods:

* numerical values: linear interpolation (time series data)
* categorical (is_raining): backward fill

In [None]:
# Impute the gaps left by the weather join.
# Numeric columns: linear interpolation between the neighbouring hours.
# Only numeric columns are interpolated; is_raining is handled by the fills.
numeric_cols = Features.select_dtypes(include="number").columns
Features[numeric_cols] = Features[numeric_cols].interpolate()

# Everything still missing (leading numeric gaps, is_raining) is filled
# backward. The forward fill afterwards closes gaps at the very end of the
# frame, which have no later value a backward fill could copy from.
Features = Features.bfill().ffill()

In [None]:
# Rows whose is_raining value is still missing; an empty result means the
# imputation left no gaps in this column.
Features[Features["is_raining"].isnull()]

In [None]:
# Number of hourly rows flagged as raining.
int((Features["is_raining"] == True).sum())

In [None]:
# Summary statistics again, now including the joined weather columns.
Features.describe()

In [None]:
# Turn the start_time index back into a regular column.
Features = Features.reset_index()

In [None]:
# Calendar features derived from the hourly timestamp, using the vectorised
# .dt accessor instead of calling a Python lambda once per row.
start_times = Features["start_time"].dt
Features["is_workday"] = start_times.weekday < 5  # Monday=0 ... Friday=4
Features["hour"] = start_times.hour
Features["month"] = start_times.month

In [None]:
def getSeason(month):
    """Map a month number to a season code.

    Returns 1 for winter (12, 1, 2), 2 for spring (3, 4, 5), 3 for summer
    (6, 7, 8) and 4 for fall (9, 10, 11). Any other value yields None.
    """
    season_months = (
        (1, (12, 1, 2)),
        (2, (3, 4, 5)),
        (3, (6, 7, 8)),
        (4, (9, 10, 11)),
    )
    for season_code, months in season_months:
        if month in months:
            return season_code
    return None
    
# Season code (1-4) for every hourly row, looked up from its month.
Features["season"] = Features["month"].apply(getSeason)

In [None]:
# One-hot encode the season code. The prefix "season_" plus the default "_"
# separator yields names such as "season__1"; the column for season 4 is
# left out (presumably as the reference level - confirm).
seasons = pd.get_dummies(Features["season"], prefix="season_").drop(columns="season__4")

In [None]:
# Attach the season dummy columns to the feature frame (aligned on the index).
Features[list(seasons.columns)] = seasons

In [None]:
# One-hot encode the hour of day. Column names get a double underscore
# ("hour__0", ...); the column for hour 23 is left out (presumably as the
# reference level - confirm).
hours = pd.get_dummies(Features["hour"], prefix="hour_").drop(columns="hour__23")

In [None]:
# Attach the hour-of-day dummy columns to the feature frame (aligned on the index).
Features[list(hours.columns)] = hours

In [None]:
# The raw calendar columns are now represented by dummy columns; remove them.
Features = Features.drop(columns=["season", "month", "hour"])

In [None]:
# Display the feature frame after encoding.
Features

Re-Scaling the data

In [None]:
def _min_max_scale(series):
    """Linearly rescale a numeric Series to the range [0, 1].

    A constant Series has a value range of zero; it is mapped to all zeros
    instead of dividing by zero.
    """
    low = series.min()
    span = series.max() - low
    if span == 0:
        return series * 0.0
    return (series - low) / span


# Continuous columns that exist in Features at this point.
# The former cell also referenced Max_temp, Min_temp, Precipitation,
# Day_of_Week, Hour, Month and Season. None of the cells above creates columns
# with these names, so the cell stopped with a KeyError after the first two.
RESCALE_COLUMNS = ["Rides", "Rides_last_hour", "avg_tmp"]

Features_rescaled = pd.DataFrame(
    {column: _min_max_scale(Features[column]) for column in RESCALE_COLUMNS}
)

Features_rescaled.head()

In [None]:

# Overlay normalised demand and normalised temperature on one time axis.
# The temperature trace is scaled here from Features["avg_tmp"]: the former
# code read Features_rescaled["Max_temp"], a column that is never created.
avg_tmp = Features["avg_tmp"]
avg_tmp_scaled = (avg_tmp - avg_tmp.min()) / (avg_tmp.max() - avg_tmp.min())

fig = go.Figure()
fig.add_trace(go.Scatter(x=Features["start_time"], y=Features_rescaled["Rides"],
                    mode='lines',
                    name='Demand'))
fig.add_trace(go.Scatter(x=Features["start_time"], y=avg_tmp_scaled,
                    mode='lines+markers',
                    name='Avg. Temperature'))

fig.show()

In [None]:
# Hexbin joint distribution of average temperature and hourly ride count.
# NOTE(review): `palette` normally applies to hue levels and no hue is set
# here - check whether `cmap="magma"` was intended.
sns.jointplot(x = "avg_tmp", y = "Rides", data = Features, kind = "hex", height=10, palette = "magma")

In [None]:
# Pairwise relationships between the continuous variables, split by rain.
# The plot is restricted to these three columns: Features also holds all the
# season and hour dummy columns, and a pair plot over every column would draw
# a grid with hundreds of panels.
sns.pairplot(
    Features,
    vars=["Rides", "Rides_last_hour", "avg_tmp"],
    hue="is_raining",
    palette="magma",
    height=3,
)
plt.show()

In [None]:
# Covariance matrix of the numeric and boolean columns.
# The columns are selected explicitly because recent pandas versions no longer
# drop non-numeric columns (such as the start_time timestamps) on their own.
Features.select_dtypes(include=["number", "bool"]).cov()

In [None]:
# Pearson correlation matrix of the numeric and boolean columns.
# The columns are selected explicitly because recent pandas versions no longer
# drop non-numeric columns (such as the start_time timestamps) on their own.
Features_corr = Features.select_dtypes(include=["number", "bool"]).corr()
Features_corr

In [None]:
# Heatmap of the correlation matrix, with the column names on both axes.
corr_labels = Features_corr.columns
sns.heatmap(Features_corr, xticklabels=corr_labels, yticklabels=corr_labels)

[Source](https://stackoverflow.com/questions/39409866/correlation-heatmap) for the following code:

In [None]:
# Diverging colour map used for the styled correlation table below.
cmap = sns.diverging_palette(5, 250, as_cmap=True)

def magnify():
    """Return the CSS table styles for the correlation table.

    Each entry is a dict with a CSS "selector" and a list of (property, value)
    "props": small header font, no cell padding, and enlarged text for the
    header or cell under the mouse pointer.
    """
    css_rules = (
        ("th", [("font-size", "7pt")]),
        ("td", [("padding", "0em 0em")]),
        ("th:hover", [("font-size", "12pt")]),
        ("tr:hover td:hover", [("max-width", "200px"), ("font-size", "12pt")]),
    )
    return [{"selector": selector, "props": props} for selector, props in css_rules]

# Render the correlation matrix as a colour-graded table with hover zoom.
# Values are shown with two decimals through Styler.format, because
# Styler.set_precision is no longer available in recent pandas versions.
(
    Features_corr.style.background_gradient(cmap, axis=1)
    .set_properties(**{'max-width': '80px', 'font-size': '10pt'})
    .set_caption("Hover to magnify")
    .format("{:.2f}")
    .set_table_styles(magnify())
)

In [None]:
# Per-column variance of the rescaled features.
Features_rescaled.var()

In [None]:
# Small contiguous window of 120 hourly rows (positions 3100-3219) so the
# scatter plots stay readable.
Features_sample = Features.iloc[3100:3220]

# Drop rows with an average temperature of exactly 0.
# NOTE(review): presumably 0 is treated as a placeholder reading - confirm.
# The mask is built from the sample itself; a mask taken from the full
# Features frame does not line up with the 120-row slice.
Features_sample = Features_sample[Features_sample["avg_tmp"] != 0].copy()

sns.scatterplot(x="avg_tmp", y="Rides", hue="is_raining", data=Features_sample)

In [None]:
# 25 x 25 binned density of average temperature against hourly ride count,
# coloured on a white-to-red scale.
white_to_red = [[0.0, 'white'], [1.0, 'red']]
fig = px.density_heatmap(
    Features,
    x='avg_tmp',
    y='Rides',
    nbinsx=25,
    nbinsy=25,
    width=600,
    height=600,
    title='Correlation between Temperature and Demand',
    color_continuous_scale=white_to_red,
)
fig.show()

In [None]:
# Parallel-coordinates view of the continuous variables.
# The former call passed dimensions=[""], which names no existing column.
fig = px.parallel_coordinates(
    data_frame=Features,
    dimensions=["Rides", "Rides_last_hour", "avg_tmp"],
)

fig.show()

In [None]:
# 3-D view of demand against hour of day and temperature, coloured by rain.
# The former call referenced Max_temp, Hour and Precipitation, which are not
# columns of the feature frame. The hour of day is rebuilt from start_time
# because the raw hour column was dropped after one-hot encoding.
sample_with_hour = Features_sample.assign(hour=Features_sample["start_time"].dt.hour)
fig = px.scatter_3d(data_frame=sample_with_hour, z='avg_tmp', x='Rides', y='hour', color='is_raining', opacity=0.4, size_max=5)

fig.show()

In [None]:
# 3-D view of demand against the previous hour's demand and temperature,
# coloured by rain. The former call referenced Max_temp and Precipitation,
# which are not columns of the feature frame.
fig = px.scatter_3d(data_frame=Features_sample, z='avg_tmp', x='Rides', y='Rides_last_hour', color='is_raining', opacity=0.4, size_max=5)

fig.show()

# Modeling

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target (hourly ride count) from the predictors; the timestamp
# column is not used as a predictor. 30 % of the rows are held out for
# testing, with a fixed seed so the split is reproducible.
excluded_columns = Features.columns.isin(["Rides", "start_time"])
X = Features.loc[:, ~excluded_columns]
y = Features["Rides"]

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X

In [None]:
# Display the four parts of the split as a tuple.
x_train, x_test, y_train, y_test

The modeling work starts here...