In [1]:
import pandas as pd
import requests

In [2]:
# URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv"
URL = "./datasets/raw_data.csv"

TEST_PERCENTAGE = 0.3


In [3]:
orig_df = pd.read_csv(URL,encoding='latin1')
df = pd.DataFrame()
orig_df.columns

Index(['Date', 'Rented Bike Count', 'Hour', 'Temperature(°C)', 'Humidity(%)',
       'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(°C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day'],
      dtype='object')

In [4]:
orig_df["Date"] = pd.to_datetime(orig_df["Date"], dayfirst=True)

In [5]:
orig_df.dtypes

Date                         datetime64[ns]
Rented Bike Count                     int64
Hour                                  int64
Temperature(°C)                     float64
Humidity(%)                           int64
Wind speed (m/s)                    float64
Visibility (10m)                      int64
Dew point temperature(°C)           float64
Solar Radiation (MJ/m2)             float64
Rainfall(mm)                        float64
Snowfall (cm)                       float64
Seasons                              object
Holiday                              object
Functioning Day                      object
dtype: object

In [6]:
# Recreate data from study

# TODO: rename variables to remain consistent

# Include all numeric
df = pd.concat([df, orig_df.select_dtypes(include=["int64", "float64"])], join="outer")
rename_dict = {
    "Rented Bike Count":"Count",
    "Temperature(°C)":"Temp",
    "Humidity(%)":"Hum",
    "Wind speed (m/s)": "Wind",
    "Visibility (10m)": "Visb",
    "Dew point temperature(°C)":"Dew",
    "Solar Radiation (MJ/m2)": "Solar",
    "Rainfall(mm)":"Rainfall",
    "Snowfall (cm)":"Snow",
    "Functioning Day":"Fday"
}
# Pandas has week start on monday
weekday_list = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Sanity Check
# print(df["Weekend"].value_counts())
# print(df["DWeek"].value_counts())

df.rename(rename_dict,axis=1, inplace=True)

# Add variables from date 
df["DWeek"] = orig_df["Date"].dt.dayofweek.map(lambda x: weekday_list[x])
df["Weekend"] = orig_df["Date"].dt.dayofweek.map(lambda x: x ==5 or x==6)
df["Holiday"] = orig_df["Holiday"].map(lambda x: x == "Holiday")
df["Seasons"] = orig_df["Seasons"]
df["Fday"] = orig_df["Functioning Day"].map(lambda x: x == "Yes")
df

Unnamed: 0,Count,Hour,Temp,Hum,Wind,Visb,Dew,Solar,Rainfall,Snow,DWeek,Weekend,Holiday,Seasons,Fday
0,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Friday,False,False,Winter,True
1,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Friday,False,False,Winter,True
2,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Friday,False,False,Winter,True
3,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Friday,False,False,Winter,True
4,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Friday,False,False,Winter,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,1003,19,4.2,34,2.6,1894,-10.3,0.0,0.0,0.0,Friday,False,False,Autumn,True
8756,764,20,3.4,37,2.3,2000,-9.9,0.0,0.0,0.0,Friday,False,False,Autumn,True
8757,694,21,2.6,39,0.3,1968,-9.9,0.0,0.0,0.0,Friday,False,False,Autumn,True
8758,712,22,2.1,41,1.0,1859,-9.8,0.0,0.0,0.0,Friday,False,False,Autumn,True


In [7]:
import seaborn as sn
import matplotlib.pyplot as plt
def show_heatmap():
    #variables for reshaping seaborn heatmap 
    fig_dims = (20, 20)
    fig, ax = plt.subplots(figsize=fig_dims)

    corr = df.corr()

    sn.heatmap(corr, annot=True, ax=ax)
    plt.show()
# show_heatmap()

In [8]:
df_encoded = pd.get_dummies(df, dtype="bool")
df_encoded.drop(["Dew"],axis=1, inplace=True) # High correlation with temp
df_encoded.drop(["Solar"],axis=1, inplace=True) # no need

In [9]:
df_encoded.dtypes

Count                int64
Hour                 int64
Temp               float64
Hum                  int64
Wind               float64
Visb                 int64
Rainfall           float64
Snow               float64
Weekend               bool
Holiday               bool
Fday                  bool
DWeek_Friday          bool
DWeek_Monday          bool
DWeek_Saturday        bool
DWeek_Sunday          bool
DWeek_Thursday        bool
DWeek_Tuesday         bool
DWeek_Wednesday       bool
Seasons_Autumn        bool
Seasons_Spring        bool
Seasons_Summer        bool
Seasons_Winter        bool
dtype: object

In [10]:
df_encoded.to_csv("./datasets/bike_data.csv")

In [11]:
df = df_encoded.copy()
df

Unnamed: 0,Count,Hour,Temp,Hum,Wind,Visb,Rainfall,Snow,Weekend,Holiday,...,DWeek_Monday,DWeek_Saturday,DWeek_Sunday,DWeek_Thursday,DWeek_Tuesday,DWeek_Wednesday,Seasons_Autumn,Seasons_Spring,Seasons_Summer,Seasons_Winter
0,254,0,-5.2,37,2.2,2000,0.0,0.0,False,False,...,False,False,False,False,False,False,False,False,False,True
1,204,1,-5.5,38,0.8,2000,0.0,0.0,False,False,...,False,False,False,False,False,False,False,False,False,True
2,173,2,-6.0,39,1.0,2000,0.0,0.0,False,False,...,False,False,False,False,False,False,False,False,False,True
3,107,3,-6.2,40,0.9,2000,0.0,0.0,False,False,...,False,False,False,False,False,False,False,False,False,True
4,78,4,-6.0,36,2.3,2000,0.0,0.0,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,1003,19,4.2,34,2.6,1894,0.0,0.0,False,False,...,False,False,False,False,False,False,True,False,False,False
8756,764,20,3.4,37,2.3,2000,0.0,0.0,False,False,...,False,False,False,False,False,False,True,False,False,False
8757,694,21,2.6,39,0.3,1968,0.0,0.0,False,False,...,False,False,False,False,False,False,True,False,False,False
8758,712,22,2.1,41,1.0,1859,0.0,0.0,False,False,...,False,False,False,False,False,False,True,False,False,False


## Setup Train and Test

In [12]:
from sklearn.model_selection import train_test_split

X = df[df["Hum"] != 0].drop("Count", axis=1).values
y = df[df["Hum"] != 0]["Count"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_PERCENTAGE, random_state=42)

## Normalize

In [13]:
from joblib import dump, load
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
# Save MMS for later use
dump(scaler, './models/min_max_scaler.joblib') 

X_test = scaler.transform(X_test)

In [14]:
train_df = pd.DataFrame(X_train, columns=df.drop("Count",axis=1).columns)
test_df = pd.DataFrame(X_test, columns=df.drop("Count",axis=1).columns)

# Reset indices for data
train_df.reset_index(inplace=True, drop=True)
test_df.reset_index(inplace=True, drop=True)

# Reset indices for dependent var
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)


train_df["Count"] = y_train
test_df["Count"] = y_test

In [15]:
train_df

Unnamed: 0,Hour,Temp,Hum,Wind,Visb,Rainfall,Snow,Weekend,Holiday,Fday,...,DWeek_Saturday,DWeek_Sunday,DWeek_Thursday,DWeek_Tuesday,DWeek_Wednesday,Seasons_Autumn,Seasons_Spring,Seasons_Summer,Seasons_Winter,Count
0,0.913043,0.267606,0.310345,0.310811,1.000000,0.000000,0.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,133
1,0.217391,0.517606,0.839080,0.202703,0.165226,0.000000,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,223
2,0.260870,0.367958,0.850575,0.202703,0.102186,0.000000,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,63
3,0.739130,0.441901,0.137931,0.391892,0.836299,0.000000,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1099
4,0.434783,0.359155,0.126437,0.270270,0.994408,0.000000,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6115,0.652174,0.816901,0.862069,0.297297,0.792578,0.247458,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,468
6116,0.000000,0.721831,0.620690,0.256757,1.000000,0.000000,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1075
6117,0.304348,0.746479,0.804598,0.040541,0.638536,0.000000,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,537
6118,0.869565,0.248239,0.459770,0.148649,0.690391,0.000000,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,322


In [16]:
test_df

Unnamed: 0,Hour,Temp,Hum,Wind,Visb,Rainfall,Snow,Weekend,Holiday,Fday,...,DWeek_Saturday,DWeek_Sunday,DWeek_Thursday,DWeek_Tuesday,DWeek_Wednesday,Seasons_Autumn,Seasons_Spring,Seasons_Summer,Seasons_Winter,Count
0,0.608696,0.845070,0.551724,0.270270,1.000000,0.00339,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,740
1,0.043478,0.566901,0.666667,0.081081,1.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
2,0.608696,0.772887,0.367816,0.283784,0.971530,0.00000,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1027
3,0.782609,0.174296,0.287356,0.445946,1.000000,0.00000,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,392
4,0.521739,0.628521,0.045977,0.216216,1.000000,0.00000,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2618,1.000000,0.290493,0.505747,0.256757,1.000000,0.00000,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,169
2619,0.913043,0.380282,0.505747,0.162162,0.719370,0.00000,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,377
2620,0.173913,0.112676,0.448276,0.162162,0.992883,0.00000,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,33
2621,0.869565,0.205986,0.379310,0.283784,0.996950,0.00000,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,218


In [17]:
train_df.to_csv("./datasets/bike_train.csv")

In [18]:
test_df.to_csv("./datasets/bike_test.csv")

In [19]:
X_total = scaler.transform(df[df["Hum"] != 0].drop("Count",axis=1).values)
bike_data_normalized = pd.DataFrame(X_total, columns=df.drop("Count",axis=1).columns)
bike_data_normalized["Count"] = df[df["Hum"] != 0]["Count"]
bike_data_normalized.reset_index(drop=True, inplace=True)
bike_data_normalized.to_csv("./datasets/bike_data_normalized.csv")
bike_data_normalized

Unnamed: 0,Hour,Temp,Hum,Wind,Visb,Rainfall,Snow,Weekend,Holiday,Fday,...,DWeek_Saturday,DWeek_Sunday,DWeek_Thursday,DWeek_Tuesday,DWeek_Wednesday,Seasons_Autumn,Seasons_Spring,Seasons_Summer,Seasons_Winter,Count
0,0.000000,0.216549,0.298851,0.297297,1.000000,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,254.0
1,0.043478,0.211268,0.310345,0.108108,1.000000,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,204.0
2,0.086957,0.202465,0.321839,0.135135,1.000000,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,173.0
3,0.130435,0.198944,0.333333,0.121622,1.000000,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,107.0
4,0.173913,0.202465,0.287356,0.310811,1.000000,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,78.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8738,0.826087,0.382042,0.264368,0.351351,0.946111,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,300.0
8739,0.869565,0.367958,0.298851,0.310811,1.000000,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,213.0
8740,0.913043,0.353873,0.321839,0.040541,0.983732,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,116.0
8741,0.956522,0.345070,0.344828,0.135135,0.928317,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,149.0


In [20]:
bike_data_normalized.to_csv("./datasets/bike_data_normalized.csv")

In [21]:
bike_data_normalized.columns

Index(['Hour', 'Temp', 'Hum', 'Wind', 'Visb', 'Rainfall', 'Snow', 'Weekend',
       'Holiday', 'Fday', 'DWeek_Friday', 'DWeek_Monday', 'DWeek_Saturday',
       'DWeek_Sunday', 'DWeek_Thursday', 'DWeek_Tuesday', 'DWeek_Wednesday',
       'Seasons_Autumn', 'Seasons_Spring', 'Seasons_Summer', 'Seasons_Winter',
       'Count'],
      dtype='object')

In [22]:
X_total = scaler.transform(df.drop("Count",axis=1).values)
time_series_data = pd.DataFrame(X_total, columns=df.drop("Count",axis=1).columns)
time_series_data["Count"] = df["Count"]
time_series_data.to_csv("./datasets/time_series_data.csv")
time_series_data

Unnamed: 0,Hour,Temp,Hum,Wind,Visb,Rainfall,Snow,Weekend,Holiday,Fday,...,DWeek_Saturday,DWeek_Sunday,DWeek_Thursday,DWeek_Tuesday,DWeek_Wednesday,Seasons_Autumn,Seasons_Spring,Seasons_Summer,Seasons_Winter,Count
0,0.000000,0.216549,0.298851,0.297297,1.000000,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,254
1,0.043478,0.211268,0.310345,0.108108,1.000000,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,204
2,0.086957,0.202465,0.321839,0.135135,1.000000,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,173
3,0.130435,0.198944,0.333333,0.121622,1.000000,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,107
4,0.173913,0.202465,0.287356,0.310811,1.000000,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,0.826087,0.382042,0.264368,0.351351,0.946111,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1003
8756,0.869565,0.367958,0.298851,0.310811,1.000000,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,764
8757,0.913043,0.353873,0.321839,0.040541,0.983732,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,694
8758,0.956522,0.345070,0.344828,0.135135,0.928317,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,712
