In [10]:
import pandas as pd
import requests

In [11]:
# URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00560/SeoulBikeData.csv"
URL = "./datasets/raw_data.csv"

In [12]:
orig_df = pd.read_csv(URL,encoding='latin1')
df = pd.DataFrame()
orig_df.columns

Index(['Date', 'Rented Bike Count', 'Hour', 'Temperature(°C)', 'Humidity(%)',
       'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(°C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day'],
      dtype='object')

In [4]:
orig_df["Date"] = pd.to_datetime(orig_df["Date"], dayfirst=True)

In [5]:
orig_df.dtypes

Date                         datetime64[ns]
Rented Bike Count                     int64
Hour                                  int64
Temperature(°C)                     float64
Humidity(%)                           int64
Wind speed (m/s)                    float64
Visibility (10m)                      int64
Dew point temperature(°C)           float64
Solar Radiation (MJ/m2)             float64
Rainfall(mm)                        float64
Snowfall (cm)                       float64
Seasons                              object
Holiday                              object
Functioning Day                      object
dtype: object

In [6]:
# Recreate data from study

# TODO: rename variables to remain consistent

# Include all numeric
df = pd.concat([df, orig_df.select_dtypes(include=["int64", "float64"])], join="outer")
rename_dict = {
    "Rented Bike Count":"Count",
    "Temperature(°C)":"Temp",
    "Humidity(%)":"Hum",
    "Wind speed (m/s)": "Wind",
    "Visibility (10m)": "Visb",
    "Dew point temperature(°C)":"Dew",
    "Solar Radiation (MJ/m2)": "Solar",
    "Rainfall(mm)":"Rainfall",
    "Snowfall (cm)":"Snow",
    "Functioning Day":"Fday"
}
# Pandas has week start on monday
weekday_list = ["Monday", "Tuesday", "Wedesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Sanity Check
# print(df["Weekend"].value_counts())
# print(df["DWeek"].value_counts())

df.rename(rename_dict,axis=1, inplace=True)

# Add variables from date 
df["DWeek"] = orig_df["Date"].dt.dayofweek.map(lambda x: weekday_list[x])
df["Weekend"] = orig_df["Date"].dt.dayofweek.map(lambda x: x ==5 or x==6)
df["Holiday"] = orig_df["Holiday"].map(lambda x: x == "Holiday")
df["Seasons"] = orig_df["Seasons"]
df

Unnamed: 0,Count,Hour,Temp,Hum,Wind,Visb,Dew,Solar,Rainfall,Snow,DWeek,Weekend,Holiday,Seasons
0,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Friday,False,False,Winter
1,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Friday,False,False,Winter
2,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Friday,False,False,Winter
3,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Friday,False,False,Winter
4,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Friday,False,False,Winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,1003,19,4.2,34,2.6,1894,-10.3,0.0,0.0,0.0,Friday,False,False,Autumn
8756,764,20,3.4,37,2.3,2000,-9.9,0.0,0.0,0.0,Friday,False,False,Autumn
8757,694,21,2.6,39,0.3,1968,-9.9,0.0,0.0,0.0,Friday,False,False,Autumn
8758,712,22,2.1,41,1.0,1859,-9.8,0.0,0.0,0.0,Friday,False,False,Autumn


In [7]:
df_encoded = pd.get_dummies(df, dtype="bool")
df_encoded.drop(["Dew"],axis=1, inplace=True) # High correlation with temp
df_encoded.drop(["Solar"],axis=1, inplace=True) # no need

In [8]:
df_encoded.dtypes

Count               int64
Hour                int64
Temp              float64
Hum                 int64
Wind              float64
Visb                int64
Rainfall          float64
Snow              float64
Weekend              bool
Holiday              bool
DWeek_Friday         bool
DWeek_Monday         bool
DWeek_Saturday       bool
DWeek_Sunday         bool
DWeek_Thursday       bool
DWeek_Tuesday        bool
DWeek_Wedesday       bool
Seasons_Autumn       bool
Seasons_Spring       bool
Seasons_Summer       bool
Seasons_Winter       bool
dtype: object

In [9]:
df_encoded.to_csv("./datasets/bike_data.csv")

In [12]:
df_encoded

Unnamed: 0,Count,Hour,Temp,Hum,Wind,Visb,Rainfall,Snow,Weekend,Holiday,...,DWeek_Monday,DWeek_Saturday,DWeek_Sunday,DWeek_Thursday,DWeek_Tuesday,DWeek_Wedesday,Seasons_Autumn,Seasons_Spring,Seasons_Summer,Seasons_Winter
0,254,0,-5.2,37,2.2,2000,0.0,0.0,False,False,...,False,False,False,False,False,False,False,False,False,True
1,204,1,-5.5,38,0.8,2000,0.0,0.0,False,False,...,False,False,False,False,False,False,False,False,False,True
2,173,2,-6.0,39,1.0,2000,0.0,0.0,False,False,...,False,False,False,False,False,False,False,False,False,True
3,107,3,-6.2,40,0.9,2000,0.0,0.0,False,False,...,False,False,False,False,False,False,False,False,False,True
4,78,4,-6.0,36,2.3,2000,0.0,0.0,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,1003,19,4.2,34,2.6,1894,0.0,0.0,False,False,...,False,False,False,False,False,False,True,False,False,False
8756,764,20,3.4,37,2.3,2000,0.0,0.0,False,False,...,False,False,False,False,False,False,True,False,False,False
8757,694,21,2.6,39,0.3,1968,0.0,0.0,False,False,...,False,False,False,False,False,False,True,False,False,False
8758,712,22,2.1,41,1.0,1859,0.0,0.0,False,False,...,False,False,False,False,False,False,True,False,False,False
