# Bike Sharing Demand
## Michał Binda, Mikołaj Mróz, Paweł Swiderski

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pylab
import calendar
import seaborn as sn
from scipy import stats
# import missingno as msno
from datetime import datetime
import warnings
%matplotlib inline

In [2]:
# Load the training data; the first line of the file supplies the column names.
df_train = pd.read_csv("train.csv", header=0)

In [11]:
# Split features from the target by column name rather than by position:
# `iloc[:, :-1]` only removes "count" while it is the last column, which stops
# being true once the preprocessing helpers have appended date/hour/weekday/
# month to df_train in place and this cell is run again.
# NOTE(review): X_train still contains "casual" and "registered"; in the rows
# previewed in this notebook they add up to "count", so they presumably leak
# the target and should be excluded before fitting a model - confirm.
X_train = df_train.drop(columns=["count"])
y_train = df_train["count"]

## Data exploration

In [14]:
# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


#### **Data Fields**

* datetime - hourly date + timestamp  
* season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
* holiday - whether the day is considered a holiday
* workingday - whether the day is neither a weekend nor holiday
* weather - 
    * 1: Clear, Few clouds, Partly cloudy
    * 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    * 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    * 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog 
* temp - temperature in Celsius
* atemp - "feels like" temperature in Celsius
* humidity - relative humidity
* windspeed - wind speed
* casual - number of non-registered user rentals initiated
* registered - number of registered user rentals initiated
* count - number of total rentals (Dependent Variable)

In [50]:
# (rows, columns) of df_train.
# NOTE(review): the saved output reports 19 columns although the raw preview
# shows 12, so this cell was presumably last run after later cells had added
# columns to df_train in place - re-run from a fresh kernel to confirm.
df_train.shape

(10886, 19)

In [51]:
# For every column, print how often each distinct value occurs.
for col in df_train.columns:
    print(f"Value counts for column '{col}':")
    print(df_train[col].value_counts())
    print("\n")

Value counts for column 'datetime':
2011-01-01 00:00:00    1
2012-05-01 21:00:00    1
2012-05-01 13:00:00    1
2012-05-01 14:00:00    1
2012-05-01 15:00:00    1
                      ..
2011-09-02 04:00:00    1
2011-09-02 05:00:00    1
2011-09-02 06:00:00    1
2011-09-02 07:00:00    1
2012-12-19 23:00:00    1
Name: datetime, Length: 10886, dtype: int64


Value counts for column 'season':
4    2734
2    2733
3    2733
1    2686
Name: season, dtype: int64


Value counts for column 'holiday':
0    10575
1      311
Name: holiday, dtype: int64


Value counts for column 'workingday':
1    7412
0    3474
Name: workingday, dtype: int64


Value counts for column 'weather':
1    7192
2    2834
3     859
4       1
Name: weather, dtype: int64


Value counts for column 'temp':
14.76    467
26.24    453
28.70    427
13.94    413
18.86    406
22.14    403
25.42    403
16.40    400
22.96    395
27.06    394
24.60    390
12.30    385
21.32    362
17.22    356
13.12    356
29.52    353
10.66    332
18.0

In [52]:
# Summarise column dtypes and non-null counts.
# NOTE(review): the saved output lists tempDate, year and day, which no cell
# shown here creates - likely left over from an earlier version of the
# preprocessing code; re-run from a fresh kernel to refresh it.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 19 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   datetime    10886 non-null  object  
 1   season      10886 non-null  category
 2   holiday     10886 non-null  category
 3   workingday  10886 non-null  category
 4   weather     10886 non-null  category
 5   temp        10886 non-null  float64 
 6   atemp       10886 non-null  float64 
 7   humidity    10886 non-null  int64   
 8   windspeed   10886 non-null  float64 
 9   casual      10886 non-null  int64   
 10  registered  10886 non-null  int64   
 11  count       10886 non-null  int64   
 12  tempDate    10886 non-null  object  
 13  year        10886 non-null  object  
 14  month       10886 non-null  category
 15  day         10886 non-null  object  
 16  date        10886 non-null  object  
 17  hour        10886 non-null  category
 18  weekday     10886 non-null  category
dtypes: c

The DataFrame contains multiple columns; we will predict the 'count' column. To do so, we first have to decide how to handle the object-typed (string) columns.

In [53]:
# Check whether any timestamp occurs more than once.
print(df_train["datetime"].value_counts())

2011-01-01 00:00:00    1
2012-05-01 21:00:00    1
2012-05-01 13:00:00    1
2012-05-01 14:00:00    1
2012-05-01 15:00:00    1
                      ..
2011-09-02 04:00:00    1
2011-09-02 05:00:00    1
2011-09-02 06:00:00    1
2011-09-02 07:00:00    1
2012-12-19 23:00:00    1
Name: datetime, Length: 10886, dtype: int64


In [54]:
def modify_datetime(df):
    """Add calendar features derived from the ``datetime`` column.

    The timestamps are parsed once with ``pd.to_datetime`` and every new
    column is built with vectorised ``.dt`` accessors, instead of splitting
    and re-parsing each row's string several times. As a result the function
    also accepts a ``datetime`` column that has already been parsed.

    Columns written (all as text):
      * ``date``    - calendar date formatted ``YYYY-MM-DD``
      * ``hour``    - zero-padded hour of day, ``"00"`` .. ``"23"``
      * ``weekday`` - English day name, e.g. ``"Saturday"``
      * ``month``   - English month name, e.g. ``"January"``

    Args:
        df: DataFrame with a ``datetime`` column holding timestamp strings
            such as ``"2011-01-01 00:00:00"`` (or parsed datetimes).

    Returns:
        The same ``df`` object. The columns are added in place, so the frame
        passed by the caller is modified as well.

    Raises:
        KeyError: if ``df`` has no ``datetime`` column.
        ValueError: (or a subclass) if a value cannot be parsed as a timestamp.
    """
    timestamps = pd.to_datetime(df["datetime"])

    # strftime keeps the textual representation the rest of the notebook
    # expects (string dates, zero-padded string hours).
    df["date"] = timestamps.dt.strftime("%Y-%m-%d")
    df["hour"] = timestamps.dt.strftime("%H")
    df["weekday"] = timestamps.dt.day_name()
    df["month"] = timestamps.dt.month_name()
    return df

In [55]:
def modify_to_category(df):
    """Cast the discrete feature columns of ``df`` to the ``category`` dtype.

    The columns converted are hour, weekday, month, season, weather, holiday
    and workingday; every other column is left as it is.

    Args:
        df: DataFrame containing all seven columns listed above.

    Returns:
        The same ``df`` object; the conversion is applied in place.
    """
    categorical_columns = (
        "hour", "weekday", "month",
        "season", "weather", "holiday", "workingday",
    )
    for column in categorical_columns:
        df[column] = df[column].astype("category")
    return df

In [72]:
def drop_datetime(df):
    """Return a new DataFrame equal to ``df`` minus its ``datetime`` column.

    ``df`` itself is not modified. A ``KeyError`` is raised when the column
    is absent.
    """
    without_timestamp = df.drop(columns=["datetime"])
    return without_timestamp
    

In [73]:
# Run the preprocessing steps in order, keeping a name for each stage.
# modify_datetime and modify_to_category return the frame they were given, so
# df_train, df_train2 and df_train3 all refer to the same object; only
# drop_datetime produces a new frame (df_train4).
df_train2 = df_train.pipe(modify_datetime)
df_train3 = df_train2.pipe(modify_to_category)
df_train4 = df_train3.pipe(drop_datetime)

In [58]:
# Preview the frame returned by modify_datetime.
df_train2.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,tempDate,year,month,day,date,hour,weekday
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,"[2011-01-01, 00:00:00]",2011,January,1,2011-01-01,0,Saturday
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,"[2011-01-01, 01:00:00]",2011,January,1,2011-01-01,1,Saturday
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,"[2011-01-01, 02:00:00]",2011,January,1,2011-01-01,2,Saturday
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,"[2011-01-01, 03:00:00]",2011,January,1,2011-01-01,3,Saturday
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,"[2011-01-01, 04:00:00]",2011,January,1,2011-01-01,4,Saturday


In [59]:
# Column dtypes and non-null counts for df_train2.
df_train2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 19 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   datetime    10886 non-null  object  
 1   season      10886 non-null  category
 2   holiday     10886 non-null  category
 3   workingday  10886 non-null  category
 4   weather     10886 non-null  category
 5   temp        10886 non-null  float64 
 6   atemp       10886 non-null  float64 
 7   humidity    10886 non-null  int64   
 8   windspeed   10886 non-null  float64 
 9   casual      10886 non-null  int64   
 10  registered  10886 non-null  int64   
 11  count       10886 non-null  int64   
 12  tempDate    10886 non-null  object  
 13  year        10886 non-null  object  
 14  month       10886 non-null  category
 15  day         10886 non-null  object  
 16  date        10886 non-null  object  
 17  hour        10886 non-null  category
 18  weekday     10886 non-null  category
dtypes: c

In [74]:
# Preview the frame returned by drop_datetime (the datetime column is gone).
df_train4.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,tempDate,year,month,day,date,hour,weekday
0,1,0,0,1,9.84,14.395,81,0.0,3,13,16,"[2011-01-01, 00:00:00]",2011,January,1,2011-01-01,0,Saturday
1,1,0,0,1,9.02,13.635,80,0.0,8,32,40,"[2011-01-01, 01:00:00]",2011,January,1,2011-01-01,1,Saturday
2,1,0,0,1,9.02,13.635,80,0.0,5,27,32,"[2011-01-01, 02:00:00]",2011,January,1,2011-01-01,2,Saturday
3,1,0,0,1,9.84,14.395,75,0.0,3,10,13,"[2011-01-01, 03:00:00]",2011,January,1,2011-01-01,3,Saturday
4,1,0,0,1,9.84,14.395,75,0.0,0,1,1,"[2011-01-01, 04:00:00]",2011,January,1,2011-01-01,4,Saturday
