<img title="GitHub Octocat" src='./img/Octocat.jpg' style='height: 60px; padding-right: 15px' alt="Octocat" align="left"> This notebook is part of a GitHub repository: https://github.com/pessini/moby-bikes 
<br>MIT Licensed
<br>Author: Leandro Pessini

# <p style="font-size:100%; text-align:left; color:#444444;">Exploratory Data Analysis (EDA)</p>

# <p style="font-size:100%; text-align:left; color:#444444;">Table of Contents:</p>
* [1. Datasets](#1)
  * [1.1 Rentals Data - Moby Bikes](#1.1)
  * [1.2 Weather Data - Met Éireann](#1.2)
* [2. Preprocessing & Feature Engineering](#2)
  * [2.1 Target variable distribution](#2.1)
  * [2.2 Missing values](#2.2)
  * [2.3 Exploratory Analysis](#2.3)
  * [2.4 Features Importance](#2.4)

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [3]:
hourly_data = pd.read_csv('../data/interim/hourly_data.csv')
all_data = pd.read_csv('../data/interim/all_data.csv')
rentals = pd.read_csv('../data/interim/new_features_rentals.csv')
hourly_data.head()

Unnamed: 0,rental_date,rental_hour,rental_day,rental_month,rental_year,holiday,dayofweek_n,dayofweek,working_day,season,peak,timesofday,rain,temp,rhum,wdsp,count
0,2021-02-01,6,1,2,2021,False,0,Monday,True,Winter,True,Night,0.0,3.4,98.0,3,1
1,2021-02-01,8,1,2,2021,False,0,Monday,True,Winter,True,Morning,0.0,3.5,93.0,4,2
2,2021-02-01,9,1,2,2021,False,0,Monday,True,Winter,True,Morning,0.0,2.6,93.0,2,4
3,2021-02-01,10,1,2,2021,False,0,Monday,True,Winter,True,Morning,0.0,4.1,97.0,4,3
4,2021-02-01,11,1,2,2021,False,0,Monday,True,Winter,False,Morning,0.0,5.2,86.0,6,12


In [4]:
all_data.head(3)

Unnamed: 0,lastrentalstart,bikeid,coordinates,start_battery,lastgpstime,rental_date,rental_hour,rental_day,rental_month,rental_year,...,dayofweek,working_day,season,peak,timesofday,date,rain,temp,rhum,wdsp
0,2021-02-01 06:58:08,41,"[[53.3292, -6.23173], [53.3292, -6.23177], [53...",37.0,2021-02-01 14:45:59,2021-02-01 00:00:00,6,1,2,2021,...,Monday,True,Winter,True,Night,2021-02-01 06:00:00,0.0,3.4,98.0,3
1,2021-02-01 08:00:38,83,"[[53.3425, -6.29327], [53.3425, -6.29327], [53...",68.0,2021-02-01 16:15:25,2021-02-01 00:00:00,8,1,2,2021,...,Monday,True,Winter,True,Morning,2021-02-01 08:00:00,0.0,3.5,93.0,4
2,2021-02-01 08:51:20,100,"[[53.3558, -6.25753], [53.3558, -6.25753], [53...",85.0,2021-02-04 11:23:41,2021-02-01 00:00:00,8,1,2,2021,...,Monday,True,Winter,True,Morning,2021-02-01 08:00:00,0.0,3.5,93.0,4


In [5]:
rentals.head(3)

Unnamed: 0,lastrentalstart,bikeid,coordinates,start_battery,lastgpstime,rental_date,rental_hour,rental_day,rental_month,rental_year,duration,holiday,dayofweek_n,dayofweek,working_day,season,peak,timesofday
0,2021-02-01 06:58:08,41,"[[53.3292, -6.23173], [53.3292, -6.23177], [53...",37.0,2021-02-01 14:45:59,2021-02-01 00:00:00,6,1,2,2021,467.85,False,0,Monday,True,Winter,True,Night
1,2021-02-01 08:00:38,83,"[[53.3425, -6.29327], [53.3425, -6.29327], [53...",68.0,2021-02-01 16:15:25,2021-02-01 00:00:00,8,1,2,2021,494.783333,False,0,Monday,True,Winter,True,Morning
2,2021-02-01 08:51:20,100,"[[53.3558, -6.25753], [53.3558, -6.25753], [53...",85.0,2021-02-04 11:23:41,2021-02-01 00:00:00,8,1,2,2021,4472.35,False,0,Monday,True,Winter,True,Morning


In [6]:
hourly_data[['temp','rain','wdsp','rhum','count']].describe()

Unnamed: 0,temp,rain,wdsp,rhum,count
count,3955.0,3955.0,3955.0,3955.0,3955.0
mean,11.536056,0.056384,8.683692,77.081416,5.360556
std,5.190961,0.328447,4.303582,13.254176,3.854651
min,-4.0,0.0,1.0,24.0,1.0
25%,7.9,0.0,6.0,68.0,2.0
50%,11.6,0.0,8.0,78.0,5.0
75%,15.3,0.0,11.0,88.0,8.0
max,26.3,10.3,26.0,100.0,26.0


In [7]:
hourly_data = hourly_data.astype({'holiday': 'category', 
                                  'dayofweek': 'category', 
                                  'working_day': 'category',
                                  'season': 'category', 
                                  'peak': 'category', 
                                  'timesofday': 'category'
                                  })

In [8]:
hourly_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3955 entries, 0 to 3954
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   rental_date   3955 non-null   object  
 1   rental_hour   3955 non-null   int64   
 2   rental_day    3955 non-null   int64   
 3   rental_month  3955 non-null   int64   
 4   rental_year   3955 non-null   int64   
 5   holiday       3955 non-null   category
 6   dayofweek_n   3955 non-null   int64   
 7   dayofweek     3955 non-null   category
 8   working_day   3955 non-null   category
 9   season        3955 non-null   category
 10  peak          3955 non-null   category
 11  timesofday    3955 non-null   category
 12  rain          3955 non-null   float64 
 13  temp          3955 non-null   float64 
 14  rhum          3955 non-null   float64 
 15  wdsp          3955 non-null   int64   
 16  count         3955 non-null   int64   
dtypes: category(6), float64(3), int64(7), object(1)
memo

In [9]:
hourly_data[['holiday','dayofweek','working_day','season','peak','timesofday']].describe()

Unnamed: 0,holiday,dayofweek,working_day,season,peak,timesofday
count,3955,3955,3955,3955,3955,3955
unique,2,7,2,3,2,4
top,False,Saturday,True,Spring,False,Afternoon
freq,3785,582,2675,1704,2587,1258


In [10]:
from pandas_profiling import ProfileReport
profile = ProfileReport(hourly_data, title='Hourly Data', html={'style':{'full_width':True}})
profile.to_notebook_iframe()
profile.to_file(output_file='hourly_data_report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Rentals by Season

In [11]:
# 0 - Spring | 1 - Summer | 2 - Autumn | 3 - Winter
# season_map = {0:'Spring', 1:'Summer', 2:'Fall', 3:'Winter'}
season_freq = hourly_data.groupby(['season', 'rental_hour'])['count'].agg('sum').reset_index(name='count')
# season_freq['season'] = season_freq['season'].map(lambda d : season_map[d])
fig, ax = plt.subplots(figsize=(16, 10))
sns.pointplot(x=season_freq["rental_hour"], y=season_freq["count"], hue=season_freq["season"], ax=ax)
ax.set(xlabel='Hour Of The Day', ylabel='Rentals Count', title="Number of Rentals By Hour Of The Day Across Seasons")
plt.show()

  plt.show()


In [12]:
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=season_freq.groupby('season')['count'].sum().reset_index(), x='count', y='season', order=['Winter','Spring','Summer','Autumn'], ci=None)
ax.set(xlabel='Number of Rentals', ylabel='Season', title='Rentals across all seasons')
plt.show()

  plt.show()


### Rentals by Days of the Week

In [13]:
# day_of_week_map = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
day_of_week = hourly_data.groupby(['dayofweek', 'rental_hour'])['count'].agg('sum').reset_index(name='count')
# day_of_week['dayofweek'] = day_of_week['dayofweek'].map(lambda d : day_of_week_map[d])
fig, ax = plt.subplots(figsize=(16, 10))
sns.pointplot(x=day_of_week["rental_hour"], 
              y=day_of_week["count"], 
              hue=day_of_week["dayofweek"], 
              ax=ax,
              hue_order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
ax.set(xlabel='Hour Of The Day', ylabel='Rentals Count', title="Number of Rentals By Hour Of The Day Across Days of Week")
plt.show()

  plt.show()


In [14]:
hourly_data.groupby('rental_month')['count'].agg('sum').reset_index(name='count')

Unnamed: 0,rental_month,count
0,2,2484
1,3,3287
2,4,3050
3,5,2778
4,6,3425
5,7,3134
6,8,3043


In [15]:
monthly_data = hourly_data.groupby('rental_month')['count'].agg('sum').reset_index(name='count')
# day_of_week['dayofweek'] = day_of_week['dayofweek'].map(lambda d : day_of_week_map[d])
fig, ax = plt.subplots(figsize=(16, 10))
sns.pointplot(x=monthly_data["rental_month"], 
              y=monthly_data["count"], 
              ax=ax)
ax.set(xlabel='Month', ylabel='Rentals Count', title="Number of Rentals By Month")
plt.show()

  plt.show()


In [16]:
day_of_week = hourly_data.groupby('dayofweek')['count'].agg('sum').reset_index(name='count')
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=day_of_week, x='count', y='dayofweek', ci=None, order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
ax.set(xlabel='Number of Rentals', ylabel='Day of Week', title='Rentals across all days of week')
plt.show()

  plt.show()


In [17]:
hourly_count = hourly_data.groupby(['rental_hour','holiday'])['count'].agg('mean').reset_index(name='count')

fig, ax = plt.subplots(figsize=(10, 8))
sns.pointplot(x=hourly_count['rental_hour'], y=hourly_count['count'], hue=hourly_count['holiday'], ax=ax)
ax.set(xlabel='Hour Of The Day', 
       ylabel='Rentals Count', 
       title="Avg Rentals By Hour Of The Day by Holiday")
plt.show()

  plt.show()


In [18]:
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=hourly_count, x='holiday', y='count', ci=None)
ax.set(xlabel='Peak Hour', ylabel='Number of Rentals', title='Rentals across peak hours')
plt.show()

  plt.show()


In [19]:
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=hourly_data, x='count', y='timesofday', ci=None, order=['Morning','Afternoon','Evening','Night'])
ax.set(xlabel='Number of Rentals', ylabel='Period of the Day', title='Rentals across Times of the Day')
plt.show()

  plt.show()


In [20]:
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(data=hourly_data, x='peak', y='count', ci=None)
ax.set(xlabel='Peak Hour', ylabel='Number of Rentals', title='Rentals across peak hours')
plt.show()

  plt.show()


In [21]:
from scipy import stats
a = hourly_data['peak']
b = hourly_data['count']

stats.pointbiserialr(a,b)

PointbiserialrResult(correlation=0.01831018416071445, pvalue=0.24963439744407673)

## Battery distribution

In [22]:
battery_dist = all_data.copy()

def group_battery_status(df):

    bins= [0,30,50,80,100]
    labels = ['< 30%','30% - 50%','50% - 80%','> 80%']
    battery_dist['battery_status'] = pd.cut(battery_dist['start_battery'], bins=bins, labels=labels, right=False)

    s = battery_dist.battery_status
    counts = s.value_counts()
    percent = s.value_counts(normalize=True)
    percent100 = s.value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
    
    return pd.DataFrame({'counts': counts, 'per': percent, 'per100': percent100}, labels)

In [23]:
group_battery_status(battery_dist)

Unnamed: 0,counts,per,per100
< 30%,2160,0.103801,10.4%
30% - 50%,3456,0.166082,16.6%
50% - 80%,8559,0.411312,41.1%
> 80%,6634,0.318804,31.9%


In [24]:
sns.histplot(data=rentals, x='start_battery', kde=True)

<AxesSubplot:title={'center':'Rentals across peak hours'}, xlabel='Peak Hour', ylabel='Number of Rentals'>

In [25]:
print(f"Mean of rentals duration: {round(all_data['duration'].mean(), 2)} minutes")

Mean of rentals duration: 902.58 minutes


## BoxPlot analysis

In [26]:
fig, axes = plt.subplots(nrows=2,ncols=2)
fig.set_size_inches(12, 10)
sns.boxplot(data=hourly_data,y="count",orient="v",ax=axes[0][0])
sns.boxplot(data=hourly_data,y="count",x="season",orient="v",ax=axes[0][1])
sns.boxplot(data=hourly_data,y="count",x="rental_hour",orient="v",ax=axes[1][0])
sns.boxplot(data=hourly_data,y="count",x="working_day",orient="v",ax=axes[1][1])

axes[0][0].set(ylabel='Count',title="Box Plot On Count")
axes[0][1].set(xlabel='Season', ylabel='Count',title="Box Plot On Count Across Seasons")
axes[1][0].set(xlabel='Hour Of The Day', ylabel='Count',title="Box Plot On Count Across Hours Of The Day")
axes[1][1].set(xlabel='Working Day', ylabel='Count',title="Box Plot On Count by Working Day")

plt.show()

  plt.show()


## Correlation Matrix

In [27]:
corrMatt = hourly_data[['temp','rain','wdsp','rhum','count']].corr()
mask = np.array(corrMatt)
mask[np.tril_indices_from(mask)] = False
cmap = sns.diverging_palette(220, 20, as_cmap=True)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corrMatt, mask=mask,vmax=.3, annot=True, ax=ax, cmap=cmap)
plt.show()

  plt.show()


In [28]:
sns.pairplot(hourly_data, 
             x_vars=['temp','rain','wdsp','rhum'],
             dropna=True, 
             y_vars='count', 
             height=8, 
             kind="reg", 
             palette='Set1')
plt.show()

  plt.show()


## Outlier Analysis

In [29]:
hourly_data['count'].describe()

count    3955.000000
mean        5.360556
std         3.854651
min         1.000000
25%         2.000000
50%         5.000000
75%         8.000000
max        26.000000
Name: count, dtype: float64

In [30]:
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=hourly_data,y="count",orient="v",ax=ax)
plt.show()

  plt.show()


In [31]:
print('Outliers (#): {}'.format(round( hourly_data[hourly_data['count'] > 15].shape[0],2)))
print('Outliers (%): {}%'.format(round(( (hourly_data[hourly_data['count'] > 15].shape[0] / hourly_data.shape[0])*100 ),2)))

Outliers (#): 65
Outliers (%): 1.64%


In [32]:
hourlyDataOutliers = hourly_data[hourly_data['count'] > 15]
hourlyDataWithoutOutliers = hourly_data[np.abs(hourly_data["count"]-hourly_data["count"].mean())<=(2*hourly_data["count"].std())] 

In [33]:
hourlyDataOutliers['dayofweek'].value_counts()

Saturday     22
Sunday       15
Friday        7
Tuesday       7
Monday        5
Wednesday     5
Thursday      4
Name: dayofweek, dtype: int64

In [34]:
hourlyDataOutliers['holiday'].value_counts(normalize=True)

False    0.846154
True     0.153846
Name: holiday, dtype: float64

In [35]:
hourlyDataOutliers['working_day'].value_counts(normalize=True)

False    0.646154
True     0.353846
Name: working_day, dtype: float64

In [36]:
hourlyDataOutliers['season'].value_counts()

Spring    27
Winter    26
Summer    12
Name: season, dtype: int64

In [37]:
daily_count = hourly_data.groupby('rental_date')['count'].sum().reset_index()
daily_count_no_outliers = hourlyDataWithoutOutliers.groupby('rental_date')['count'].sum().reset_index()

In [38]:
daily_count.describe()

Unnamed: 0,count
count,212.0
mean,100.004717
std,27.909382
min,38.0
25%,78.0
50%,100.0
75%,120.0
max,171.0


In [39]:
print('Mean of daily rentals = {}'.format(round(daily_count['count'].mean(),2)))
print('Mean of daily rentals without Outliers = {}'.format(round(daily_count_no_outliers['count'].mean(),2)))

Mean of daily rentals = 100.0
Mean of daily rentals without Outliers = 89.08


In [40]:
hourlyDataOutliers = hourlyDataOutliers.join(daily_count.set_index('rental_date'), on='rental_date', lsuffix='_hour', rsuffix='_day')

In [41]:
hourlyDataOutliers[hourlyDataOutliers['count_day'] > 110].sort_values(by='count_day', ascending=False)

Unnamed: 0,rental_date,rental_hour,rental_day,rental_month,rental_year,holiday,dayofweek_n,dayofweek,working_day,season,peak,timesofday,rain,temp,rhum,wdsp,count_hour,count_day
772,2021-03-17,16,17,3,2021,True,2,Wednesday,False,Winter,False,Afternoon,0.0,12.6,48.0,10,16,171
769,2021-03-17,13,17,3,2021,True,2,Wednesday,False,Winter,False,Afternoon,0.0,11.4,66.0,11,24,171
770,2021-03-17,14,17,3,2021,True,2,Wednesday,False,Winter,False,Afternoon,0.0,11.9,60.0,12,17,171
767,2021-03-17,11,17,3,2021,True,2,Wednesday,False,Winter,False,Morning,0.0,8.7,81.0,12,18,171
768,2021-03-17,12,17,3,2021,True,2,Wednesday,False,Winter,False,Afternoon,0.0,10.3,71.0,9,19,171
1476,2021-04-25,11,25,4,2021,False,6,Sunday,False,Spring,False,Morning,0.0,10.4,65.0,13,19,168
1477,2021-04-25,12,25,4,2021,False,6,Sunday,False,Spring,False,Afternoon,0.0,10.3,64.0,14,19,168
1478,2021-04-25,13,25,4,2021,False,6,Sunday,False,Spring,False,Afternoon,0.0,10.8,60.0,14,18,168
1481,2021-04-25,16,25,4,2021,False,6,Sunday,False,Spring,False,Afternoon,0.0,10.1,67.0,11,19,168
1459,2021-04-24,14,24,4,2021,False,5,Saturday,False,Spring,False,Afternoon,0.0,14.2,50.0,12,19,163


In [42]:
hourlyDataOutliers.to_csv('outliers.csv')

In [43]:
hourly_data[hourly_data['count'] > 20]

Unnamed: 0,rental_date,rental_hour,rental_day,rental_month,rental_year,holiday,dayofweek_n,dayofweek,working_day,season,peak,timesofday,rain,temp,rhum,wdsp,count
769,2021-03-17,13,17,3,2021,True,2,Wednesday,False,Winter,False,Afternoon,0.0,11.4,66.0,11,24
806,2021-03-19,12,19,3,2021,False,4,Friday,True,Winter,False,Afternoon,0.0,10.2,80.0,4,24
1077,2021-04-03,11,3,4,2021,False,5,Saturday,False,Spring,False,Morning,0.0,10.1,65.0,7,23
1331,2021-04-17,13,17,4,2021,False,5,Saturday,False,Spring,False,Afternoon,0.0,11.3,56.0,14,26
2570,2021-06-22,13,22,6,2021,False,1,Tuesday,True,Summer,False,Afternoon,0.0,16.9,44.0,7,21


In [44]:
hourlyDataWithoutOutliers = hourly_data[np.abs(hourly_data["count"]-hourly_data["count"].mean())<=(2*hourly_data["count"].std())] 
sns.pairplot(hourlyDataWithoutOutliers, 
             x_vars=['temp','rain','wdsp','rhum'],
             dropna=True, 
             y_vars='count', height=8, aspect=0.8, kind="reg", hue='season')
plt.show()

  plt.show()


## Distribution of numerical features

In [45]:
fig = plt.figure(figsize=(22,6))
gs = fig.add_gridspec(1, 4)
ax0 = fig.add_subplot(gs[0, 0])
ax1 = fig.add_subplot(gs[0, 1])
ax2 = fig.add_subplot(gs[0, 2])
ax3 = fig.add_subplot(gs[0, 3])

sns.histplot(hourly_data['temp'],ax=ax0, stat='density', kde=True)
sns.histplot(hourly_data['rain'],ax=ax1, stat='density', kde=True)
sns.histplot(hourly_data['wdsp'],ax=ax2, stat='density', kde=True)
sns.histplot(hourly_data['rhum'],ax=ax3, stat='density', kde=True)

ax0.set(xlabel='Temperature',title="Distribution - Temperature")
ax1.set(xlabel='Rain',title="Distribution - Rain")
ax2.set(xlabel='Wind Speed',title="Distribution - Wind Speed")
ax3.set(xlabel='Relative Humidity',title="Distribution - Relative Humidity")

plt.show()

  plt.show()


<img title="GitHub Mark" src="./img/GitHub-Mark-64px.png" style="height: 32px; padding-right: 15px" alt="GitHub Mark" align="left"> [GitHub repository](https://github.com/pessini/moby-bikes) <br>Author: Leandro Pessini