# Data Cleaning

In [45]:
import yaml

import pandas as pd
import numpy as np

In [46]:
# Load the project configuration (raw and processed data paths).
# The encoding is pinned to UTF-8 so parsing does not depend on the
# platform's default locale encoding.
with open("../config/config.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)
config

{'raw_data_paths': {'daily': '../data/raw/CBS_2021-2023_Daily_Weather.csv',
  'hourly': '../data/raw/CBS_2021-2023_Hourly_Weather.csv',
  'full': '../data/raw/CBS_2021-2023_Full.csv'},
 'processed_data_paths': {'daily': '../data/processed/daily_data.parquet',
  'hourly': '../data/processed/hourly_data.parquet'}}

# Load

In [47]:
# Read the raw hourly CSV from the path configured under raw_data_paths.
hourly_csv_path = config['raw_data_paths']['hourly']
data_hourly = pd.read_csv(hourly_csv_path)

In [48]:
# Preview the first rows of the hourly frame as loaded from the raw CSV.
data_hourly.head()

Unnamed: 0,Casual,Date,Hour,Member,Total_rides,relativehumidity_2m (%),temperature_2m (°C),weathercode (wmo code),windspeed_10m (km/h)
0,20.0,2021-01-01,0,26.0,46.0,90,2.4,1,10.2
1,23.0,2021-01-01,1,23.0,46.0,88,1.8,1,10.0
2,20.0,2021-01-01,2,28.0,48.0,87,1.1,2,10.1
3,9.0,2021-01-01,3,7.0,16.0,81,1.6,2,10.6
4,7.0,2021-01-01,4,5.0,12.0,77,1.6,2,9.9


In [49]:
# NOTE: disabled cell — loads a WMO weather-code lookup table; nothing in the
# cells visible here uses `weathercodes`. Kept for reference only.
# import csv

# weathercodes = pd.read_csv('../data/raw/WMO2011h.csv', sep=';', quotechar='"', quoting=csv.QUOTE_NONNUMERIC, on_bad_lines='warn')
# weathercodes

# Data Cleaning

## Combine Date and Hour Data

In [50]:
# Build one timestamp per row: parse the calendar date, then add the
# hour-of-day as a timedelta. Both steps are vectorized, so no Python-level
# loop over the rows is needed.
hourly_timestamps = (
    pd.to_datetime(data_hourly['Date'])
    + pd.to_timedelta(data_hourly['Hour'], unit='h')
)

# Replace the separate Date/Hour columns with a single Datetime column,
# appended as the last column. Values are whole hours, so second resolution
# is lossless; the cast keeps the stored dtype at datetime64[s].
data_hourly = (
    data_hourly
    .assign(Datetime=hourly_timestamps.astype('datetime64[s]'))
    .drop(columns=['Date', 'Hour'])
)

In [51]:
# List the column labels after Date and Hour were merged into Datetime.
data_hourly.columns

Index(['Casual', 'Member', 'Total_rides', 'relativehumidity_2m (%)',
       'temperature_2m (°C)', 'weathercode (wmo code)', 'windspeed_10m (km/h)',
       'Datetime'],
      dtype='object')

## Rename Columns

In [52]:
# Map the weather columns to names without their unit suffixes.
rename_col = {
    'relativehumidity_2m (%)': 'relativehumidity_2m',
    'temperature_2m (°C)': 'temperature_2m',
    'weathercode (wmo code)': 'wmo_code',
    'windspeed_10m (km/h)': 'windspeed_10m',
}

# Apply the explicit renames first, then lower-case every column label.
data_hourly = (
    data_hourly
    .rename(columns=rename_col)
    .rename(columns=str.lower)
)

## Missing values

In [54]:
# Analyse missing values: count NaNs per column, then show every row that
# has at least one missing entry.
na_flags = data_hourly.isna()
print(na_flags.sum())

# Row-wise reduction: True where any column of the row is missing.
mask_any_na_values = na_flags.any(axis=1)
data_hourly.loc[mask_any_na_values]

casual                 11
member                 11
total_rides            11
relativehumidity_2m     0
temperature_2m          0
wmo_code                0
windspeed_10m           0
datetime                0
dtype: int64


Unnamed: 0,casual,member,total_rides,relativehumidity_2m,temperature_2m,wmo_code,windspeed_10m,datetime
21853,,,,73,-3.1,0,5.1,2021-01-21 03:00:00
21854,,,,71,0.4,0,11.2,2021-01-22 03:00:00
21855,,,,86,0.3,71,19.8,2021-02-02 03:00:00
21856,,,,71,-1.2,3,20.6,2021-02-03 02:00:00
21857,,,,82,-4.0,1,7.8,2021-02-09 01:00:00
21858,,,,94,-1.1,71,10.9,2021-02-19 02:00:00
21859,,,,93,-1.6,51,10.4,2021-02-19 03:00:00
21860,,,,56,2.9,0,13.9,2021-03-14 02:00:00
21861,,,,77,-8.7,0,10.1,2022-01-04 02:00:00
21862,,,,50,-5.1,0,21.1,2022-03-13 02:00:00


In [56]:
# Count how often each timestamp occurs among the rows with missing values.
data_hourly.loc[mask_any_na_values, 'datetime'].value_counts()

datetime
2021-01-21 03:00:00    1
2021-01-22 03:00:00    1
2021-02-02 03:00:00    1
2021-02-03 02:00:00    1
2021-02-09 01:00:00    1
2021-02-19 02:00:00    1
2021-02-19 03:00:00    1
2021-03-14 02:00:00    1
2022-01-04 02:00:00    1
2022-03-13 02:00:00    1
2023-03-12 02:00:00    1
Name: count, dtype: int64

In [37]:
# Remove every row whose total_rides value is missing.
data_hourly = data_hourly.dropna(axis='index', subset=['total_rides'])

## Fix Data Types

In [39]:
# Cast the three ride-count columns to integers in a single pass.
ride_count_dtypes = {
    'casual': int,
    'member': int,
    'total_rides': int,
}
data_hourly = data_hourly.astype(ride_count_dtypes)

# Info, Head, Describe 

In [40]:
# Show the dtype and non-null count of each column after cleaning.
data_hourly.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26269 entries, 0 to 26279
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype        
---  ------               --------------  -----        
 0   casual               26269 non-null  int64        
 1   member               26269 non-null  int64        
 2   total_rides          26269 non-null  int64        
 3   relativehumidity_2m  26269 non-null  int64        
 4   temperature_2m       26269 non-null  float64      
 5   wmo_code             26269 non-null  int64        
 6   windspeed_10m        26269 non-null  float64      
 7   datetime             26269 non-null  datetime64[s]
dtypes: datetime64[s](1), float64(2), int64(5)
memory usage: 1.8 MB


In [43]:
# Preview the first rows of the cleaned frame.
data_hourly.head()

Unnamed: 0,casual,member,total_rides,relativehumidity_2m,temperature_2m,wmo_code,windspeed_10m,datetime
0,20,26,46,90,2.4,1,10.2,2021-01-01 00:00:00
1,23,23,46,88,1.8,1,10.0,2021-01-01 01:00:00
2,20,28,48,87,1.1,2,10.1,2021-01-01 02:00:00
3,9,7,16,81,1.6,2,10.6,2021-01-01 03:00:00
4,7,5,12,77,1.6,2,9.9,2021-01-01 04:00:00


In [42]:
# Summary statistics of the cleaned frame.
data_hourly.describe()

Unnamed: 0,casual,member,total_rides,relativehumidity_2m,temperature_2m,wmo_code,windspeed_10m,datetime
count,26269.0,26269.0,26269.0,26269.0,26269.0,26269.0,26269.0,26269
mean,159.93007,247.165632,407.095702,62.962846,14.41959,7.223838,10.17033,2022-07-02 15:14:52
min,0.0,0.0,1.0,5.0,-14.6,0.0,0.0,2021-01-01 00:00:00
25%,28.0,55.0,85.0,46.0,6.2,0.0,6.5,2021-10-01 23:00:00
50%,105.0,203.0,319.0,64.0,14.8,1.0,9.2,2022-07-02 16:00:00
75%,237.0,371.0,616.0,82.0,22.7,2.0,12.9,2023-04-02 08:00:00
max,1206.0,1534.0,2262.0,100.0,41.5,75.0,40.1,2023-12-31 23:00:00
std,167.330426,225.283539,373.005409,22.72616,10.282328,17.623401,5.260753,


# Export / Variable sharing

In [44]:
# Persist data_hourly with IPython's %store magic so that another notebook
# can restore it with `%store -r data_hourly`.
%store data_hourly

Stored 'data_hourly' (DataFrame)


In [None]:
# Write the cleaned hourly frame to the processed-data path from the config.
data_hourly.to_parquet(config['processed_data_paths']['hourly'])