# Data Cleaning

In [104]:
import yaml

import pandas as pd

In [105]:
# Load the project configuration; it holds the raw and processed data paths
# used by the cells below.
# The encoding is pinned so the YAML file is decoded the same way regardless
# of the platform's default locale encoding.
with open("../config/config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

# Display the parsed configuration for reference.
config

{'raw_data_paths': {'daily': '../data/raw/CBS_2021-2023_Daily_Weather.csv',
  'hourly': '../data/raw/CBS_2021-2023_Hourly_Weather.csv',
  'full': '../data/raw/CBS_2021-2023_Full.csv'},
 'processed_data_paths': {'daily': '../data/processed/daily_data.parquet',
  'hourly': '../data/processed/hourly_data.parquet'}}

# Load

In [106]:
# Read the raw hourly CSV from the location named in the config.
hourly_csv_path = config['raw_data_paths']['hourly']
data_hourly = pd.read_csv(hourly_csv_path)

In [107]:
# Preview the first rows of the raw hourly table (ride counts plus weather readings).
data_hourly.head()

Unnamed: 0,Casual,Date,Hour,Member,Total_rides,relativehumidity_2m (%),temperature_2m (°C),weathercode (wmo code),windspeed_10m (km/h)
0,20.0,2021-01-01,0,26.0,46.0,90,2.4,1,10.2
1,23.0,2021-01-01,1,23.0,46.0,88,1.8,1,10.0
2,20.0,2021-01-01,2,28.0,48.0,87,1.1,2,10.1
3,9.0,2021-01-01,3,7.0,16.0,81,1.6,2,10.6
4,7.0,2021-01-01,4,5.0,12.0,77,1.6,2,9.9


In [108]:
# NOTE(review): this cell is fully commented out. It would read a WMO code
# table from '../data/raw/WMO2011h.csv' into `weathercodes`; nothing else in
# the visible notebook uses that name. TODO: re-enable or delete.
# import csv

# weathercodes = pd.read_csv('../data/raw/WMO2011h.csv', sep=';', quotechar='"', quoting=csv.QUOTE_NONNUMERIC, on_bad_lines='warn')
# weathercodes

# Data Cleaning

## Combine Date and Hour Data

In [109]:
# Merge the separate `Date` and `Hour` columns into a single `Datetime` column.
#
# Only pandas is used here: the previous version referenced `np`, which this
# notebook never imports, so the cell failed on a fresh kernel. Adding the
# hours as a timedelta Series is also vectorized instead of looping row by row.
hour_offsets = pd.to_timedelta(data_hourly['Hour'], unit='h')
timestamps = pd.to_datetime(data_hourly['Date']) + hour_offsets

# Drop the source columns first so `Datetime` ends up as the last column.
# The values are cast to hour resolution before assignment, as before.
data_hourly = (
    data_hourly
    .drop(columns=['Date', 'Hour'])
    .assign(Datetime=timestamps.to_numpy().astype('datetime64[h]'))
)

In [110]:
# List the column names after Date/Hour were replaced by Datetime.
data_hourly.columns

Index(['Casual', 'Member', 'Total_rides', 'relativehumidity_2m (%)',
       'temperature_2m (°C)', 'weathercode (wmo code)', 'windspeed_10m (km/h)',
       'Datetime'],
      dtype='object')

## Rename Columns

In [111]:
# Mapping that strips the unit suffix from each weather column name.
rename_col = {
    'relativehumidity_2m (%)': 'relativehumidity_2m',
    'temperature_2m (°C)': 'temperature_2m',
    'weathercode (wmo code)': 'wmo_code',
    'windspeed_10m (km/h)': 'windspeed_10m',
}

# Apply the mapping, then lower-case every column name.
data_hourly = (
    data_hourly
    .rename(columns=rename_col)
    .rename(columns=str.lower)
)

# Inspect the Cleaned Data: Info, Head, Describe

In [112]:
# Show dtypes and non-null counts per column.
# NOTE(review): casual/member/total_rides have 26269 non-null values out of
# 26280 rows, so 11 rows with missing ride counts are carried through as-is.
data_hourly.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26280 entries, 0 to 26279
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype        
---  ------               --------------  -----        
 0   casual               26269 non-null  float64      
 1   member               26269 non-null  float64      
 2   total_rides          26269 non-null  float64      
 3   relativehumidity_2m  26280 non-null  int64        
 4   temperature_2m       26280 non-null  float64      
 5   wmo_code             26280 non-null  int64        
 6   windspeed_10m        26280 non-null  float64      
 7   datetime             26280 non-null  datetime64[s]
dtypes: datetime64[s](1), float64(5), int64(2)
memory usage: 1.6 MB


In [113]:
# Preview the first rows after the rename and datetime merge.
data_hourly.head()

Unnamed: 0,casual,member,total_rides,relativehumidity_2m,temperature_2m,wmo_code,windspeed_10m,datetime
0,20.0,26.0,46.0,90,2.4,1,10.2,2021-01-01 00:00:00
1,23.0,23.0,46.0,88,1.8,1,10.0,2021-01-01 01:00:00
2,20.0,28.0,48.0,87,1.1,2,10.1,2021-01-01 02:00:00
3,9.0,7.0,16.0,81,1.6,2,10.6,2021-01-01 03:00:00
4,7.0,5.0,12.0,77,1.6,2,9.9,2021-01-01 04:00:00


In [114]:
# Summary statistics for every column, including the datetime range covered.
data_hourly.describe()

Unnamed: 0,casual,member,total_rides,relativehumidity_2m,temperature_2m,wmo_code,windspeed_10m,datetime
count,26269.0,26269.0,26269.0,26280.0,26280.0,26280.0,26280.0,26280
mean,159.93007,247.165632,407.095702,62.967884,14.412782,7.228311,10.171298,2022-07-02 11:30:00
min,0.0,0.0,1.0,5.0,-14.6,0.0,0.0,2021-01-01 00:00:00
25%,28.0,55.0,85.0,46.0,6.1,0.0,6.5,2021-10-01 17:45:00
50%,105.0,203.0,319.0,64.0,14.8,1.0,9.2,2022-07-02 11:30:00
75%,237.0,371.0,616.0,82.0,22.7,2.0,12.9,2023-04-02 05:15:00
max,1206.0,1534.0,2262.0,100.0,41.5,75.0,40.1,2023-12-31 23:00:00
std,167.330426,225.283539,373.005409,22.72432,10.285751,17.630961,5.261033,


# Export / Variable sharing

In [115]:
# Persist the cleaned frame with IPython's %store magic so another notebook
# can restore it with `%store -r data_hourly`.
%store data_hourly

Stored 'data_hourly' (DataFrame)


In [116]:
# Write the cleaned hourly table to the processed-data location from the config.
hourly_parquet_path = config['processed_data_paths']['hourly']
data_hourly.to_parquet(hourly_parquet_path)