# Data Cleaning

In [21]:
import yaml

import pandas as pd

In [38]:
# Read the project configuration (raw and processed data paths) from YAML.
# The encoding is pinned so the file is decoded the same way on every platform
# instead of falling back to the locale-dependent default of open().
with open("../config/config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)
config

{'raw_data_paths': {'daily': '../data/raw/CBS_2021-2023_Daily_Weather.csv',
  'hourly': '../data/raw/CBS_2021-2023_Hourly_Weather.csv',
  'full': '../data/raw/CBS_2021-2023_Full.csv',
  'wmo_codes': '../data/raw/WMO2011h.csv'},
 'processed_data_paths': {'daily': '../data/processed/daily_data.parquet',
  'hourly': '../data/processed/hourly_data.parquet'}}

# Load

In [39]:
# Load the raw daily table from the path listed under raw_data_paths in the config.
data_daily = pd.read_csv(config["raw_data_paths"]["daily"])

In [40]:
import csv

# Parse the semicolon-separated WMO weather-code lookup table.
# Rows with an unexpected number of fields are skipped with a warning
# (on_bad_lines="warn") rather than aborting the load.
# NOTE(review): the recorded output reports lines 25 and 26 as skipped —
# confirm those codes are not needed downstream.
weathercodes = pd.read_csv(
    config["raw_data_paths"]["wmo_codes"],
    sep=";",
    quotechar='"',
    quoting=csv.QUOTE_NONNUMERIC,
    on_bad_lines="warn",
)
weathercodes

Skipping line 25: expected 2 fields, saw 3
Skipping line 26: expected 2 fields, saw 3

  weathercodes = pd.read_csv(config['raw_data_paths']['wmo_codes'], sep=';', quotechar='"', quoting=csv.QUOTE_NONNUMERIC, on_bad_lines='warn')


Unnamed: 0,WMO Code,Description
0,0.0,"""Cloud development not observed or not observ..."
1,1.0,"""Cloud generally dissolving or becoming less ..."
2,2.0,"""State of sky on the whole unchanged"""
3,3.0,"""Clouds generally forming or developing"""
4,4.0,"""Visibility reduced by smoke, e.g. veldt or f..."
...,...,...
91,95.0,"""Thunderstorm, slight or moderate, without ha..."
92,96.0,"""Thunderstorm, slight or moderate, with hail ..."
93,97.0,"""Thunderstorm, heavy, without hail, but with ..."
94,98.0,"""Thunderstorm combined with dust/sandstorm at..."


In [25]:
# Preview the first rows of the raw daily table.
data_daily.head()

Unnamed: 0,Casual,Date,Member,Total_rides,apparent_temperature_mean (°C),temperature_2m_mean (°C),weathercode (wmo code),windspeed_10m_max (km/h)
0,371,2021-01-01,590,961,-0.9,2.3,63,10.6
1,2874,2021-01-02,2844,5718,4.8,7.6,61,19.1
2,495,2021-01-03,1133,1628,0.9,3.9,63,13.0
3,907,2021-01-04,2074,2981,1.0,4.1,3,13.3
4,1057,2021-01-05,2004,3061,1.4,4.0,51,9.2


# Clean: parse dates and normalize column names

In [26]:
# Convert the date strings to day-resolution datetimes.
# Bracket indexing is used for the assignment: attribute-style access only
# works for existing columns whose names are valid identifiers, and silently
# creates a plain attribute instead of a column when the name is new.
data_daily["Date"] = data_daily["Date"].values.astype("datetime64[D]")

In [27]:
# Drop the unit suffixes from the weather columns, then lower-case every
# column name so the frame uses one naming convention.
rename_col = {
    "apparent_temperature_mean (°C)": "apparent_temperature_mean",
    "temperature_2m_mean (°C)": "temperature_2m_mean",
    "weathercode (wmo code)": "wmo_code",
    "windspeed_10m_max (km/h)": "windspeed_10m_max",
}
data_daily = data_daily.rename(columns=rename_col).rename(columns=str.lower)

# Inspect: info, head, describe

In [28]:
# Show column dtypes and non-null counts of the cleaned frame.
data_daily.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype        
---  ------                     --------------  -----        
 0   casual                     1095 non-null   int64        
 1   date                       1095 non-null   datetime64[s]
 2   member                     1095 non-null   int64        
 3   total_rides                1095 non-null   int64        
 4   apparent_temperature_mean  1095 non-null   float64      
 5   temperature_2m_mean        1095 non-null   float64      
 6   wmo_code                   1095 non-null   int64        
 7   windspeed_10m_max          1095 non-null   float64      
dtypes: datetime64[s](1), float64(3), int64(4)
memory usage: 68.6 KB


In [29]:
# Preview the first rows to confirm the renamed columns.
data_daily.head()

Unnamed: 0,casual,date,member,total_rides,apparent_temperature_mean,temperature_2m_mean,wmo_code,windspeed_10m_max
0,371,2021-01-01,590,961,-0.9,2.3,63,10.6
1,2874,2021-01-02,2844,5718,4.8,7.6,61,19.1
2,495,2021-01-03,1133,1628,0.9,3.9,63,13.0
3,907,2021-01-04,2074,2981,1.0,4.1,3,13.3
4,1057,2021-01-05,2004,3061,1.4,4.0,51,9.2


In [30]:
# Summary statistics of the cleaned frame, to eyeball value ranges.
data_daily.describe()

Unnamed: 0,casual,date,member,total_rides,apparent_temperature_mean,temperature_2m_mean,wmo_code,windspeed_10m_max
count,1095.0,1095,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0
mean,3836.715068,2022-07-02 00:00:00,5929.492237,9766.207306,12.836438,14.412785,24.391781,16.05105
min,174.0,2021-01-01 00:00:00,323.0,497.0,-15.2,-9.1,0.0,5.6
25%,2122.0,2021-10-01 12:00:00,4292.5,6741.5,2.7,6.25,1.0,11.7
50%,3595.0,2022-07-02 00:00:00,5848.0,9759.0,13.1,14.9,3.0,14.7
75%,5321.0,2023-04-01 12:00:00,7577.0,13020.0,23.4,22.75,53.0,19.4
max,12839.0,2023-12-31 00:00:00,12580.0,20174.0,34.7,34.7,75.0,40.1
std,2114.895305,,2419.273171,4129.630135,11.88942,9.667675,27.77444,5.880358


# Checks

In [37]:
# Invariant: casual rides + member rides == total rides on every row.
check_rides = data_daily["casual"] + data_daily["member"] != data_daily["total_rides"]
n_mismatched = int(check_rides.sum())
# Fail loudly instead of relying on someone reading the displayed count.
assert n_mismatched == 0, f"{n_mismatched} rows where casual + member != total_rides"
n_mismatched

0

# Export / Variable sharing

In [31]:
# Persist data_daily with IPython's %store magic so another notebook can
# restore it with `%store -r data_daily`.
%store data_daily

Stored 'data_daily' (DataFrame)


In [32]:
# Write the cleaned daily table to the configured processed-data path.
data_daily.to_parquet(config["processed_data_paths"]["daily"])