# Hong Kong Weather data Preparation & Data Quality check

Here I aggregate 38 datasets coming from different observatory for daily Temperature, Humidity and Rainfall data. For each dataset, I check the number and the percentage of invalid observations (value of \*\*\*).

After aggregating, a global dataset by the information is created, so that we have **three global datasets** in the end.

In [None]:
# Necessary modules
import pandas as pd
import time
import datetime
import os
import glob
import numpy as np

## (1) Hong Kong Temperature Dataset

In [None]:
list_files = glob.glob("../0_Data/Temperature/*.csv")
Data_by_dist = "../0_Data/Temperature/CLMMAXT_HKO_.csv"

Max_region=pd.read_csv(Data_by_dist, 
                sep=",", skiprows=3, header = None)

# LMMAXT_HKP_: Full Data before any filtering
LMMAXT_HKP_ = pd.DataFrame(Max_region)
LMMAXT_HKP_.columns = ['year', 'month', 'day', 'value', 'Flag']

# Data Quality indicators:
Obs_summary = pd.DataFrame({'Dist': [Data_by_dist.split("Data/Temperature/")[1].split("_")[1]],
                            'NObs_Dist': [LMMAXT_HKP_.shape[0]],
                            'NObs_Flagged': [LMMAXT_HKP_.loc[LMMAXT_HKP_['Flag']=="#"].shape[0]],
                            'NObs_Starred': [LMMAXT_HKP_.loc[LMMAXT_HKP_['value']=="***"].shape[0]],
                            'NObs_Str_Shrp': [LMMAXT_HKP_.loc[(LMMAXT_HKP_['value']=="***")&(LMMAXT_HKP_['Flag']=="#")].shape[0]],
                            
                            })

Obs_summary['Purge_Pct(%)'] = round(100*Obs_summary['NObs_Starred']/Obs_summary['NObs_Dist'],2)
Data_summary = pd.concat([Data_summary, Obs_summary], axis = 0)

## Data Filter ##
# LMMAXT_HKP_2: Filtered Data
# F1: Delete the last 3 lines:
LMMAXT_HKP_2 = LMMAXT_HKP_.iloc[:-3]
LMMAXT_HKP_2.reset_index(inplace=True, drop=True) 
LMMAXT_HKP_2.loc[:,'month'] = LMMAXT_HKP_2['month'].values.astype(int)
LMMAXT_HKP_2.loc[:,'day'] = LMMAXT_HKP_2['day'].values.astype(int)
LMMAXT_HKP_2 = LMMAXT_HKP_2.loc[LMMAXT_HKP_2['value']!="***"]
LMMAXT_HKP_2.reset_index(inplace=True, drop=True) 
LMMAXT_HKP_2.loc[:,'value'] = LMMAXT_HKP_2['value'].values.astype(float)


# F2: Delete data of 2007 since incomplete (only from october)
LMMAXT_HKP_2 = LMMAXT_HKP_2.loc[LMMAXT_HKP_2['year']!='2007']
# Date parsing 
LMMAXT_HKP_date = pd.to_datetime(LMMAXT_HKP_2[['year','month', 'day']])
#LMMAXT_HKP_date = LMMAXT_HKP_date.dt.dayofyear



Unnamed: 0,Dist,NObs_Dist,NObs_Flagged,NObs_Starred,NObs_Str_Shrp,Purge_Pct(%),StartDate,EndDate
0,HKO,49006,0,1,0,0.0,1884-01-01,2025-02-28


In [75]:
list_files = glob.glob("../0_Data/Temperature/*.csv")
Data_by_dist = "../0_Data/Temperature/CLMMAXT_HKO_.csv"

Max_region=pd.read_csv(Data_by_dist, 
                sep=",", skiprows=3, header = None)

# LMMAXT_HKP_: Full Data before any filtering
LMMAXT_HKP_ = pd.DataFrame(Max_region)
LMMAXT_HKP_.columns = ['year', 'month', 'day', 'value', 'Flag']
LMMAXT_HKP_.iloc[5903]

year     1900
month     2.0
day      29.0
value     ***
Flag      NaN
Name: 5903, dtype: object

In [88]:
# Public Data Downloaded from HK Data Gov website
# CSV File list in 0_Data directory:
list_files = glob.glob("../0_Data/Temperature/*.csv")

# Aggregated Data templete:
Agg_Data = pd.DataFrame()
Data_summary = pd.DataFrame()

for Data_by_dist in list_files:
    
    Max_region=pd.read_csv(Data_by_dist, 
                       sep=",", skiprows=3, header = None)

    # LMMAXT_HKP_: Full Data before any filtering
    LMMAXT_HKP_ = pd.DataFrame(Max_region)
    LMMAXT_HKP_.columns = ['year', 'month', 'day', 'value', 'Flag']
    
    # Data Quality indicators:
    Obs_summary = pd.DataFrame({'Dist': [Data_by_dist.split("Data/Temperature/")[1].split("_")[1]],
                                'NObs_Dist': [LMMAXT_HKP_.shape[0]],
                                'NObs_Flagged': [LMMAXT_HKP_.loc[LMMAXT_HKP_['Flag']=="#"].shape[0]],
                                'NObs_Starred': [LMMAXT_HKP_.loc[LMMAXT_HKP_['value']=="***"].shape[0]],
                                'NObs_Str_Shrp': [LMMAXT_HKP_.loc[(LMMAXT_HKP_['value']=="***")&(LMMAXT_HKP_['Flag']=="#")].shape[0]],                          
                                })
   
    Obs_summary['Purge_Pct(%)'] = round(100*Obs_summary['NObs_Starred']/Obs_summary['NObs_Dist'],2)
        
    ## Data Filter ##
    # LMMAXT_HKP_2: Filtered Data
    # F1: Delete the last 3 lines:
    LMMAXT_HKP_2 = LMMAXT_HKP_.iloc[:-3]
    LMMAXT_HKP_2.reset_index(inplace=True, drop=True) 
    LMMAXT_HKP_2.loc[:,'month'] = LMMAXT_HKP_2['month'].values.astype(int)
    LMMAXT_HKP_2.loc[:,'day'] = LMMAXT_HKP_2['day'].values.astype(int)
    LMMAXT_HKP_2 = LMMAXT_HKP_2.loc[LMMAXT_HKP_2['value']!="***"]
    LMMAXT_HKP_2.reset_index(inplace=True, drop=True) 
    LMMAXT_HKP_2.loc[:,'value'] = LMMAXT_HKP_2['value'].values.astype(float)

    # F2: Delete data of 2007 since incomplete (only from october)
    LMMAXT_HKP_2 = LMMAXT_HKP_2.loc[LMMAXT_HKP_2['year']!='2007']

    # Date parsing 
    LMMAXT_HKP_date = pd.to_datetime(LMMAXT_HKP_2[['year','month', 'day']])
    #LMMAXT_HKP_date = LMMAXT_HKP_date.dt.dayofyear
    
    # Concatenate parsed date + Table 
    # LMMAXT_HKP_3: Concatenated Data
    LMMAXT_HKP_3 = pd.concat([LMMAXT_HKP_date, LMMAXT_HKP_2[['year','value', 'Flag']]],
            axis = 1)
    LMMAXT_HKP_3 = LMMAXT_HKP_3.rename(columns = {0: "Date"})
    
    # Information on the observed period by Observatory:
    Obs_summary['StartDate'] = LMMAXT_HKP_date.min()
    Obs_summary['EndDate'] = LMMAXT_HKP_date.max()
    
    Data_summary = pd.concat([Data_summary, Obs_summary], axis = 0)
    
    Agg_Data = pd.concat([Agg_Data, LMMAXT_HKP_3], axis = 0, ignore_index=True)

Check if all the lines are completed, computing the percentage of null values.

In [89]:
round(100*Agg_Data.loc[Agg_Data['value'].isna()!=False].shape[0]/Agg_Data.shape[0],1)

0.0

Total number of observations (From all observatories):

In [90]:
total_obs = Data_summary['NObs_Dist'].sum()
print(total_obs)

362334


Distribution by observatories:

In [123]:
Data_Composition = pd.DataFrame({
    'Observatory':Data_summary['Dist'].values,
    'NObs':Data_summary['NObs_Dist'].values,
    'Percentage(%)To_TotalOBS':round((Data_summary['NObs_Dist']/total_obs)*100,2),
    'ObservedPeriod(#Year)':[np.ceil(td/np.timedelta64(1, 'D')).astype(int) for td in (Data_summary['EndDate']-Data_summary['StartDate'])/(30*12)] 
})

Data_Composition.sort_values(by=['Percentage(%)To_TotalOBS','ObservedPeriod(#Year)'], 
                             ascending = False,
                             inplace = True)
Data_Composition.reset_index(inplace=True, drop = True)
print(Data_Composition.head())
print(Data_Composition.tail())

  Observatory   NObs  Percentage(%)To_TotalOBS  ObservedPeriod(#Year)
0         HKO  49006                     13.53                    144
1         SHA  14764                      4.07                     41
2         LFS  14399                      3.97                     40
3         TKL  13425                      3.71                     38
4         HKS  12999                      3.59                     37
   Observatory  NObs  Percentage(%)To_TotalOBS  ObservedPeriod(#Year)
33         TY1  5298                      1.46                     15
34          TW  5201                      1.44                     15
35         SE1  3730                      1.03                     11
36         YLP  3637                      1.00                     11
37         CWB  2285                      0.63                      7


In [120]:
Data_Composition['ObservedPeriod(#Year)'].mean()

np.float64(27.157894736842106)

We see that the observatory which logged Temperature data for the longest period in Hong Kong is the one in Hong Kong park (for 144 years). It makes about 14% of the total dataset. The shortest observated period is given by Clear Water Bay Observatory (CWB, for 7 years). The mean value of the observed period over 38 observatories is 27 years.

In [18]:
print(Data_summary.sort_values(by='Purge_Pct(%)', ascending=False).head())
print("...")
print(Data_summary.sort_values(by='Purge_Pct(%)', ascending=False).tail())

  Dist  NObs_Dist  NObs_Flagged  NObs_Starred  NObs_Str_Shrp  Purge_Pct(%)
0  NGP       7855          3339           432            432          5.50
0  SEK      10320          3391           439            439          4.25
0  WGL      12968          5181           520            520          4.01
0  JKB      12147          3298           412            412          3.39
0  TPO       8710          2861           293            293          3.36
...
  Dist  NObs_Dist  NObs_Flagged  NObs_Starred  NObs_Str_Shrp  Purge_Pct(%)
0  TY1       5298          2322             2              2          0.04
0  SSH       7520          1259             3              3          0.04
0   TW       5201          2339             2              2          0.04
0  WTS       5816          2613             1              1          0.02
0  HKO      49006             0             1              0          0.00


In [None]:


Data_summary

Data_summary.loc[Data_summary['Purge_Pct']<1].shape[0]/Data_summary.shape[0]

treated_data_rep = r'0_Data/wrangled/' 
if not os.path.exists(treated_data_rep):
    os.makedirs(treated_data_rep)
Agg_Data.to_pickle(treated_data_rep+"Agg_Data.pkl")
