In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from datetime import datetime

In [2]:
# Folder that holds the yearly workbooks, and the years this notebook looks at.
data_path = './PM2.5/'
year_scope = [2017, 2018, 2019, 2020, 2021, 2022]
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# every run (and every machine) working with the same file order.
data_name_lst = sorted(name for name in os.listdir(data_path) if name.startswith('PM2.5'))

In [3]:
# Year by year (outer loop), keep every listed workbook whose file name
# contains that year as text.
selected_data_name = [
    file_name
    for year in year_scope
    for file_name in data_name_lst
    if str(year) in file_name
]

## Helper Functions

In [6]:
def load_PM_data(data_path):
    """Read the 'PM2.5' sheet of every workbook in a folder.

    Only entries of ``data_path`` whose name starts with 'PM2.5' are read.

    Parameters
    ----------
    data_path : str
        Folder containing the workbooks.

    Returns
    -------
    dict
        Maps the second underscore-separated token of each file name
        (e.g. '2017' for 'PM2.5_2017_.xlsx') to the DataFrame read from that
        file's 'PM2.5' sheet. Keys are inserted in sorted file-name order.
        If two files share the same token, the later one replaces the earlier.

    Raises
    ------
    IndexError
        If a matching file name contains no underscore.
    """
    PM_data = {}
    # os.listdir gives no ordering guarantee; sorting makes the key order reproducible.
    for name in sorted(os.listdir(data_path)):
        if not name.startswith('PM2.5'):
            continue
        sheet = pd.read_excel(os.path.join(data_path, name), sheet_name='PM2.5')
        PM_data[name.split('_')[1]] = sheet
    return PM_data

def load_selected_PM_data(data_path, selected_data_name):
    """Read the 'PM2.5' sheet of each named workbook under ``data_path``.

    Parameters
    ----------
    data_path : str
        Folder containing the workbooks.
    selected_data_name : iterable of str
        File names (relative to ``data_path``) to read.

    Returns
    -------
    dict
        Maps the second underscore-separated token of each file name to the
        DataFrame read from that file's 'PM2.5' sheet, in the order given.
    """
    return {
        file_name.split('_')[1]: pd.read_excel(
            os.path.join(data_path, file_name), sheet_name='PM2.5'
        )
        for file_name in selected_data_name
    }

In [7]:
def check_station(ref_station, variable_station):
    """Flag which reference stations also occur in another station list.

    Parameters
    ----------
    ref_station : list
        Station identifiers that define the output positions.
    variable_station : list
        Station identifiers to look the reference entries up in.

    Returns
    -------
    list of int
        One entry per element of ``ref_station``: 1 when that element is found
        in ``variable_station``, 0 otherwise.
    """
    return [
        1 if station_name in variable_station else 0
        for station_name in ref_station
    ]

def made_locate_station(ref_name, var_name_lst, data_path):
    """Build a presence matrix of reference stations across several workbooks.

    Station codes are taken from the 'รหัสสถานี' column of each workbook's
    'station_detail' sheet.

    Parameters
    ----------
    ref_name : str
        File name of the workbook whose station codes define the columns.
    var_name_lst : list of str
        File names of the workbooks to compare against the reference.
    data_path : str
        Folder containing all of the workbooks.

    Returns
    -------
    tuple
        ``(matrix, ref_station)`` where ``matrix`` is a NumPy array with one
        row per entry of ``var_name_lst`` and one column per reference station
        (1 = that workbook lists the station, 0 = it does not), and
        ``ref_station`` is the list of reference station codes.
    """
    def read_station_codes(file_name):
        # Station codes of one workbook, in sheet order.
        detail = pd.read_excel(os.path.join(data_path, file_name), sheet_name='station_detail')
        return detail['รหัสสถานี'].to_list()

    ref_station = read_station_codes(ref_name)
    presence_rows = [
        check_station(ref_station, read_station_codes(file_name))
        for file_name in var_name_lst
    ]
    return np.array(presence_rows), ref_station

## Working Space

In [4]:
# Show which workbooks were selected for the years in year_scope.
selected_data_name

['PM2.5_2017_.xlsx',
 'PM2.5_2018_.xlsx',
 'PM2.5_2019_.xlsx',
 'PM2.5_2020_.xlsx',
 'PM2.5_2021_.xlsx',
 'PM2.5_2022_.xlsx']

### Observing station availability by year

In [11]:
# Use the last selected workbook as the reference station list and flag, for every
# selected workbook, which of those reference stations it also lists (1 = listed, 0 = not).
locate_lst, ref_station = made_locate_station(selected_data_name[-1], selected_data_name, data_path)

In [12]:
# Rows of locate_lst follow selected_data_name and columns follow ref_station.
# The year labels are attached purely by row position, so both sides must have the
# same length; otherwise join() would silently mislabel or drop rows.
assert len(locate_lst) == len(year_scope), (
    'availability rows ({}) do not match the number of years ({})'.format(
        len(locate_lst), len(year_scope)
    )
)
df_year = pd.DataFrame(year_scope, columns=['years'])
df = df_year.join(pd.DataFrame(locate_lst, columns=ref_station))
# Keep only the first column for any station code that appears more than once.
df = df.loc[:, ~df.columns.duplicated()].copy()
df

Unnamed: 0,years,02T,05T,10T,11T,12T,59T,61T,03T,50T,...,98T,42T,43T,44T,62T,63T,78T,80T,89T,93T
0,2017,0,1,0,0,0,1,1,0,1,...,0,0,0,1,1,1,0,1,0,0
1,2018,0,1,1,1,0,1,1,1,1,...,0,1,0,1,1,1,0,1,0,0
2,2019,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,0,0
3,2020,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,0,0
4,2021,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,1,1
5,2022,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


### Observing by province

In [49]:
# Reference workbook = the last selected file. Indexing from the end keeps this
# valid when year_scope grows or shrinks, unlike a fixed position.
ref_name = selected_data_name[-1]
print(ref_name)

PM2.5_2022_.xlsx


In [50]:
# Load the 'station_detail' sheet of the reference workbook and preview its first rows.
# NOTE(review): this rebinds `ref_station`, which the earlier made_locate_station(...)
# call bound to a list of station codes. Re-running the cell that passes
# `columns=ref_station` after this one would hand it a DataFrame instead —
# consider a distinct name here.
ref_station = pd.read_excel(os.path.join(data_path, ref_name), sheet_name='station_detail')
ref_station.head()

Unnamed: 0,รหัสสถานี,ชื่อสถานี,รายละเอียดจุดติดตั้งสถานี
0,02T,แขวงหิรัญรูจี เขตธนบุรี กทม.,มหาวิทยาลัยราชภัฏบ้านสมเด็จเจ้าพระยา
1,05T,แขวงบางนา เขตบางนา กทม.,กรมอุตุนิยมวิทยา
2,10T,แขวงคลองจั่น เขตบางกะปิ กทม.,เคหะชุมชนคลองจั่น
3,11T,แขวงดินแดง เขตดินแดง กทม.,สนามกีฬาเคหะชุมชนห้วยขวาง
4,12T,แขวงช่องนนทรี เขตยานนาวา กทม.,โรงเรียนนนทรีวิทยา


In [84]:
def made_provide_station(selected_data_name, data_path):
    """Collect one row per station code across several workbooks.

    For each workbook the 'station_detail' sheet is read and its 'รหัสสถานี' and
    'ชื่อสถานี' columns are kept as ``station_id`` and ``station_name``. The
    station name is then reduced to its last space-separated token (for the
    names visible in the 2022 sheet this is the trailing area token such as
    'กทม.' — presumably the province; confirm against the full data).

    Parameters
    ----------
    selected_data_name : iterable of str
        Workbook file names, relative to ``data_path``. When a station code
        occurs in several workbooks, the row from the earliest one is kept.
    data_path : str
        Folder containing the workbooks.

    Returns
    -------
    pandas.DataFrame
        Columns ``station_id`` and ``station_name`` with a fresh 0..n-1 index.
        Empty (with both columns present) when no file names are given.
    """
    out_columns = ['station_id', 'station_name']
    column_map = {'รหัสสถานี': 'station_id', 'ชื่อสถานี': 'station_name'}

    # Gather the per-file pieces first and concatenate once, instead of growing
    # a frame inside the loop.
    pieces = []
    for file_name in selected_data_name:
        station_detail = pd.read_excel(os.path.join(data_path, file_name), sheet_name='station_detail')
        pieces.append(station_detail[list(column_map)].rename(columns=column_map))

    if not pieces:
        # pd.concat refuses an empty list; keep the original empty-result shape.
        return pd.DataFrame(columns=out_columns)

    df = pd.concat(pieces, axis=0, ignore_index=True)
    # Shorten the name exactly once. Stripping first prevents a trailing space
    # from producing an empty last token.
    df['station_name'] = df['station_name'].str.strip().str.split(' ').str[-1]
    return df.drop_duplicates(subset=['station_id']).reset_index(drop=True)

In [85]:
# Collect one row per station code across all selected workbooks.
provide_station_df = made_provide_station(selected_data_name, data_path)

In [96]:
# Give every distinct station_name value an integer label: np.unique returns the
# distinct values sorted, and each one is paired with its position in that array.
provide_uqe = np.unique(provide_station_df['station_name'].to_numpy())
label_uqe = np.arange(len(provide_uqe))
provide_dict = dict(zip(provide_uqe, label_uqe))

In [97]:
# Attach each station's integer label by looking its station_name up in provide_dict.
provide_station_df['provide_label'] = provide_station_df['station_name'].map(
    lambda station_name: provide_dict[station_name]
)

In [109]:
# Display the station table ordered by its integer label; the result is not
# assigned, so provide_station_df itself keeps its current row order.
provide_station_df.sort_values(['provide_label'])

In [108]:
# provide_station_df.to_csv('./Clean_Data/check_station_per_provide.csv', index=False, encoding='UTF-8')