In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import os

### Load Data

In [2]:
def load_PM_data(data_path):
    """Load the 'PM2.5' sheet of every PM2.5 workbook found in a directory.

    Args:
        data_path: Directory that is scanned for files whose name starts
            with ``'PM2.5'``.

    Returns:
        dict: Maps the second ``'_'``-separated token of each file name
        (the year, for names shaped like ``PM2.5_(year)_.xlsx``) to the
        DataFrame read from that file's ``'PM2.5'`` sheet.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the key
    # order of the returned dict reproducible across machines and runs.
    data_name_lst = sorted(name for name in os.listdir(data_path) if name.startswith('PM2.5'))
    PM_data = {}
    for name in data_name_lst:
        a_df = pd.read_excel(os.path.join(data_path, name), sheet_name='PM2.5')
        PM_data[name.split('_')[1]] = a_df
    return PM_data

def load_selected_PM_data(data_path, selected_data_name):
    """Load the 'PM2.5' sheet of the explicitly listed workbooks.

    Args:
        data_path: Directory containing the workbooks.
        selected_data_name: Iterable of file names to read, in the order
            they should appear in the result.

    Returns:
        dict: Maps the second ``'_'``-separated token of each file name to
        the DataFrame read from that file's ``'PM2.5'`` sheet.
    """
    return {
        file_name.split('_')[1]: pd.read_excel(
            os.path.join(data_path, file_name), sheet_name='PM2.5'
        )
        for file_name in selected_data_name
    }

### Observe the data

In [3]:
def check_new_station(name_lst, data_path, selected_data_name):
    """List the station ids that first appear in each consecutive workbook.

    Workbook ``idx + 1`` is compared with workbook ``idx``; ids are taken
    from the 'รหัสสถานี' (station code) column of the 'station_detail' sheet.

    Args:
        name_lst: Only its length is used; ``len(name_lst) - 1`` comparisons
            are made.
        data_path: Directory containing the workbooks.
        selected_data_name: File names, indexed in step with ``name_lst``.

    Returns:
        list[list]: One list per consecutive pair, holding the ids present
        in the later workbook but absent from the earlier one (in the later
        workbook's order).
    """
    station_cache = {}

    def _station_ids(file_name):
        # Each interior workbook takes part in two comparisons; read it once.
        if file_name not in station_cache:
            detail = pd.read_excel(os.path.join(data_path, file_name), sheet_name='station_detail')
            station_cache[file_name] = detail['รหัสสถานี'].to_list()
        return station_cache[file_name]

    updates = []
    for idx in range(len(name_lst) - 1):
        previous_ids = set(_station_ids(selected_data_name[idx]))
        current_ids = _station_ids(selected_data_name[idx + 1])
        updates.append([id_station for id_station in current_ids if id_station not in previous_ids])
    return updates

def check_station(ref_station, variable_station):
    """Flag which reference stations are present in another station list.

    Args:
        ref_station: Sequence of reference station ids.
        variable_station: Container of station ids to test against.

    Returns:
        list[int]: One entry per reference station, ``1`` if it occurs in
        ``variable_station`` and ``0`` otherwise.
    """
    return [1 if station_name in variable_station else 0 for station_name in ref_station]

def Check_for_NonMissing(PM_data, print_progress=True):
    """Report, per year, whether a DataFrame is free of missing values.

    Args:
        PM_data: Mapping of year key to DataFrame.
        print_progress: When truthy, print one line per year with the NaN
            count and 'Pass' / 'Failed'.

    Returns:
        dict: Maps each year key to ``True`` when its frame has no NaN and
        ``False`` otherwise.
    """
    # Total number of NaN cells in each year's frame.
    num_dict = {year: np.sum(df.isna().sum()) for year, df in PM_data.items()}
    check_dict = {year: bool(num_nan == 0) for year, num_nan in num_dict.items()}
    if print_progress:
        for year, state in check_dict.items():
            print(year, 'Num:', num_dict[year], 'Pass' if state else 'Failed')
    return check_dict


In [4]:
def made_locate_station(ref_name, var_name_lst, data_path):
    """Build a presence matrix of the reference stations across workbooks.

    Station ids come from the 'รหัสสถานี' (station code) column of each
    workbook's 'station_detail' sheet.

    Args:
        ref_name: File name of the reference workbook.
        var_name_lst: File names of the workbooks to compare against the
            reference.
        data_path: Directory containing the workbooks.

    Returns:
        tuple: ``(locate, ref_station)`` where ``locate`` is a NumPy array
        with one row per entry of ``var_name_lst`` (as produced by
        ``check_station``) and ``ref_station`` is the list of reference ids.
    """
    def read_station_ids(file_name):
        detail = pd.read_excel(os.path.join(data_path, file_name), sheet_name='station_detail')
        return detail['รหัสสถานี'].to_list()

    ref_station = read_station_ids(ref_name)
    locate_lst = [
        check_station(ref_station, read_station_ids(file_name))
        for file_name in var_name_lst
    ]
    return np.array(locate_lst), ref_station

def _collect_station_names(selected_data_name, data_path):
    """Gather unique (station_id, station_name) rows from the 'station_detail' sheets.

    The 'รหัสสถานี' (station code) and 'ชื่อสถานี' (station name) columns are
    renamed to ``station_id`` / ``station_name``; each name is reduced to its
    last space-separated token, and only the first row per ``station_id`` is
    kept (files are processed in the given order).
    """
    frames = []
    for file_name in selected_data_name:
        station_detail = pd.read_excel(os.path.join(data_path, file_name), sheet_name='station_detail')
        frames.append(
            station_detail[['รหัสสถานี', 'ชื่อสถานี']].rename(
                columns={'รหัสสถานี': 'station_id', 'ชื่อสถานี': 'station_name'}
            )
        )
    # Concatenate once: growing a frame inside the loop is quadratic, and
    # concatenating onto an empty frame is deprecated in recent pandas.
    if frames:
        df = pd.concat(frames, axis=0)
    else:
        df = pd.DataFrame(columns=['station_id', 'station_name'])
    df['station_name'] = df['station_name'].apply(lambda x: x.split(' ')[-1])
    return df.drop_duplicates(subset=['station_id']).reset_index(drop=True)


def made_provide_station(selected_data_name, data_path):
    """Label every station with an integer derived from its shortened name.

    Args:
        selected_data_name: Workbook file names to read, in order.
        data_path: Directory containing the workbooks.

    Returns:
        DataFrame: Columns ``station_id``, ``station_name`` (last
        space-separated token of the original name) and ``label`` (position
        of that name among the sorted unique names).
    """
    df = _collect_station_names(selected_data_name, data_path)
    provide_uqe = np.unique(df['station_name'].to_numpy())
    label_uqe = np.arange(provide_uqe.shape[0])
    provide_dict = {provide_uqe[i]: label_uqe[i] for i in range(label_uqe.shape[0])}
    df['label'] = df['station_name'].apply(lambda x: provide_dict[x])
    return df


def made_sector_station(selected_data_name, data_path, label_dict):
    """Label every station by looking its shortened name up in ``label_dict``.

    Args:
        selected_data_name: Workbook file names to read, in order.
        data_path: Directory containing the workbooks.
        label_dict: Mapping from shortened station name to label.

    Returns:
        DataFrame: Columns ``station_id``, ``station_name`` and ``label``.

    Raises:
        KeyError: If a shortened station name is missing from ``label_dict``.
    """
    df = _collect_station_names(selected_data_name, data_path)
    df['label'] = df['station_name'].apply(lambda x: label_dict[x])
    return df

### The sectors' label

In [5]:
def meteorology_labeling():
    """Map each listed province name to a sector label.

    The provinces are written as comma-separated Thai names, one string per
    sector. Labels are the string indices ``'0'``..``'5'`` of the sectors in
    the order NE, N, C, E, SE, SW.

    Returns:
        dict: Province key -> sector label. Keys are the province name
        prefixed with 'จ.', except 'กทม.' which is kept as is.
    """
    NE = ' อำนาจเจริญ, บึงกาฬ, บุรีรัมย์, ชัยภูมิ, กาฬสินธุ์, ขอนแก่น, เลย, มหาสารคาม, มุกดาหาร, นครพนม, นครราชสีมา, หนองบัวลำภู, หนองคาย, ร้อยเอ็ด, สกลนคร, ศรีสะเกษ, สุรินทร์, อุบลราชธานี, อุดรธานี, ยโสธร'
    N = ' เชียงใหม่, เชียงราย, ลำปาง, ลำพูน, แม่ฮ่องสอน, น่าน, พะเยา, แพร่, อุตรดิตถ์, ตาก, สุโขทัย, พิษณุโลก, พิจิตร, กำแพงเพชร, เพชรบูรณ์'
    C = ' นครสวรรค์, อุทัยธานี, อ่างทอง, ชัยนาท, พระนครศรีอยุธยา, กทม., ลพบุรี, นครปฐม, นนทบุรี, ปทุมธานี, สมุทรปราการ, สมุทรสาคร, สมุทรสงคราม, สระบุรี, สิงห์บุรี, สุพรรณบุรี, กาญจนบุรี, ราชบุรี'
    E = ' ฉะเชิงเทรา, จันทบุรี, ชลบุรี, ปราจีนบุรี, ระยอง, สระแก้ว, ตราด, นครนายก'
    SW = ' กระบี่, พังงา, ภูเก็ต, ระนอง, สตูล, ตรัง'
    SE = ' เพชรบุรี, ประจวบคีรีขันธ์, ประจวบคิรีขันธ์, ชุมพร, นครศรีธรรมราช, นราธิวาส, ปัตตานี, พัทลุง, สงขลา, สุราษฏร์ธานี, ยะลา'

    # Note the order: SE is labelled before SW.
    sector_texts = (NE, N, C, E, SE, SW)
    province_to_sector = {}
    for sector_idx, sector_text in enumerate(sector_texts):
        for chunk in sector_text.split(','):
            # Each chunk is ' <name>'; the last space-separated token is the name.
            province = chunk.split(' ')[-1]
            if province == 'กทม.':
                key = province
            else:
                key = 'จ.' + province
            province_to_sector[key] = f'{sector_idx}'
    return province_to_sector

### Fill NaN on each station by their recording

In [6]:
class Fill_NaN_YearRecord:
    """Fill a station's missing values from the same calendar day in other years.

    ``main`` runs the pipeline: load the yearly frames, build a
    year-by-station presence table, regroup the readings per station with one
    column per year, fill gaps from the older-year columns, and regroup the
    result per year.
    """

    def __init__(self, data_path, selected_years):
        """Select the workbooks in ``data_path`` that belong to ``selected_years``.

        Args:
            data_path: Directory that is scanned for files starting with 'PM2.5'.
            selected_years: Years to process; a file is selected when
                ``str(year)`` occurs anywhere in its name.

        Raises:
            IndexError: If no file matches any of the selected years.
        """
        self.data_path = data_path
        self.selected_years = selected_years
        files_name_lst = [name for name in os.listdir(self.data_path) if name.startswith('PM2.5')] # Format PM2.5_(year)_.xlsx
        self.selected_files = [f for y in self.selected_years for f in files_name_lst if str(y) in f]
        # The last selected file provides the reference station list.
        self.ref_file = self.selected_files[-1]

        # Define Parameters
        self.save_name = 'FillByYears'

        # Pre-Define Function Parameters
        self.PM_data = None
        self.establish_df = None
        self.station_dict = None
        self.new_PM_data = None

    @staticmethod
    def sort_columns(dct):
        """Reorder every frame to ``['Date', <year columns, newest first>]``.

        Columns whose name cannot be converted with ``int()`` are dropped.
        The dict is updated in place and also returned.
        """
        for key in dct:
            frame = dct[key]
            year_numbers = []
            for col_name in frame.columns:
                try:
                    year_numbers.append(int(col_name))
                except (TypeError, ValueError):
                    # Not a year column (e.g. 'Date'); leave it out.
                    continue
            ordered = ['Date'] + [str(year) for year in sorted(year_numbers, reverse=True)]
            dct[key] = frame[ordered]
        return dct

    @staticmethod
    def fill_na_all_station(stations_dict):
        """Fill NaN in each year column with the row mean of the columns after it.

        Expects frames ordered as produced by ``sort_columns``, so the columns
        to the right of a year column hold older years. The last column is
        never filled. Rows where every later column is NaN stay NaN.

        Returns:
            dict: Same keys; frames that needed filling are returned as
            copies so the input frames are left untouched.
        """
        new_stations_dict = {}
        for key, df in stations_dict.items():
            if df.shape[1] > 2:
                # Work on a copy: the input is typically a column selection of
                # another frame, and assigning into it would mutate caller
                # data and can raise SettingWithCopyWarning.
                df = df.copy()
                for idx in range(1, len(df.columns)-1):
                    target_col = df.columns[idx]
                    df_year = df[target_col]
                    df_before_year = df.loc[df_year.isna(), df.columns[idx+1:]]
                    # Mean over the non-NaN values of the later columns.
                    fill_value = df_before_year.sum(axis=1) / (~df_before_year.isna()).sum(axis=1)
                    df[target_col] = df_year.fillna(fill_value)
            new_stations_dict[key] = df
        return new_stations_dict

    def made_establish_df(self):
        """Build the presence table: a 'years' column plus one 0/1 column per reference station."""
        locate_lst, ref_station = made_locate_station(self.ref_file, self.selected_files, self.data_path)
        df = pd.DataFrame(locate_lst, columns=ref_station)
        df_year = pd.DataFrame(self.selected_years, columns=['years'])
        df = df_year.join(df)
        # Keep only the first occurrence of any repeated column name.
        df = df.loc[:,~df.columns.duplicated()].copy()
        return df

    def match_year_station(self):
        """Regroup the yearly frames into one frame per station.

        For each station column of ``self.establish_df`` the years flagged
        with 1 are collected; each year contributes a column named after the
        year, and the years are inner-merged on a 'Date' key formatted as
        ``'%m_%d'``.

        Returns:
            dict: station id -> merged frame. Stations that are flagged for
            no year at all are left out.
        """
        station_lst = self.establish_df.columns[1:]
        stations_dict = {}
        for now_station in station_lst:
            year_station = self.establish_df['years'][self.establish_df[now_station] == 1].to_list()
            # Reset per station so data from the previous station can never
            # leak into a station that has no matching year.
            station_df = None
            for a_year in year_station:
                year_frame = self.PM_data[f'{a_year}']
                a_year_df = year_frame[[now_station]].rename(columns={now_station:f'{a_year}'})
                a_year_df['Date'] = year_frame['Date'].apply(lambda x: datetime.strftime(x, '%m_%d'))
                if station_df is None:
                    station_df = a_year_df
                else:
                    station_df = pd.merge(station_df, a_year_df, on='Date')
            if station_df is None:
                continue
            stations_dict[now_station] = station_df
        return stations_dict

    def convert_to_original_yearfill(self):
        """Regroup the per-station frames back into one frame per year.

        Returns:
            dict: year key -> frame with a 'Date' column (``'<year>_%m_%d'``
            strings) and one column per station that has data for that year.
        """
        new_PM_data = {}
        for year_key, year_df in self.PM_data.items():
            uniform_df = pd.DataFrame({'Date': year_df['Date'].apply(lambda x: datetime.strftime(x, '%m_%d'))})
            for station_key, station_df in self.station_dict.items():
                if year_key in station_df.columns:
                    station_in_year = station_df[['Date', year_key]].rename(columns={year_key:station_key}).drop_duplicates(subset=['Date'])
                    uniform_df = pd.merge(uniform_df, station_in_year, on='Date')
            uniform_df['Date'] = uniform_df['Date'].apply(lambda x: year_key + '_' + x)
            new_PM_data[year_key] = uniform_df
        return new_PM_data

    def save_to_csv(self, save_path):
        """Write each filled year to ``<save_path>/<save_name>_<year>_.csv``."""
        for key, val in self.new_PM_data.items():
            val.to_csv(os.path.join(save_path, f'{self.save_name}_{key}_.csv'), index=False)

    def main(self, PM_data=None, save_path=None):
        """Run the whole fill pipeline.

        Args:
            PM_data: Optional dict of year key -> frame; when falsy the
                selected workbooks are loaded from ``data_path``.
            save_path: Optional directory; when given the result is also
                written as CSV files.

        Returns:
            dict: year key -> filled frame (also stored in ``self.new_PM_data``).
        """
        if PM_data:
            self.PM_data = PM_data
        else:
            self.PM_data = load_selected_PM_data(self.data_path, self.selected_files)
        self.establish_df = self.made_establish_df()
        self.station_dict = self.fill_na_all_station(self.sort_columns(self.match_year_station()))
        self.new_PM_data = self.convert_to_original_yearfill()
        if save_path:
            self.save_to_csv(save_path=save_path)
        return self.new_PM_data
        

### Fill NaN on each station by their labels

In [7]:
class Fill_NaN_Label:
    """Fill a station's missing values from the other stations sharing its label."""

    def __init__(self, data_path, selected_years, label_df):
        """Select the workbooks in ``data_path`` that belong to ``selected_years``.

        Args:
            data_path: Directory that is scanned for files starting with 'PM2.5'.
            selected_years: Years to process; a file is selected when
                ``str(year)`` occurs anywhere in its name.
            label_df: Frame with 'station_id' and 'label' columns.
        """
        self.data_path = data_path
        self.selected_years = selected_years
        self.label_df = label_df
        workbook_names = [name for name in os.listdir(self.data_path) if name.startswith('PM2.5')] # Format PM2.5_(year)_.xlsx
        self.selected_files = [f for y in self.selected_years for f in workbook_names if str(y) in f]

        # Define Parameters
        self.new_PM_data = {}
        self.save_name = 'FillByProvide'

        # Pre-Define Function Parameters
        self.PM_data = None

    @staticmethod
    def fill_na_all_label(label_dict):
        """Fill NaN inside every multi-station group with the row mean of the other stations.

        Entries that are one-dimensional (the 'Date' column, or a label with
        a single station) are passed through unchanged. Group frames are
        modified in place, station by station, so later stations see the
        already-filled values of earlier ones.
        """
        filled_groups = {}
        for label_key, group in label_dict.items():
            if group.ndim > 1:
                for station_id in group.columns:
                    station_values = group[station_id].copy()
                    # All columns of the group except the station being filled.
                    other_stations = group.columns.to_list()
                    other_stations.remove(station_id)
                    fill_value = group.loc[station_values.isna(), other_stations].mean(axis=1)
                    group[station_id] = station_values.fillna(fill_value)
            filled_groups[label_key] = group
        return filled_groups

    @staticmethod
    def convert_to_original_labelfill(label_dict):
        """Concatenate 'Date' and every group (in dict order) column-wise into one frame."""
        group_keys = list(label_dict.keys())[1:]
        combined = label_dict['Date']
        for group_key in group_keys:
            combined = pd.concat([combined, label_dict[group_key]], axis=1)
        return combined

    def match_label_station(self, PM_data_year):
        """Group the station columns of one year's frame by their label.

        Returns:
            dict: ``'Date'`` -> the Date column, then ``str(label)`` -> a
            Series (single station) or a DataFrame (several stations).

        Raises:
            IndexError: If a station column has no row in ``self.label_df``.
        """
        groups = {'Date': PM_data_year['Date']}
        for station_id in PM_data_year.columns[1:]:
            is_station = self.label_df['station_id'] == station_id
            group_key = str(self.label_df.loc[is_station, 'label'].to_list()[0])
            if group_key in groups:
                groups[group_key] = pd.concat([groups[group_key], PM_data_year[station_id]], axis=1)
            else:
                groups[group_key] = PM_data_year[station_id]
        return groups

    def save_to_csv(self, save_path):
        """Write each filled year to ``<save_path>/<save_name>_<year>_.csv``."""
        for year_key, frame in self.new_PM_data.items():
            target = os.path.join(save_path, f'{self.save_name}_{year_key}_.csv')
            frame.to_csv(target, index=False)

    def main(self, PM_data=None, save_path=None):
        """Fill every year and optionally save the result.

        Args:
            PM_data: Optional dict of year key -> frame; when falsy the
                selected workbooks are loaded from ``data_path``.
            save_path: Optional directory for the CSV output.

        Returns:
            dict: year key -> filled frame (``self.new_PM_data``).
        """
        self.PM_data = PM_data if PM_data else load_selected_PM_data(self.data_path, self.selected_files)
        for a_year, PM_data_year in self.PM_data.items():
            grouped = self.match_label_station(PM_data_year)
            filled = self.fill_na_all_label(grouped)
            self.new_PM_data[a_year] = self.convert_to_original_labelfill(filled)
        if save_path:
            self.save_to_csv(save_path)
        return self.new_PM_data


### Fill nan on each station by interpolation

In [8]:
class Fill_NaN_Interpolation:
    """Fill missing values of each yearly frame with ``DataFrame.interpolate``."""

    def __init__(self, data_path, selected_years):
        """Select the workbooks in ``data_path`` that belong to ``selected_years``.

        Args:
            data_path: Directory that is scanned for files starting with 'PM2.5'.
            selected_years: Years to process; a file is selected when
                ``str(year)`` occurs anywhere in its name.
        """
        self.data_path = data_path
        self.selected_years = selected_years
        workbook_names = [name for name in os.listdir(self.data_path) if name.startswith('PM2.5')] # Format PM2.5_(year)_.xlsx
        self.selected_files = [f for y in self.selected_years for f in workbook_names if str(y) in f]

        # Define Parameters
        self.inter_method = 'spline'
        self.order = 2
        self.new_PM_data = {}
        self.save_name = 'FillByInterpolation'
        self.print_progess = False

        # Pre-Define Function Parameters
        self.PM_data = None

    def Interpolate_df(self, check_dict):
        """Interpolate every frame that is flagged as having missing values.

        Args:
            check_dict: year key -> truthy when the frame has no NaN; such
                frames are passed through as-is.

        Returns:
            dict: year key -> original or interpolated frame.
        """
        interpolated = {}
        for year_key, frame in self.PM_data.items():
            if check_dict[year_key]:
                interpolated[year_key] = frame
                continue
            # Only these methods take the ``order`` argument here.
            if self.inter_method in ['spline', 'polynomial']:
                interpolated[year_key] = frame.interpolate(self.inter_method, order=self.order)
            else:
                interpolated[year_key] = frame.interpolate(self.inter_method)
        return interpolated

    def save_to_csv(self, save_path):
        """Write each filled year to ``<save_path>/<save_name>_<year>_.csv``."""
        for year_key, frame in self.new_PM_data.items():
            target = os.path.join(save_path, f'{self.save_name}_{year_key}_.csv')
            frame.to_csv(target, index=False)

    def main(self, PM_data=None, save_path=None):
        """Interpolate every year and optionally save the result.

        When ``self.print_progess`` is truthy, the missing-value report is
        printed before and after interpolating.

        Args:
            PM_data: Optional dict of year key -> frame; when falsy the
                selected workbooks are loaded from ``data_path``.
            save_path: Optional directory for the CSV output.

        Returns:
            dict: year key -> filled frame (``self.new_PM_data``).
        """
        self.PM_data = PM_data if PM_data else load_selected_PM_data(self.data_path, self.selected_files)
        verbose = self.print_progess
        if verbose:
            print('--------Before interpolating--------')
        check_dict = Check_for_NonMissing(self.PM_data, verbose)
        if verbose:
            print('-------------------------------------')
        self.new_PM_data = self.Interpolate_df(check_dict)
        if verbose:
            print('--------After interpolating--------')
            Check_for_NonMissing(self.new_PM_data)
            print('-------------------------------------')
        if save_path:
            self.save_to_csv(save_path)
        return self.new_PM_data
            