In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import os

### Load Data

In [None]:
def load_PM_data(data_path):
    """Read the 'PM2.5' sheet of every PM2.5 workbook found in a directory.

    Args:
        data_path: Directory to scan. Only entries whose name starts with
            'PM2.5' are read.

    Returns:
        dict mapping the second '_'-separated token of each file name to the
        DataFrame read from that file's 'PM2.5' sheet.
    """
    frames_by_token = {}
    for file_name in os.listdir(data_path):
        if not file_name.startswith('PM2.5'):
            continue
        workbook_path = os.path.join(data_path, file_name)
        # The right-hand side runs first, so the workbook is read before the
        # file name is split into its key.
        frames_by_token[file_name.split('_')[1]] = pd.read_excel(workbook_path, sheet_name='PM2.5')
    return frames_by_token

def load_selected_PM_data(data_path, selected_data_name):
    """Read the 'PM2.5' sheet of each explicitly listed workbook.

    Args:
        data_path: Directory that contains the workbooks.
        selected_data_name: Iterable of workbook file names to read.

    Returns:
        dict mapping the second '_'-separated token of each file name to the
        DataFrame read from that file's 'PM2.5' sheet.
    """
    def read_workbook(file_name):
        # Read first, then derive the key from the file name.
        sheet = pd.read_excel(os.path.join(data_path, file_name), sheet_name='PM2.5')
        return file_name.split('_')[1], sheet

    return dict(read_workbook(file_name) for file_name in selected_data_name)

### Observe the data

In [None]:
def check_new_station(name_lst, data_path, selected_data_name):
    """List the station codes that first appear in each successive workbook.

    Workbook ``i + 1`` is compared with workbook ``i``; every code found in
    the 'station_detail' sheet of the later workbook but not in the earlier
    one is reported. Each workbook is read only once and carried over to the
    next comparison.

    Args:
        name_lst: Sequence whose length decides how many consecutive pairs
            are compared (``len(name_lst) - 1`` pairs).
        data_path: Directory that contains the workbooks.
        selected_data_name: Workbook file names, indexed in comparison order.

    Returns:
        list of lists; entry ``i`` holds the codes present in workbook
        ``i + 1`` and absent from workbook ``i``, in the later workbook's
        order.

    Raises:
        IndexError: If ``selected_data_name`` has fewer entries than the
            comparison needs.
    """
    def read_station_codes(file_name):
        detail = pd.read_excel(os.path.join(data_path, file_name), sheet_name='station_detail')
        return detail['รหัสสถานี'].to_list()

    pair_count = len(name_lst) - 1
    if pair_count <= 0:
        # Nothing to compare, so no workbook is opened.
        return []

    updates = []
    previous_codes = read_station_codes(selected_data_name[0])
    for idx in range(pair_count):
        current_codes = read_station_codes(selected_data_name[idx + 1])
        known_codes = set(previous_codes)  # constant-time membership test
        updates.append([code for code in current_codes if code not in known_codes])
        # The later workbook becomes the baseline of the next pair.
        previous_codes = current_codes
    return updates

def check_station(ref_station, variable_station):
    """Flag which reference stations also occur in another station list.

    Args:
        ref_station: Reference station identifiers.
        variable_station: Station identifiers to look the references up in.

    Returns:
        list with one entry per reference station, in order: 1 when the
        station is contained in ``variable_station``, otherwise 0.
    """
    return [1 if station_name in variable_station else 0 for station_name in ref_station]

def made_locate_station(ref_name, var_name_lst, data_path):
    """Build a presence matrix of the reference stations across workbooks.

    Args:
        ref_name: File name of the workbook whose 'station_detail' sheet
            supplies the reference station codes.
        var_name_lst: File names of the workbooks to check against it.
        data_path: Directory that contains all the workbooks.

    Returns:
        tuple ``(matrix, ref_station)``: ``matrix`` is a NumPy array with one
        row per entry of ``var_name_lst`` holding the 0/1 flags produced by
        ``check_station``; ``ref_station`` is the list of reference codes.
    """
    def station_codes(file_name):
        detail = pd.read_excel(os.path.join(data_path, file_name), sheet_name='station_detail')
        return detail['รหัสสถานี'].to_list()

    ref_station = station_codes(ref_name)
    presence_rows = [
        check_station(ref_station, station_codes(var_name))
        for var_name in var_name_lst
    ]
    return np.array(presence_rows), ref_station

### Fill NaN for each station using its own yearly records

In [3]:
class Fill_NaN_YearRecord:
    """Fill missing PM2.5 readings of a station from its own earlier years.

    For every station the yearly series are lined up on a month_day key
    ('%m_%d'). A missing value in one year is replaced by the mean of the
    values found on the same day in the earlier selected years, and the
    result is re-assembled into one frame per year.
    """

    def __init__(self, data_path, selected_years):
        """Pick the workbooks in ``data_path`` that match ``selected_years``.

        Args:
            data_path: Directory that holds the PM2.5 workbooks.
            selected_years: Years to process. A workbook is selected when the
                text of a year occurs anywhere in its file name.

        Raises:
            IndexError: If no workbook matches any selected year.
        """
        self.data_path = data_path
        self.selected_years = selected_years
        files_name_lst = [name for name in os.listdir(self.data_path) if name.startswith('PM2.5')] # Format PM2.5_(year)_.xlsx
        # NOTE(review): this is a substring test, so a year whose digits also
        # occur elsewhere in a file name selects that file as well.
        self.selected_files = [
            file_name
            for year in self.selected_years
            for file_name in files_name_lst
            if str(year) in file_name
        ]
        if not self.selected_files:
            raise IndexError(
                f'no PM2.5 workbook in {data_path!r} matches the years {selected_years!r}'
            )
        # The last selected workbook supplies the reference station list.
        self.ref_file = self.selected_files[-1]

        # Define Parameters
        self.save_name = 'fill_by_years'

        # Pre-Define Function Parameters (populated by main())
        self.PM_data = None
        self.establish_df = None
        self.station_dict = None
        self.new_PM_data = None

    @staticmethod
    def _month_day_key(dates):
        """Format each date as '%m_%d' so different years share one key."""
        return dates.apply(lambda stamp: datetime.strftime(stamp, '%m_%d'))

    @staticmethod
    def sort_columns(dct):
        """Reorder every frame to 'Date' followed by its year columns, newest first.

        Columns whose label cannot be converted to ``int`` are dropped from
        the result (apart from 'Date'). The values of ``dct`` are replaced in
        place and the same dict is returned.
        """
        for key in dct:
            frame = dct[key]
            year_numbers = []
            for col_name in frame.columns:
                try:
                    year_numbers.append(int(col_name))
                except (TypeError, ValueError):
                    # Non-numeric labels such as 'Date' are not year columns.
                    continue
            ordered = ['Date'] + [str(year) for year in sorted(year_numbers, reverse=True)]
            dct[key] = frame[ordered]
        return dct

    @staticmethod
    def fill_na_all_station(stations_dict):
        """Fill each year's gaps with the row-wise mean of the columns to its right.

        Expects frames laid out as by ``sort_columns`` ('Date' first, then
        year columns in descending order), so the columns to the right of a
        year are the earlier years. The last column has nothing to its right
        and is left untouched; a gap stays NaN when every earlier year is NaN
        on that row too. The input frames are not modified.

        Returns:
            dict with the same keys holding filled copies of the frames.
        """
        new_stations_dict = {}
        for key, df in stations_dict.items():
            filled = df.copy()
            columns = list(filled.columns)
            # Column 0 is 'Date'. Columns are processed left to right, so the
            # means always come from not-yet-filled (raw) earlier years.
            for idx in range(1, len(columns) - 1):
                earlier_mean = filled[columns[idx + 1:]].mean(axis=1)
                filled[columns[idx]] = filled[columns[idx]].fillna(earlier_mean)
            new_stations_dict[key] = filled
        return new_stations_dict

    def made_establish_df(self):
        """Build the year-by-station presence table.

        Returns:
            DataFrame whose first column 'years' holds ``selected_years`` and
            whose remaining columns hold the flags returned by
            ``made_locate_station`` for each reference station; duplicated
            column labels are dropped, keeping the first.
        """
        locate_lst, ref_station = made_locate_station(self.ref_file, self.selected_files, self.data_path)
        presence = pd.DataFrame(locate_lst, columns=ref_station)
        years = pd.DataFrame(self.selected_years, columns=['years'])
        table = years.join(presence)
        return table.loc[:, ~table.columns.duplicated()].copy()

    def match_year_station(self):
        """Collect, per station, one frame with a 'Date' key and a column per year.

        Only years flagged 1 for the station in ``establish_df`` are used.
        Years are combined with an outer merge on the month_day key, so a day
        that is missing from one year (for example Feb 29) is kept for the
        years that do have it.

        Returns:
            dict mapping station label to its merged frame. A station without
            any flagged year is left out.
        """
        stations_dict = {}
        for now_station in self.establish_df.columns[1:]:
            is_present = self.establish_df[now_station] == 1
            year_station = self.establish_df['years'][is_present].to_list()
            station_df = None  # reset so no frame leaks in from the previous station
            for a_year in year_station:
                year_df = self.PM_data[f'{a_year}']
                a_year_df = year_df[[now_station]].rename(columns={now_station: f'{a_year}'})
                a_year_df = a_year_df.assign(Date=self._month_day_key(year_df['Date']))
                if station_df is None:
                    station_df = a_year_df
                else:
                    station_df = pd.merge(station_df, a_year_df, on='Date', how='outer')
            if station_df is not None:
                stations_dict[now_station] = station_df
        return stations_dict

    def convert_to_original_yearfill(self):
        """Re-assemble the filled station frames into one frame per year.

        Each year keeps exactly the rows of its original 'Date' column (as
        month_day keys, in the original order); station values are attached
        with a left merge so no date of the year is dropped.

        Returns:
            dict mapping each key of ``PM_data`` to a frame with a 'Date'
            column followed by one column per station that has that year.
        """
        new_PM_data = {}
        for year_key, year_df in self.PM_data.items():
            print('Current :', year_key)
            uniform_df = pd.DataFrame({'Date': self._month_day_key(year_df['Date'])})
            for station_key, station_df in self.station_dict.items():
                if year_key not in station_df.columns:
                    continue
                station_in_year = (
                    station_df[['Date', year_key]]
                    .rename(columns={year_key: station_key})
                    .drop_duplicates(subset=['Date'])
                )
                uniform_df = pd.merge(uniform_df, station_in_year, on='Date', how='left')
            new_PM_data[year_key] = uniform_df
        return new_PM_data

    def save_to_csv(self, save_path):
        """Write one CSV per year with 'Date' prefixed by the year key.

        Files are named '<save_name>.<key>.csv'. The prefix is applied to a
        copy, so ``new_PM_data`` keeps its month_day keys and saving twice
        writes the same content.
        """
        for key, val in self.new_PM_data.items():
            dated = val.assign(Date=val['Date'].map(lambda day: key + '_' + day))
            dated.to_csv(os.path.join(save_path, f'{self.save_name}.{key}.csv'), index=False)

    def main(self, save_path=None):
        """Run the whole pipeline: load, align, fill, re-assemble, optionally save.

        Args:
            save_path: Directory for the CSV output; nothing is written when
                it is falsy.

        Returns:
            dict mapping year key to the filled frame of that year.
        """
        self.PM_data = load_selected_PM_data(self.data_path, self.selected_files)
        self.establish_df = self.made_establish_df()
        self.station_dict = self.fill_na_all_station(self.sort_columns(self.match_year_station()))
        self.new_PM_data = self.convert_to_original_yearfill()
        if save_path:
            self.save_to_csv(save_path=save_path)
        return self.new_PM_data
        