In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import font_manager
import matplotlib.pyplot as plt
import os
import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# 분 단위 데이터를 한시간 단위로 그룹핑

In [3]:
def grouping(db_name,ms_name):
    """Downsample one raw sensor CSV to hourly means.

    Reads ``./RawData/<db_name>/<ms_name>``, drops the ``in_noise``,
    ``in_pm01`` and ``in_pm25`` columns, averages every remaining column over
    1-hour bins keyed on the ``time`` column, and writes the result to
    ``./RawData/<db_name>/1hour/<ms_name>`` with ``time`` as the first column.

    Parameters
    ----------
    db_name : str
        Facility folder name under ``./RawData``.
    ms_name : str
        File name of the sensor CSV inside that folder.

    Returns
    -------
    None
        The result is only written to disk.
    """
    source_dir = "./RawData/" + db_name
    hourly_dir = source_dir + "/1hour"

    # The output folder is not guaranteed to exist on a fresh tree; without
    # this the final to_csv fails with a missing-directory error.
    os.makedirs(hourly_dir, exist_ok=True)

    raw = pd.read_csv(source_dir + '/' + ms_name)
    raw = raw.drop(['in_noise', 'in_pm01', 'in_pm25'], axis=1)
    raw['time'] = pd.to_datetime(raw['time'])

    # One frame-level resample replaces the per-column loop. The lowercase
    # 'h' alias is used because the uppercase 'H' alias is deprecated.
    hourly = raw.set_index('time').resample('1h').mean()

    hourly.to_csv(hourly_dir + '/' + ms_name, header=True)

    return

In [4]:
# Facility folders under ./RawData whose raw sensor CSVs are downsampled.
db_names=['air_indoor_도서관','air_indoor_중학교','air_indoor_체육시설','air_indoor_초등학교']

for db_name in db_names:
    for ms_name in os.listdir('./RawData/'+db_name):
        # Skip every directory entry that is not a CSV file.
        if not ms_name.endswith(".csv"):
            continue
        grouping(db_name,ms_name)

# 이상치 처리 기준 산정

## Grouping한 csv파일들 이름 load

In [5]:
# One path accumulator per facility type.
csv_list1, csv_list2, csv_list3, csv_list4 = [], [], [], []

def get_csv_list(db_name,csv_list):
    """Append the paths of every entry in ``./RawData/<db_name>/1hour``.

    ``csv_list`` is extended in place and the same list object is returned.
    Entries are not filtered by extension.
    """
    csv_list += [
        "./RawData/"+db_name+"/1hour/"+entry
        for entry in os.listdir("./RawData/"+db_name+"/1hour")
    ]
    return csv_list

In [6]:
# Fill each facility's accumulator with its hourly CSV paths, in the same
# order as the four original calls.
csv_list1, csv_list2, csv_list3, csv_list4 = (
    get_csv_list(facility, bucket)
    for facility, bucket in (
        ('air_indoor_도서관', csv_list1),
        ('air_indoor_중학교', csv_list2),
        ('air_indoor_체육시설', csv_list3),
        ('air_indoor_초등학교', csv_list4),
    )
)

## 시설별 데이터를 하나의 파일로 병합

In [7]:
def concat_all(db_name,csv_list):
    """Stack several CSV files row-wise into one file per facility.

    Every path in ``csv_list`` is read with ``pd.read_csv`` and the frames
    are concatenated in list order. The result is written, without the index,
    to ``./RawData/All_Grouped(<db_name>).csv``.

    Parameters
    ----------
    db_name : str
        Facility name; only used to build the output file name.
    csv_list : iterable of str
        Paths of the CSV files to merge.

    Returns
    -------
    pandas.Index
        Column labels of the merged frame (empty when ``csv_list`` is empty).
    """
    # Read everything first and concatenate once: concatenating inside the
    # loop re-copies the accumulated frame on every iteration.
    frames = [pd.read_csv(csv_file) for csv_file in csv_list]

    # pd.concat raises on an empty sequence; keep the original behaviour of
    # writing an empty frame in that case.
    data = pd.concat(frames) if frames else pd.DataFrame()

    data.to_csv("./RawData/All_Grouped({:s}).csv".format(db_name),index=False)

    return data.columns

In [8]:
# Merge each facility's hourly files and keep the merged column labels.
columms1, columms2, columms3, columms4 = (
    concat_all(facility, paths)
    for facility, paths in (
        ('air_indoor_도서관', csv_list1),
        ('air_indoor_중학교', csv_list2),
        ('air_indoor_체육시설', csv_list3),
        ('air_indoor_초등학교', csv_list4),
    )
)

## 병합된 파일로부터 사분위 범위의 최소/최대값 계산

In [9]:
# Accumulators for the IQR upper bound of each pollutant (one entry is
# appended per facility by get_boxplot_all).
in_co2_upper, in_pm10_upper, in_voc_upper = [], [], []

In [10]:
font_path = "./NanumGothicBold.ttf"
fontprop = font_manager.FontProperties(fname=font_path)

def get_boxplot_all(db_name,what,list):
    """Compute the IQR fences of one column and save an annotated box plot.

    Reads ``./RawData/All_Grouped(<db_name>).csv``, coerces column ``what``
    to numeric (unparseable cells become NaN and are dropped), computes
    ``Q1 - 1.5*IQR`` and ``Q3 + 1.5*IQR``, draws a box plot annotated with
    both fences, and saves it as ``./IQR/IQR(<db_name>,<what>).png``.

    Parameters
    ----------
    db_name : str
        Facility name used in the input and output file names.
    what : str
        Column to analyse.
    list : list
        Accumulator that receives the upper fence via ``append``. The name
        shadows the builtin but is kept for interface compatibility.

    Returns
    -------
    None
    """
    address = "./RawData/All_Grouped({:s}).csv".format(db_name)
    file = pd.read_csv(address)

    # Clean once and reuse for both the statistics and the plot.
    values = pd.to_numeric(file[what], errors='coerce').dropna()

    # Quartiles and inter-quartile range.
    Q1, Q3 = np.percentile(values, [25, 75])
    IQR = Q3 - Q1

    # Tukey fences.
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Apply the theme before the figure is created so the first figure gets
    # the same styling as every later one.
    sns.set_theme(style='whitegrid')
    fig, ax = plt.subplots(figsize=(24, 24))
    sns.boxplot(data=values, ax=ax)
    ax.set_title('The Distribution of ' + what, fontproperties=fontprop,fontsize=30,fontweight=1000)

    # Annotate both fences on the plot.
    ax.text(0.02, lower_bound, f'Lower Bound: {lower_bound:.2f}', color='red',fontsize=36,fontweight="bold")
    ax.text(0.02, upper_bound, f'Upper Bound: {upper_bound:.2f}', color='red',fontsize=36,fontweight="bold")

    list.append(upper_bound)

    # The output folder may not exist on a fresh tree.
    os.makedirs("./IQR", exist_ok=True)
    fig.savefig("./IQR/IQR({:s},{:s}).png".format(db_name,what))
    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)

    return

In [11]:
db_names=['air_indoor_도서관','air_indoor_중학교','air_indoor_체육시설','air_indoor_초등학교']
whats=['in_co2', 'in_pm10',  'in_voc']

# Accumulators in the same order as `whats`.
list_upper=[in_co2_upper,in_pm10_upper,in_voc_upper]

for db_name in db_names:
    # Pair each pollutant with its accumulator instead of tracking an index.
    for what, upper_bounds in zip(whats, list_upper):
        get_boxplot_all(db_name,what,upper_bounds)

#### 결과 : in_co2(811.33), in_pm10(60.84), in_voc(1665.09)

In [12]:
# Report the largest upper fence collected for each pollutant.
for label, bounds in (
    ("in_co2 : ", in_co2_upper),
    ("in_pm10 : ", in_pm10_upper),
    ("in_voc : ", in_voc_upper),
):
    print(label, max(bounds))

in_co2 :  811.3333333333334
in_pm10 :  60.83527542372882
in_voc :  1665.0862499999998


# 이상치와 누락된 시간행 처리

## Grouping한 csv파일들 경로 load

In [13]:
# Input paths (hourly files) and matching output paths (outliers blanked).
csv_list, new_address = [], []

def get_csv_list(db_name,csv_list,new_address):
    """Collect hourly input paths and their ``Missing_Value(All)`` targets.

    For every entry of ``./RawData/<db_name>/1hour`` the input path is
    appended to ``csv_list`` and the corresponding
    ``./Grouped_Data/<db_name>/Missing_Value(All)/<entry>`` path is appended
    to ``new_address``. Both lists are mutated in place and returned as a
    tuple.
    """
    entries = os.listdir("./RawData/{:s}/1hour".format(db_name))
    for entry in entries:
        source_path = "./RawData/{:s}/1hour/{:s}".format(db_name,entry)
        target_path = "./Grouped_Data/{:s}/{:s}/{:s}".format(db_name,'Missing_Value(All)',entry)
        csv_list.append(source_path)
        new_address.append(target_path)

    return csv_list,new_address

In [14]:
# Accumulate paths for all four facility types into the shared lists
# (get_csv_list appends in place). The return value of the last call is what
# the notebook displays as this cell's output.
get_csv_list('air_indoor_도서관',csv_list,new_address)
get_csv_list('air_indoor_중학교',csv_list,new_address)
get_csv_list('air_indoor_체육시설',csv_list,new_address)
get_csv_list('air_indoor_초등학교',csv_list,new_address)

(['./RawData/air_indoor_도서관/1hour/ICW0W2000087.csv',
  './RawData/air_indoor_도서관/1hour/ICW0W2000088.csv',
  './RawData/air_indoor_도서관/1hour/ICW0W2000089.csv',
  './RawData/air_indoor_도서관/1hour/ICW0W2000094.csv',
  './RawData/air_indoor_도서관/1hour/ICW0W2000095.csv',
  './RawData/air_indoor_도서관/1hour/ICW0W2000096.csv',
  './RawData/air_indoor_도서관/1hour/ICW0W2000097.csv',
  './RawData/air_indoor_도서관/1hour/ICW0W2000098.csv',
  './RawData/air_indoor_도서관/1hour/ICW0W2000099.csv',
  './RawData/air_indoor_도서관/1hour/ICW0W2000100.csv',
  './RawData/air_indoor_도서관/1hour/ICW0W2000101.csv',
  './RawData/air_indoor_도서관/1hour/ICW0W2000102.csv',
  './RawData/air_indoor_도서관/1hour/ICW0W2000108.csv',
  './RawData/air_indoor_도서관/1hour/ICW0W2000128.csv',
  './RawData/air_indoor_도서관/1hour/ICW0W2000129.csv',
  './RawData/air_indoor_중학교/1hour/ICW0W2000010.csv',
  './RawData/air_indoor_중학교/1hour/ICW0W2000011.csv',
  './RawData/air_indoor_중학교/1hour/ICW0W2000013.csv',
  './RawData/air_indoor_중학교/1hour/ICW0W2000014

## 이상치를 none(결측치)로 처리하기

In [15]:
def convert_to_None(address1,address2,whats,max_points):
    """Blank out-of-range readings and save the result to a new file.

    For each ``(column, limit)`` pair from ``zip(whats, max_points)`` the
    column is coerced to numeric (unparseable cells become NaN), and every
    value that is greater than ``limit`` or less than or equal to zero is
    replaced by a missing value. The frame is then written to ``address2``
    without the index.

    Parameters
    ----------
    address1 : str
        Path of the CSV to read.
    address2 : str
        Path of the CSV to write; its parent folder is created if needed.
    whats : sequence of str
        Column names to clean.
    max_points : sequence of float
        Upper limits, aligned with ``whats``.

    Returns
    -------
    None
    """
    file = pd.read_csv(address1)

    for what, Max_Point in zip(whats, max_points):
        file[what] = pd.to_numeric(file[what], errors='coerce')
        out_of_range = (file[what] > Max_Point) | (file[what] <= 0)
        file.loc[out_of_range, what] = None

    # The destination folder is not guaranteed to exist on a fresh tree.
    target_dir = os.path.dirname(address2)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    file.to_csv(address2, index=False)

    return

In [16]:
whats = ['in_co2', 'in_pm10', 'in_voc']
# Outlier threshold per pollutant: the largest upper fence collected so far.
max_points = [
    float(max(bounds))
    for bounds in (in_co2_upper, in_pm10_upper, in_voc_upper)
]

for address1, address2 in zip(csv_list, new_address):
    convert_to_None(address1, address2, whats, max_points)

## 누락된 시간행 탐색하는 함수

In [17]:
from dateutil.parser import parse

def Nan_time_check(address):
    """Return the hourly timestamps that are absent from a CSV file.

    The ``time`` column of the file at ``address`` is parsed to datetimes. An
    hourly range is generated from the first row's time to the last row's
    time, and the timestamps of that range that do not occur in the file are
    returned. The first and last rows are used as the range bounds, so the
    file is assumed to be in chronological order.

    Parameters
    ----------
    address : str
        Path of the CSV file to inspect; it must have a ``time`` column.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``time``) holding the missing timestamps. It is
        empty when nothing is missing or when the file has no rows.
    """
    file = pd.read_csv(address)

    # A header-only file has no first/last row to bound the range.
    if file.empty:
        return pd.DataFrame({'time': pd.to_datetime([])})

    # Vectorised parsing instead of one dateutil call per row.
    observed = pd.to_datetime(file['time'])

    # Positional access so the bounds do not depend on index labels. The
    # lowercase 'h' alias replaces the deprecated uppercase 'H'.
    expected_rows = pd.DataFrame(
        pd.date_range(start=observed.iloc[0], end=observed.iloc[-1], freq='1h'),
        columns=['time'],
    )

    missing_rows = expected_rows[~expected_rows['time'].isin(observed)]

    return missing_rows

## 누락된 시간행 채워넣는 함수

In [18]:
def merge_and_sort_files(address):
    """Insert a placeholder row for every missing hour and rewrite the file.

    The timestamps reported by ``Nan_time_check(address)`` are appended to
    the file's rows with every other column left empty. The combined rows are
    sorted by ``time`` and written back to ``address``, overwriting it.

    Parameters
    ----------
    address : str
        Path of the CSV file to repair in place.

    Returns
    -------
    None
    """
    file_Nan = pd.read_csv(address)
    file_Nan['time'] = pd.to_datetime(file_Nan['time'])

    missing_rows = Nan_time_check(address)

    # Keep the missing timestamps. The previous version overwrote this column
    # with None, so the inserted rows had no time and were sorted to the end
    # of the file instead of filling the gaps.
    placeholder_rows = pd.DataFrame({'time': pd.to_datetime(missing_rows['time'])})

    if placeholder_rows.empty:
        merged_file = file_Nan
    else:
        # Columns absent from placeholder_rows are filled with NaN by concat.
        merged_file = pd.concat([file_Nan, placeholder_rows])
        merged_file['time'] = pd.to_datetime(merged_file['time'])

    merged_file = merged_file.sort_values('time')
    merged_file.to_csv(address, index=False, header=True)

    return

In [19]:
# Fill the missing hourly rows of every outlier-cleaned file, in place.
for address in new_address:
    merge_and_sort_files(address)

# 결측치에 대해 선형보간 적용

## Interpolation 적용할 파일들 경로 load

In [20]:
# Input paths (gap-filled files) and matching output paths (interpolated).
csv_list, new_address = [], []

def get_csv_list(db_name,csv_list,new_address):
    """Collect ``Missing_Value(All)`` inputs and their ``Interpolated`` targets.

    For every entry of ``./Grouped_Data/<db_name>/Missing_Value(All)/`` the
    input path is appended to ``csv_list`` and the corresponding
    ``./Grouped_Data/<db_name>/Interpolated/<entry>`` path is appended to
    ``new_address``. Both lists are mutated in place and returned as a tuple.
    """
    entries = os.listdir("./Grouped_Data/{:s}/Missing_Value(All)/".format(db_name))

    for entry in entries:
        source_path = "./Grouped_Data/{:s}/Missing_Value(All)/{:s}".format(db_name,entry)
        target_path = "./Grouped_Data/{:s}/{:s}/{:s}".format(db_name,'Interpolated',entry)
        csv_list.append(source_path)
        new_address.append(target_path)

    return csv_list,new_address

In [21]:
# Accumulate paths for all four facility types into the shared lists
# (get_csv_list appends in place). The return value of the last call is what
# the notebook displays as this cell's output.
get_csv_list('air_indoor_도서관',csv_list,new_address)
get_csv_list('air_indoor_중학교',csv_list,new_address)
get_csv_list('air_indoor_체육시설',csv_list,new_address)
get_csv_list('air_indoor_초등학교',csv_list,new_address)

(['./Grouped_Data/air_indoor_도서관/Missing_Value(All)/ICW0W2000087.csv',
  './Grouped_Data/air_indoor_도서관/Missing_Value(All)/ICW0W2000088.csv',
  './Grouped_Data/air_indoor_도서관/Missing_Value(All)/ICW0W2000089.csv',
  './Grouped_Data/air_indoor_도서관/Missing_Value(All)/ICW0W2000094.csv',
  './Grouped_Data/air_indoor_도서관/Missing_Value(All)/ICW0W2000095.csv',
  './Grouped_Data/air_indoor_도서관/Missing_Value(All)/ICW0W2000096.csv',
  './Grouped_Data/air_indoor_도서관/Missing_Value(All)/ICW0W2000097.csv',
  './Grouped_Data/air_indoor_도서관/Missing_Value(All)/ICW0W2000098.csv',
  './Grouped_Data/air_indoor_도서관/Missing_Value(All)/ICW0W2000099.csv',
  './Grouped_Data/air_indoor_도서관/Missing_Value(All)/ICW0W2000100.csv',
  './Grouped_Data/air_indoor_도서관/Missing_Value(All)/ICW0W2000101.csv',
  './Grouped_Data/air_indoor_도서관/Missing_Value(All)/ICW0W2000102.csv',
  './Grouped_Data/air_indoor_도서관/Missing_Value(All)/ICW0W2000108.csv',
  './Grouped_Data/air_indoor_도서관/Missing_Value(All)/ICW0W2000128.csv',
  './G

## Linear Interpolation 적용

In [22]:
def Nan_interpolation(csv_file,new_directory):
    """Linearly interpolate the missing sensor values of one CSV file.

    The columns ``in_co2``, ``in_humi``, ``in_pm10``, ``in_temp`` and
    ``in_voc`` are converted with ``pd.to_numeric`` and interpolated with
    ``method='linear', limit_direction='both'``, so gaps at the start and end
    of the series are filled as well. The frame is written without the index.

    Parameters
    ----------
    csv_file : str
        Path of the CSV to read.
    new_directory : str
        Path of the CSV *file* to write, despite the parameter name. Its
        parent folder is created if needed.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        From ``pd.to_numeric`` when a column holds non-numeric text.
    """
    file = pd.read_csv(csv_file)

    # One loop instead of five copy-pasted conversion/interpolation pairs.
    for column in ('in_co2', 'in_humi', 'in_pm10', 'in_temp', 'in_voc'):
        numeric_values = pd.to_numeric(file[column])
        file[column] = numeric_values.interpolate(method='linear', limit_direction='both')

    # The destination folder is not guaranteed to exist on a fresh tree.
    target_dir = os.path.dirname(new_directory)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    file.to_csv(new_directory, index=False)

    return

In [23]:
# Interpolate every gap-filled file into its matching output path.
for csv_file, new_directory in zip(csv_list, new_address):
    Nan_interpolation(csv_file,new_directory)

# 한시간 후의 PM10값을 Y값으로 Mapping

## 적용할 파일의 경로 load

In [24]:
# Input paths (interpolated files) and matching output paths (target added).
csv_list, new_addresses = [], []

def get_csv_list(db_name,csv_list,new_addresses):
    """Collect ``Interpolated`` inputs and their ``Mapped`` targets.

    For every entry of ``./Grouped_Data/<db_name>/Interpolated/`` the input
    path is appended to ``csv_list`` and the corresponding
    ``./Grouped_Data/<db_name>/Mapped/<entry>`` path is appended to
    ``new_addresses``. Both lists are mutated in place and returned as a
    tuple.
    """
    entries = os.listdir("./Grouped_Data/{:s}/Interpolated/".format(db_name))

    for entry in entries:
        source_path = "./Grouped_Data/{:s}/Interpolated/{:s}".format(db_name,entry)
        target_path = "./Grouped_Data/{:s}/Mapped/{:s}".format(db_name,entry)
        csv_list.append(source_path)
        new_addresses.append(target_path)

    return csv_list,new_addresses

In [25]:
# Accumulate paths for all four facility types into the shared lists
# (get_csv_list appends in place). The return value of the last call is what
# the notebook displays as this cell's output.
get_csv_list('air_indoor_도서관',csv_list,new_addresses)
get_csv_list('air_indoor_중학교',csv_list,new_addresses)
get_csv_list('air_indoor_체육시설',csv_list,new_addresses)
get_csv_list('air_indoor_초등학교',csv_list,new_addresses)

(['./Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000087.csv',
  './Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000088.csv',
  './Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000089.csv',
  './Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000094.csv',
  './Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000095.csv',
  './Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000096.csv',
  './Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000097.csv',
  './Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000098.csv',
  './Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000099.csv',
  './Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000100.csv',
  './Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000101.csv',
  './Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000102.csv',
  './Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000108.csv',
  './Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000128.csv',
  './Grouped_Data/air_indoor_도서관/Interpolated/ICW0W2000129.csv',
  './Grouped_Data/air_ind

## Y값 Mapping

In [26]:
def y_mapping(address,new_address):
    """Add the next row's ``in_pm10`` as the target column ``Y_pm10``.

    Each row receives the ``in_pm10`` value of the row that follows it. The
    last row has no successor and is dropped. The result is written to
    ``new_address`` without the index.

    Parameters
    ----------
    address : str
        Path of the CSV to read; it must have an ``in_pm10`` column.
    new_address : str
        Path of the CSV to write; its parent folder is created if needed.

    Returns
    -------
    None
    """
    file = pd.read_csv(address)

    # shift(-1) aligns every row with its successor; the final row gets NaN
    # and is removed on the next line.
    file['Y_pm10'] = file['in_pm10'].shift(-1)
    file = file.iloc[:-1]

    # The destination folder is not guaranteed to exist on a fresh tree.
    target_dir = os.path.dirname(new_address)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    file.to_csv(new_address, index=False, header=True)

    return

In [27]:
# Add the next-hour PM10 target to every interpolated file.
for address,new_address in zip(csv_list,new_addresses):
    y_mapping(address,new_address)

# 시설 유형별로 가장 긴 샘플들 병합

In [28]:
def file_merge(address):
    """Stack several CSV files row-wise and save them as ``By_Length.csv``.

    Parameters
    ----------
    address : iterable of str
        Paths of the CSV files to merge, in the order they should appear.

    Returns
    -------
    pandas.DataFrame
        The merged frame. Each source file keeps its own row labels, so the
        index of the result is not unique. It is empty when ``address`` is
        empty.

    Notes
    -----
    The frame is written, without the index, to
    ``./Merged_Data/By_Length.csv``.
    """
    # Read everything first and concatenate once: concatenating inside the
    # loop re-copies the accumulated frame on every iteration.
    frames = [pd.read_csv(path) for path in address]

    # pd.concat raises on an empty sequence; keep the original behaviour of
    # producing an empty frame in that case.
    new = pd.concat(frames) if frames else pd.DataFrame()

    # The output folder is not guaranteed to exist on a fresh tree.
    os.makedirs("./Merged_Data", exist_ok=True)
    new.to_csv("./Merged_Data/By_Length.csv",index=False)

    return new

In [29]:
# One hand-picked sample per facility type (per the section heading, the
# longest sample of each type). They are merged in this listed order.
address=['./Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000094.csv',
        './Grouped_Data/air_indoor_초등학교/Mapped/ICW0W2000022.csv',
        './Grouped_Data/air_indoor_체육시설/Mapped/ICW0W2001037.csv',
        './Grouped_Data/air_indoor_중학교/Mapped/ICW0W2000020.csv']

# Writes ./Merged_Data/By_Length.csv and keeps the merged frame in memory.
merged_data=file_merge(address)

## scaler 훈련&저장 , scaler 적용된 파일을 별도의 경로에 저장

In [30]:
def train_scaler(new,columm_list):
    """Fit, save and apply one MinMaxScaler per column.

    For every column in ``columm_list`` a ``MinMaxScaler`` is fitted on the
    whole column of ``new``, saved to ``./Merged_Data/Scaler/<column>.pkl``,
    and used to transform that column. The scaled columns, together with the
    unscaled ``time`` column, are written to ``./Merged_Data/Scaled.csv``.

    Parameters
    ----------
    new : pandas.DataFrame
        Source frame; it must contain ``time`` and every listed column.
    columm_list : iterable of str
        Names of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        Frame holding ``time`` plus the scaled columns.
    """
    # NOTE(review): the scalers are fitted on every row of `new`. If this
    # frame is later split into train/valid/test, the scaling statistics
    # include the held-out rows. Confirm this is intended.

    # The scaler folder is not guaranteed to exist on a fresh tree.
    os.makedirs("./Merged_Data/Scaler", exist_ok=True)

    file=pd.DataFrame()
    file["time"]=new["time"]
    for columm in columm_list:
        # Build the (n, 1) array once and reuse it for fit and transform.
        values=np.array(new[columm]).reshape(-1,1)
        scaler=MinMaxScaler().fit(values)
        joblib.dump(scaler,"./Merged_Data/Scaler/{:s}.pkl".format(columm))
        file[columm]=scaler.transform(values)
    file.to_csv("./Merged_Data/{:s}.csv".format('Scaled'),index=False)

    return file

In [31]:
# Scale every column except the timestamp.
columm_list=merged_data.columns.drop("time")

train_scaler(merged_data,columm_list)

Unnamed: 0,time,in_co2,in_humi,in_pm10,in_temp,in_voc,Y_pm10
0,2020-06-01 00:00:00+00:00,0.389964,0.473001,0.240888,0.719710,0.004820,0.200603
1,2020-06-01 01:00:00+00:00,0.385841,0.486241,0.200603,0.713185,0.007584,0.171401
2,2020-06-01 02:00:00+00:00,0.382704,0.497501,0.171401,0.705381,0.004179,0.155385
3,2020-06-01 03:00:00+00:00,0.379933,0.507269,0.155385,0.698556,0.003278,0.143053
4,2020-06-01 04:00:00+00:00,0.379093,0.518951,0.143053,0.694177,0.010778,0.123596
...,...,...,...,...,...,...,...
6618,2023-01-31 18:00:00+00:00,0.117454,0.345535,0.185530,0.188019,0.025577,0.169087
6619,2023-01-31 19:00:00+00:00,0.109944,0.349948,0.169087,0.187493,0.026248,0.158674
6620,2023-01-31 20:00:00+00:00,0.101463,0.355140,0.158674,0.188676,0.027310,0.152645
6621,2023-01-31 21:00:00+00:00,0.090513,0.354361,0.152645,0.186617,0.026228,0.154563


# 병합한 데이터 샘플을 train:valid:test로 split (현재 코드 기준 90:5:5 비율 — test_size=0.1로 분리한 뒤 절반씩 나누므로 8:1:1이 아님)

## 적용할 경로 load

In [32]:
# Input (scaled, merged data) and the six output files of the split.
address='./Merged_Data/Scaled.csv'
trainx_address='./Merged_Data/train_x.csv'
testx_address='./Merged_Data/test_x.csv'
trainy_address='./Merged_Data/train_y.csv'
testy_address='./Merged_Data/test_y.csv'
validx_address='./Merged_Data/valid_x.csv'
validy_address='./Merged_Data/valid_y.csv'

## split 적용

In [33]:
def data_split(address,trainx_address,testx_address,validx_address,trainy_address,testy_address,validy_address,holdout_size=0.1,test_share=0.5):
    """Split a CSV into ordered train/valid/test sets and save all six parts.

    The file at ``address`` is separated into features (every column except
    ``Y_pm10``) and target (``Y_pm10``). The rows are split without
    shuffling: the final ``holdout_size`` fraction is held out, and that
    holdout is divided into validation (first part) and test (last
    ``test_share`` fraction).

    With the defaults the result is 90% train, 5% valid, 5% test. This is
    *not* 8:1:1; pass ``holdout_size=0.2`` to obtain an 8:1:1 split.

    Parameters
    ----------
    address : str
        Path of the CSV to split.
    trainx_address, testx_address, validx_address : str
        Output paths for the feature sets.
    trainy_address, testy_address, validy_address : str
        Output paths for the target sets.
    holdout_size : float, default 0.1
        Fraction of all rows held out for validation plus test.
    test_share : float, default 0.5
        Fraction of the holdout that becomes the test set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_x, test_x, valid_x, train_y, test_y, valid_y)``.
    """
    data=pd.read_csv(address)

    data_x=data.drop('Y_pm10',axis=1)
    data_y=data['Y_pm10']

    # shuffle=False keeps chronological order: train comes first, then
    # valid, then test.
    train_x,holdout_x,train_y,holdout_y=train_test_split(data_x,data_y,test_size=holdout_size,shuffle=False)
    valid_x,test_x,valid_y,test_y=train_test_split(holdout_x,holdout_y,test_size=test_share,shuffle=False)

    # The targets come back as Series; callers receive DataFrames.
    train_y=train_y.to_frame()
    test_y=test_y.to_frame()
    valid_y=valid_y.to_frame()

    outputs=(
        (train_x,trainx_address),
        (test_x,testx_address),
        (valid_x,validx_address),
        (train_y,trainy_address),
        (test_y,testy_address),
        (valid_y,validy_address),
    )
    for part,path in outputs:
        part.to_csv(path,index=False,header=True)

    return train_x,test_x,valid_x,train_y,test_y,valid_y

In [34]:
# Split the scaled data and write the six CSV files; the splits are also
# kept in memory.
train_x,test_x,valid_x,train_y,test_y,valid_y=data_split(address,trainx_address,testx_address,validx_address,trainy_address,testy_address,validy_address)

# 시계열 분해 (statsmodels `seasonal_decompose` 이동평균 방식 사용 — STL 알고리즘 아님)

## 병합된 파일에 적용

In [35]:
data_path = './Merged_Data'
df = pd.read_csv(os.path.join(data_path, 'By_Length.csv'), engine='python')

# Decompose PM10 and keep seasonal + trend, i.e. the series without the
# residual component.
res = sm.tsa.seasonal_decompose(df["in_pm10"], period=7, extrapolate_trend=1)

# Replace the raw PM10 column with the smoothed one (added as last column).
df2 = df.drop("in_pm10", axis=1)
df2['in_pm10_Seasonal_Trend'] = res.seasonal + res.trend

df2.to_csv("./Merged_Data/Seasonal_Trend.csv", index=False)

## train,valid,test set에도 적용

In [36]:
def train_scaler(data):
    """Scale ``in_pm10_Seasonal_Trend`` and splice it into the saved splits.

    A ``MinMaxScaler`` is fitted on the whole ``in_pm10_Seasonal_Trend``
    column of ``data`` and saved to
    ``./Merged_Data/Scaler/in_pm10_Seasonal_Trend.pkl``. The scaled values
    are cut into consecutive slices whose lengths match the rows of
    ``train_x.csv``, ``valid_x.csv`` and ``test_x.csv`` (in that order). In
    each of those files the ``in_pm10`` column is replaced by the matching
    slice, and the result is saved as ``<split>_x(Seasonal_Trend).csv``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``in_pm10_Seasonal_Trend`` column whose row order
        matches train, then valid, then test.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test)`` feature frames with the new column.

    Raises
    ------
    ValueError
        If the three split files together do not have exactly as many rows
        as ``data``. This is checked before any output file is written.
    """
    # The scaler folder is not guaranteed to exist on a fresh tree.
    os.makedirs("./Merged_Data/Scaler", exist_ok=True)

    values=np.array(data["in_pm10_Seasonal_Trend"]).reshape(-1,1)
    scaler=MinMaxScaler().fit(values)
    joblib.dump(scaler,"./Merged_Data/Scaler/{:s}.pkl".format("in_pm10_Seasonal_Trend"))

    # Flatten to 1-D so each slice is assigned as a plain column.
    scaled=scaler.transform(values).ravel()

    # (input file, output file template), in the order the rows appear.
    split_files=(
        ("./Merged_Data/train_x.csv","./Merged_Data/train_x({:s}).csv"),
        ("./Merged_Data/valid_x.csv","./Merged_Data/valid_x({:s}).csv"),
        ("./Merged_Data/test_x.csv","./Merged_Data/test_x({:s}).csv"),
    )
    frames=[pd.read_csv(source).drop("in_pm10",axis=1) for source,_ in split_files]

    # Fail early with a clear message instead of part-way through the
    # column assignments.
    total_rows=sum(len(frame) for frame in frames)
    if total_rows!=len(scaled):
        raise ValueError(
            "train/valid/test files hold {:d} rows but data has {:d}".format(total_rows,len(scaled))
        )

    start=0
    for frame,(_,target) in zip(frames,split_files):
        stop=start+len(frame)
        frame["in_pm10_Seasonal_Trend"]=scaled[start:stop]
        frame.to_csv(target.format('Seasonal_Trend'),index=False)
        start=stop

    new1,new2,new3=frames

    return new1,new2,new3

In [37]:
# Uses the single-argument train_scaler defined in the previous cell; the
# returned tuple of frames is what the notebook displays as cell output.
train_scaler(df2)

(                            time    in_co2   in_humi   in_temp    in_voc  \
 0      2020-06-01 00:00:00+00:00  0.389964  0.473001  0.719710  0.004820   
 1      2020-06-01 01:00:00+00:00  0.385841  0.486241  0.713185  0.007584   
 2      2020-06-01 02:00:00+00:00  0.382704  0.497501  0.705381  0.004179   
 3      2020-06-01 03:00:00+00:00  0.379933  0.507269  0.698556  0.003278   
 4      2020-06-01 04:00:00+00:00  0.379093  0.518951  0.694177  0.010778   
 ...                          ...       ...       ...       ...       ...   
 55222  2022-05-21 01:00:00+00:00  0.072552  0.519730  0.595503  0.021752   
 55223  2022-05-21 02:00:00+00:00  0.065173  0.525961  0.594101  0.021802   
 55224  2022-05-21 03:00:00+00:00  0.069322  0.536604  0.592569  0.022744   
 55225  2022-05-21 04:00:00+00:00  0.075152  0.538422  0.591430  0.023024   
 55226  2022-05-21 05:00:00+00:00  0.077883  0.538422  0.590598  0.023204   
 
        in_pm10_Seasonal_Trend  
 0                    0.226442  
 1      

## 적용할 파일의 경로 load

In [38]:
# Input paths (target-mapped files) and matching output paths (decomposed).
csv_list=[]
new_csv_list=[]

def get_csv_list(db_name,csv_list,new_csv_list):
    """Collect ``Mapped`` inputs and their ``Seasonal_Trend`` targets.

    For every entry of ``./Grouped_Data/<db_name>/Mapped/`` the input path is
    appended to ``csv_list`` and the corresponding
    ``./Grouped_Data/<db_name>/Seasonal_Trend/<entry>`` path is appended to
    ``new_csv_list``. The ``Seasonal_Trend`` folder is created if it does not
    exist, even when ``Mapped`` has no entries.

    Parameters
    ----------
    db_name : str
        Facility folder name under ``./Grouped_Data``.
    csv_list : list
        Receives the input paths (mutated in place).
    new_csv_list : list
        Receives the output paths (mutated in place).

    Returns
    -------
    tuple of list
        ``(csv_list, new_csv_list)``, the same objects that were passed in.
    """
    # Listing first means a missing Mapped folder raises before anything is
    # created.
    list1=os.listdir("./Grouped_Data/{:s}/Mapped/".format(db_name))

    # Create the output folder once, not once per file.
    os.makedirs("./Grouped_Data/{:s}/Seasonal_Trend/".format(db_name),exist_ok=True)

    for file in list1:
        csv_list.append("./Grouped_Data/{:s}/Mapped/{:s}".format(db_name,file))
        new_csv_list.append("./Grouped_Data/{:s}/Seasonal_Trend/{:s}".format(db_name,file))

    return csv_list,new_csv_list

In [39]:
# Accumulate paths for all four facility types into the shared lists
# (get_csv_list appends in place). The return value of the last call is what
# the notebook displays as this cell's output.
get_csv_list('air_indoor_도서관',csv_list,new_csv_list)
get_csv_list('air_indoor_중학교',csv_list,new_csv_list)
get_csv_list('air_indoor_체육시설',csv_list,new_csv_list)
get_csv_list('air_indoor_초등학교',csv_list,new_csv_list)

(['./Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000087.csv',
  './Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000088.csv',
  './Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000089.csv',
  './Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000094.csv',
  './Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000095.csv',
  './Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000096.csv',
  './Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000097.csv',
  './Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000098.csv',
  './Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000099.csv',
  './Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000100.csv',
  './Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000101.csv',
  './Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000102.csv',
  './Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000108.csv',
  './Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000128.csv',
  './Grouped_Data/air_indoor_도서관/Mapped/ICW0W2000129.csv',
  './Grouped_Data/air_indoor_중학교/Mapped/ICW0W2000010.csv',
  './Grouped_Data/air_indoor_중학교/Mapped/ICW0W2000011.csv

## 모든 샘플에 대하여 적용하고 scaler 저장

In [40]:
def seasonal_grouped(csv_file,new_csv_file):
    """Replace ``in_pm10`` with its seasonal-plus-trend component.

    The ``in_pm10`` column of ``csv_file`` is decomposed with
    ``sm.tsa.seasonal_decompose(period=7, extrapolate_trend=1)``. The sum of
    the seasonal and trend parts is stored as ``in_pm10_Seasonal_Trend`` (the
    last column), the original ``in_pm10`` column is removed, and the frame
    is written to ``new_csv_file`` without the index.
    """
    frame = pd.read_csv(csv_file)

    decomposition = sm.tsa.seasonal_decompose(frame["in_pm10"], period=7, extrapolate_trend=1)
    smoothed = decomposition.seasonal + decomposition.trend

    result = frame.assign(in_pm10_Seasonal_Trend=smoothed).drop("in_pm10", axis=1)
    result.to_csv(new_csv_file, index=False)

    return

In [41]:
# Decompose every target-mapped file into its Seasonal_Trend counterpart.
for csv_file,new_csv_file in zip(csv_list,new_csv_list):
    seasonal_grouped(csv_file,new_csv_file)