## ライブラリのインポート

使用するライブラリを最初にインポートする  
最初にインポートするのは後から特定のセルを実行する際に，それより前のセルをすべて実行する必要をなくすためである

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit

## データ読み込み

今回はExcelファイルで渡されたため、pandasのread_excel()で読み込む

In [3]:
# Load the raw training data from the Excel workbook that was handed over.
train_raw_data = pd.read_excel('./data/train_data.xlsx')
# Preview the first rows to check the original (Japanese) column headers.
train_raw_data.head()

Unnamed: 0,年,月,日,時,PM2.5濃度,PM10濃度,SO2濃度,NO2濃度,CO濃度,O3濃度,気温,気圧,露点温度,降水量,風向,風速
0,2013,3,1,0,3.0,6.0,3.0,8.0,300.0,44.0,-0.9,1025.8,-20.5,0.0,NW,9.3
1,2013,3,1,1,12.0,12.0,3.0,7.0,300.0,47.0,-1.1,1026.1,-21.3,0.0,NW,9.4
2,2013,3,1,2,14.0,14.0,,7.0,200.0,22.0,-1.7,1026.2,-23.0,0.0,NW,8.6
3,2013,3,1,3,12.0,12.0,3.0,5.0,,,-2.1,1027.3,-23.3,0.0,NW,6.6
4,2013,3,1,4,12.0,12.0,3.0,,200.0,11.0,-2.4,1027.7,-22.9,0.0,NW,4.5


## データクレンジング

データ提供型の共同研究では少し使いづらいデータ形式で渡されることがあるので，最初に自分が使いやすい形式に変換を行なう  
Excelソフトを使って手動で編集しても良いが，データ量が多いと処理に時間がかかったり，間違いがあったときに最初からやり直すのが面倒だったりする（マクロを使うのもあり）  
データ処理もスクリプト化して追加データにも対応可能にした方が良い

### 日本語を英数字に変換

カラム名は日本語のままでも使用できるが，扱いが面倒な場合があるため英数字に変換する  
長すぎず分かりやすい英数字表記にする  
DataFrame.columnsに直接代入しても良いが、分かりやすくrename()を使用する

In [4]:
# Mapping from the Japanese column headers to short ASCII names.
# Headers listed here that do not occur in the data are ignored by rename().
jp_to_en = {
    # timestamp components
    '年': 'year',
    '月': 'month',
    '日': 'day',
    '時': 'hour',
    # pollutant concentrations
    'PM2.5濃度': 'PM25',
    'PM10濃度': 'PM10',
    'SO2濃度': 'SO2',
    'NO2濃度': 'NO2',
    'CO濃度': 'CO',
    'O3濃度': 'O3',
    # weather observations
    '気温': 'temp',
    '気圧': 'pres',
    '湿度': 'humi',
    '露点温度': 'dewp',
    '降水量': 'prec',
    '風向': 'wind_dire',
    '風速': 'wd',  # NOTE: this is the wind *speed* column
}
# rename() returns a new frame, so train_raw_data keeps its original headers.
train_data = train_raw_data.rename(columns=jp_to_en)
train_data.head()

Unnamed: 0,year,month,day,hour,PM25,PM10,SO2,NO2,CO,O3,temp,pres,dewp,prec,wind_dire,wd
0,2013,3,1,0,3.0,6.0,3.0,8.0,300.0,44.0,-0.9,1025.8,-20.5,0.0,NW,9.3
1,2013,3,1,1,12.0,12.0,3.0,7.0,300.0,47.0,-1.1,1026.1,-21.3,0.0,NW,9.4
2,2013,3,1,2,14.0,14.0,,7.0,200.0,22.0,-1.7,1026.2,-23.0,0.0,NW,8.6
3,2013,3,1,3,12.0,12.0,3.0,5.0,,,-2.1,1027.3,-23.3,0.0,NW,6.6
4,2013,3,1,4,12.0,12.0,3.0,,200.0,11.0,-2.4,1027.7,-22.9,0.0,NW,4.5


In [5]:
# Combine the separate date/time columns into one timestamp per row, then
# discard the component columns, which are redundant once 'datetime' exists.
date_parts = ['year', 'month', 'day', 'hour']
train_data['datetime'] = pd.to_datetime(train_data[date_parts])
train_data = train_data.drop(columns=date_parts)
train_data.head()

Unnamed: 0,PM25,PM10,SO2,NO2,CO,O3,temp,pres,dewp,prec,wind_dire,wd,datetime
0,3.0,6.0,3.0,8.0,300.0,44.0,-0.9,1025.8,-20.5,0.0,NW,9.3,2013-03-01 00:00:00
1,12.0,12.0,3.0,7.0,300.0,47.0,-1.1,1026.1,-21.3,0.0,NW,9.4,2013-03-01 01:00:00
2,14.0,14.0,,7.0,200.0,22.0,-1.7,1026.2,-23.0,0.0,NW,8.6,2013-03-01 02:00:00
3,12.0,12.0,3.0,5.0,,,-2.1,1027.3,-23.3,0.0,NW,6.6,2013-03-01 03:00:00
4,12.0,12.0,3.0,,200.0,11.0,-2.4,1027.7,-22.9,0.0,NW,4.5,2013-03-01 04:00:00


In [6]:
# Use the timestamp as the row index so the frame is keyed by observation time.
train_data = train_data.set_index('datetime')
train_data.head()

Unnamed: 0_level_0,PM25,PM10,SO2,NO2,CO,O3,temp,pres,dewp,prec,wind_dire,wd
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013-03-01 00:00:00,3.0,6.0,3.0,8.0,300.0,44.0,-0.9,1025.8,-20.5,0.0,NW,9.3
2013-03-01 01:00:00,12.0,12.0,3.0,7.0,300.0,47.0,-1.1,1026.1,-21.3,0.0,NW,9.4
2013-03-01 02:00:00,14.0,14.0,,7.0,200.0,22.0,-1.7,1026.2,-23.0,0.0,NW,8.6
2013-03-01 03:00:00,12.0,12.0,3.0,5.0,,,-2.1,1027.3,-23.3,0.0,NW,6.6
2013-03-01 04:00:00,12.0,12.0,3.0,,200.0,11.0,-2.4,1027.7,-22.9,0.0,NW,4.5


In [7]:
# Fill gaps in the PM2.5 series by interpolating between neighbouring
# observations (pandas' default linear method).
# The result is assigned back to the column on purpose: calling
# interpolate(inplace=True) on train_data['PM25'] is an in-place operation on
# a chained selection, which emits a FutureWarning on recent pandas and, under
# Copy-on-Write, only updates a temporary and leaves train_data unchanged.
train_data['PM25'] = train_data['PM25'].interpolate()

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.cluster import KMeans
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from itertools import product
from tqdm.notebook import tqdm
from datetime import timedelta
from torch import nn
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.optim import lr_scheduler

In [9]:
# Replace the wind-direction labels with integer codes.
# `le` keeps the fitted label-to-code mapping after this cell has run.
le = LabelEncoder()
train_data['wind_dire'] = le.fit_transform(train_data['wind_dire'])

In [10]:
def preprocessing(train, test):
    """Impute and encode a train/test pair of frames without mutating the inputs.

    Steps:
      * test: rows containing any missing value are dropped (never imputed).
      * train['wind_dire']: missing values are forward-filled, then the column
        is label-encoded; the encoder fitted on train is applied to test.
      * remaining train columns: missing values are interpolated, and rows
        that still contain a missing value afterwards are dropped.
      * test columns are reordered to match train.

    Args:
        train: DataFrame containing a 'wind_dire' column; the other columns
            are assumed to be numeric so that interpolate() applies to them.
        test: DataFrame with (at least) the same columns as ``train``.

    Returns:
        A ``(train, test)`` tuple of new DataFrames with identical column
        order. The frames passed in by the caller are left untouched.

    Raises:
        ValueError: from ``LabelEncoder.transform`` if ``test`` contains a
            wind direction that does not occur in ``train``.
    """
    # Work on private copies: the caller's frames (often slices of a larger
    # frame, e.g. cross-validation folds) must not be modified as a side effect.
    train = train.copy()
    # Test data is never imputed; incomplete rows are removed instead.
    test = test.dropna(how='any').copy()

    # Categorical feature: carry the last observed wind direction forward.
    train['wind_dire'] = train['wind_dire'].ffill()

    # Fit the encoder on train only and reuse it for test so that both frames
    # share the same label-to-code mapping.
    le = LabelEncoder()
    train['wind_dire'] = le.fit_transform(train['wind_dire'])
    test['wind_dire'] = le.transform(test['wind_dire'])

    # Continuous features: interpolate everything except the encoded category
    # (its codes must not be interpolated), then put the category back.
    # This moves 'wind_dire' to the last column.
    wind_codes = train['wind_dire']
    train = (
        train.drop(columns='wind_dire')
        .interpolate()
        .assign(wind_dire=wind_codes)
    )
    # Drop rows that still hold a missing value after interpolation
    # (for example leading values that have no earlier observation).
    train = train.dropna(how='any')

    # The column order changed above, so align test with train.
    test = test[train.columns].copy()
    return train, test

In [11]:
# Display the processed frame (PM25 interpolated, wind_dire label-encoded).
train_data

Unnamed: 0_level_0,PM25,PM10,SO2,NO2,CO,O3,temp,pres,dewp,prec,wind_dire,wd
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013-03-01 00:00:00,3.0,6.0,3.0,8.0,300.0,44.0,-0.9,1025.8,-20.5,0.0,7,9.3
2013-03-01 01:00:00,12.0,12.0,3.0,7.0,300.0,47.0,-1.1,1026.1,-21.3,0.0,7,9.4
2013-03-01 02:00:00,14.0,14.0,,7.0,200.0,22.0,-1.7,1026.2,-23.0,0.0,7,8.6
2013-03-01 03:00:00,12.0,12.0,3.0,5.0,,,-2.1,1027.3,-23.3,0.0,7,6.6
2013-03-01 04:00:00,12.0,12.0,3.0,,200.0,11.0,-2.4,1027.7,-22.9,0.0,7,4.5
...,...,...,...,...,...,...,...,...,...,...,...,...
2016-02-29 19:00:00,57.0,71.0,16.0,65.0,900.0,26.0,1.9,1026.6,-15.6,0.0,8,2.2
2016-02-29 20:00:00,70.0,78.0,23.0,64.0,1200.0,27.0,1.2,1026.3,-14.9,0.0,10,1.6
2016-02-29 21:00:00,76.0,79.0,26.0,79.0,1300.0,9.0,1.3,1026.1,-14.8,0.0,10,1.4
2016-02-29 22:00:00,78.0,80.0,24.0,73.0,1200.0,13.0,1.1,1026.0,-14.6,0.0,2,1.4


In [12]:
# Save the processed frame, including its datetime index, as CSV.
train_data.to_csv('./data/processed_pycaret_data.csv')

残りは学習モデルを使って予測を行っていく  
モデルの学習は./tutorial_model.ipynbで行う