In [1]:
import datetime as dt
import re
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from pytorch_tabnet.pretraining import TabNetPretrainer
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GroupKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering
from geopy.distance import geodesic
#import pandas_profiling as pdp
import optuna
import mlflow
import pickle
import torch

In [2]:
# Choose the MLflow experiment: the named one for runs worth keeping, 'tmp' for scratch work.
# experiment_name = 'Signate_Sony_pm2.5' # for recorded runs
experiment_name = 'tmp' # for trial-and-error runs


# Start a run; run_id is reused further down to name the saved model and submission files.
mlflow.set_experiment(experiment_name)
run = mlflow.start_run()
run_id = run.info.run_id
print("Active run_id: {}".format(run_id))

Active run_id: 268ced833f134275b33f04ddd7b338e9


In [3]:
class Const:
    '''
    Class holding the notebook's constants. Constants are defined only here.
    A constant is logged to mlflow only at the point where it is used.
    '''
    # LightGBM-related switches and settings
    do_use_saved_lgbmmodel = False
    do_tuning_hypara_lgbm = False
    do_training_lgbm = True
    do_predict_lgbm = True
    do_use_bestparam_lgbm = False
    cv_n_splits_lgbm = 10
    hypara_cv_n_splits_lgbm = 5
    do_save_lgbmmodel = True
    # TabNet-related switches and settings
    do_use_saved_tabnetmodel = False
    do_tuning_hypara_tabnet = False
    do_pretraining = False
    do_training_tabnet = False
    do_predict_tabnet = False
    do_use_bestparam_tabnet = False
    cv_n_splits_tabnet = 10
    hypara_cv_n_splits_tabnet = 5
    do_save_tabnetmodel = False
    # Ensemble-related weights
    weight_lgbm = 0.5
    weight_tabnet = 0.5
    # Miscellaneous
    do_submit = True

In [4]:
# Initialisation: None marks "no prediction from this model", which ensemble() checks for.
y_pred_lgbm = None
y_pred_tabnet = None

In [5]:
# Paths are relative to the notebook's working directory.
trainpath =  './01_data/train.csv'
testpath =  './01_data/test.csv'

# Load the train and test CSV files into DataFrames.
train_df = pd.read_csv(trainpath)
test_df = pd.read_csv(testpath)

## preprocessing

In [6]:
def label_encoding(df):
    '''
    Integer-encode every ``category`` column with a freshly fitted ``LabelEncoder``.

    The frame is copied first, so a slice such as ``train_df[features]`` can be
    passed in without writing into it (which raised SettingWithCopyWarning).

    Args:
        df(pandas.DataFrame): frame whose categorical columns should be encoded.
    Returns:
        pandas.DataFrame: copy of ``df`` with each categorical column replaced by integer codes.
    '''
    encoded = df.copy()
    for col in encoded.select_dtypes(include='category').columns:
        print(col)
        # NOTE(review): the encoder is fitted on each call, so frames encoded in
        # separate calls only share codes when they contain the same set of values.
        encoded[col] = LabelEncoder().fit_transform(encoded[col].values)
    return encoded

def onehot_encoding(df):
    '''
    Replace every ``category`` column by its one-hot (dummy) columns.

    Args:
        df(pandas.DataFrame): input frame; it is not modified.
    Returns:
        pandas.DataFrame: the non-categorical columns followed by the dummy columns.
    '''
    cat_cols = df.dtypes[df.dtypes == 'category'].index
    dummies = pd.get_dummies(df[cat_cols])
    remainder = df.drop(cat_cols, axis=1)
    return pd.concat([remainder, dummies], axis=1)

def commom_preprocessing(df):
    '''
    Preprocessing shared by all models: cast the identifier-like columns to ``category``.

    Args:
        df(pandas.DataFrame): frame with ``Country``, ``City`` and ``year`` columns; modified in place.
    Returns:
        pandas.DataFrame: the same frame object.
    '''
    # dtype conversion
    for col in ('Country', 'City', 'year'):
        df[col] = df[col].astype('category')
    return df

def lgbm_preprocessing(df):
    '''
    LightGBM-specific preprocessing hook; currently returns ``df`` unchanged.
    '''
    return df

def tabnet_preprocessing(df):
    '''
    TabNet-specific preprocessing: label-encode the categorical columns.

    Args:
        df(pandas.DataFrame): feature frame.
    Returns:
        pandas.DataFrame: result of ``label_encoding(df)``.
    '''
    # Alternative kept for reference: df = onehot_encoding(df)
    encoded = label_encoding(df)
    return encoded

In [7]:
# Cast Country / City / year to category in both frames.
train_df = commom_preprocessing(train_df)
test_df = commom_preprocessing(test_df)

In [8]:
def add_cluster(train_df, test_df, n_clusters, linkage):
    '''
    Add a ``cluster`` column obtained by agglomerative clustering of the cities.

    The distinct (Country, City, lat, lon) rows of train and test are collected,
    a matrix of pairwise geodesic distances (km) is built, and the labels of an
    ``AgglomerativeClustering`` fit on that matrix are mapped back by city name.

    Args:
        train_df(pandas.DataFrame): training frame, already preprocessed to some extent.
        test_df(pandas.DataFrame): test frame, already preprocessed to some extent.
        n_clusters(int): number of clusters requested.
        linkage(str): linkage criterion forwarded to ``AgglomerativeClustering``.
    Returns:
        train_df, test_df: new frames carrying the extra ``train`` and ``cluster``
        columns. The frames passed in are left untouched.
    '''
    # Tag the rows on copies (assign) so the caller's frames are not modified.
    all_df = pd.concat([train_df.assign(train=True), test_df.assign(train=False)])

    cities_df = all_df[['Country', 'City', 'lat', 'lon']].drop_duplicates()
    world_cities = cities_df.sort_values('lat', ascending=False).reset_index(drop=True)
    spots = list(zip(world_cities['lat'], world_cities['lon']))
    n_world = len(world_cities)
    print('n_world =', n_world)

    # Build the city-to-city distance matrix. It is symmetric with a zero
    # diagonal, so each pair is computed once and mirrored.
    world_mtx = np.zeros((n_world, n_world))
    for i in range(n_world):
        for j in range(i + 1, n_world):
            dist_km = geodesic(spots[i], spots[j]).km
            world_mtx[i, j] = dist_km
            world_mtx[j, i] = dist_km

    # NOTE(review): no precomputed metric is requested, so fit() treats each row of
    # the distance matrix as a feature vector - confirm this is the intended clustering.
    clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage).fit(world_mtx)

    # DataFrame holding city name, latitude, longitude and cluster number.
    cluster_df = world_cities[['City', 'lat', 'lon']].copy()
    cluster_df['cluster'] = clustering.labels_
    # NOTE(review): the mapping is keyed by City name alone; presumably city names
    # are unique across countries/coordinates - verify, otherwise map() raises.
    city_to_cluster = cluster_df.set_index('City')['cluster']
    all_df['cluster'] = all_df['City'].map(city_to_cluster)

    is_train = all_df['train']
    train_df = all_df[is_train]
    test_df = all_df[~is_train]

    return train_df, test_df

def add_country_month_aggregation(train_df, test_df):
    '''
    Add per-(Country, month) aggregates of the sensor statistics as features.

    Every column whose name contains ``_max``, ``_mid``, ``_min`` or ``_var``
    (except the ``pm25`` target columns) is aggregated with mean / max / min /
    std over train and test together, and the result is left-merged back on
    ``Country`` and ``month``. New columns are named
    ``<column>_Country_month_<mean|max|min|std>``.

    Args:
        train_df(pandas.DataFrame): training frame with ``Country`` and ``month`` columns.
        test_df(pandas.DataFrame): test frame with the same feature columns.
    Returns:
        train_df, test_df: merged frames (they also carry the helper ``train``
        column); the index is a fresh RangeIndex with the train rows first.
        group_df_columns(pandas.Index): names of the added aggregate columns.
        The frames passed in are left untouched.
    '''
    keys = ['Country', 'month']
    # Tag the rows on copies (assign) so the caller's frames are not modified.
    all_df = pd.concat([train_df.assign(train=True), test_df.assign(train=False)])
    agg_columns = [
        c for c in train_df.columns
        if re.search(r'_max|_mid|_min|_var', c) and not re.search(r'pm25', c)
    ]

    # String aggregation names keep the column suffixes independent of the
    # numpy version; observed=True skips category combinations that never occur
    # (they could not match any row in the left merge anyway).
    group_df = (
        all_df[keys + agg_columns]
        .groupby(keys, observed=True)
        .agg(['mean', 'max', 'min', 'std'])
    )
    # Flatten the (column, statistic) MultiIndex into plain column names.
    group_df.columns = ['_Country_month_'.join(pair) for pair in group_df.columns]
    group_df = group_df.astype('float64')
    group_df_columns = group_df.columns

    out_df = pd.merge(all_df, group_df.reset_index(), how='left', on=keys)

    is_train = out_df['train']
    train_df = out_df[is_train]
    test_df = out_df[~is_train]
    return train_df, test_df, group_df_columns
    
def add_feature_common(df):
    '''
    Add the features that are computed the same way for train and test.

    Adds ``date`` (datetime built from year / month / day) and ``totaldate``
    (day of the year, 1-366).

    Args:
        df(pandas.DataFrame): frame with ``year``, ``month`` and ``day`` columns; modified in place.
    Returns:
        pandas.DataFrame: the same frame object.
    '''
    # Northern or southern hemisphere
    # df['hemisphere'] = (df['lat'] >= 0)

    # Day-of-year calculation
    date_text = df['year'].astype(str) + '-' + df['month'].astype(str) + '-' + df['day'].astype(str)
    df['date'] = pd.to_datetime(date_text)
    # df['date_boy'] = pd.to_datetime(df['year'].astype(str)+'-01-01') # beginning of the year
    # df['totaldate'] = df['date'] - df['date_boy']
    # Read the day of the year from the datetime accessor instead of formatting
    # each date as a '%j' string and parsing it back.
    df['totaldate'] = df['date'].dt.dayofyear.astype(int)

    # # Per-country mean of the CO median
    # mean_co_mid = df.groupby('Country')['co_mid'].mean()
    # df['country_average_co_mid'] = df['Country'].map(mean_co_mid).astype('float64')
    # # Per-country maximum of the CO median
    # max_co_mid = df.groupby('Country')['co_mid'].max()
    # df['country_max_co_mid'] = df['Country'].map(max_co_mid).astype('float64')
    # # Per-country minimum of the CO median
    # min_co_mid = df.groupby('Country')['co_mid'].min()
    # df['country_min_co_mid'] = df['Country'].map(min_co_mid).astype('float64')

    return df

def add_GDP(df):
    '''
    Placeholder for a GDP feature; not implemented yet.

    Returns ``df`` unchanged so that ``df = add_GDP(df)`` follows the same
    convention as the other ``add_*`` helpers instead of replacing the frame
    with None.
    '''
    return df

def add_feature_train(df):
    '''
    Add per-country mean / max / min of ``pm25_mid`` to the training frame.

    Reference: https://teratail.com/questions/204630

    Args:
        df(pandas.DataFrame): frame with ``Country`` and ``pm25_mid`` columns; modified in place.
    Returns:
        pandas.DataFrame: the same frame object with ``country_average_pm25_mid``,
        ``country_max_pm25_mid`` and ``country_min_pm25_mid`` added.
    '''
    by_country = df.groupby('Country')['pm25_mid']
    country_stats = (
        ('country_average_pm25_mid', by_country.mean()),  # per-country mean of the PM2.5 median
        ('country_max_pm25_mid', by_country.max()),       # per-country maximum of the PM2.5 median
        ('country_min_pm25_mid', by_country.min()),       # per-country minimum of the PM2.5 median
    )
    for column, stat in country_stats:
        df[column] = df['Country'].map(stat).astype('float64')

    return df
    
def add_feature_test(df, stats_df=None):
    '''
    Add per-country mean / max / min of ``pm25_mid`` to a frame without targets.

    The statistics come from ``stats_df``; countries missing from it get NaN.

    Args:
        df(pandas.DataFrame): frame with a ``Country`` column; modified in place.
        stats_df(pandas.DataFrame, optional): frame with ``Country`` and ``pm25_mid``
            columns to take the statistics from. Defaults to the module-level
            ``train_df``, which is what this function always used before.
    Returns:
        pandas.DataFrame: the same frame object with ``country_average_pm25_mid``,
        ``country_max_pm25_mid`` and ``country_min_pm25_mid`` added.
    '''
    if stats_df is None:
        stats_df = train_df

    by_country = stats_df.groupby('Country')['pm25_mid']
    # Per-country mean of the PM2.5 median
    df['country_average_pm25_mid'] = df['Country'].map(by_country.mean()).astype('float64')
    # Per-country maximum of the PM2.5 median
    df['country_max_pm25_mid'] = df['Country'].map(by_country.max()).astype('float64')
    # Per-country minimum of the PM2.5 median
    df['country_min_pm25_mid'] = df['Country'].map(by_country.min()).astype('float64')

    return df

In [9]:
# train_df, test_df = add_cluster(
#     train_df,
#     test_df,
#     n_clusters = 150,
#     linkage = 'single'
# )

In [10]:
# Add date / day-of-year, then the per-(Country, month) aggregate features.
train_df = add_feature_common(train_df)
test_df = add_feature_common(test_df)
train_df, test_df, group_df_columns = add_country_month_aggregation(train_df, test_df)

# Per-country target statistics, currently disabled.
# train_df = add_feature_train(train_df)
# test_df = add_feature_test(test_df)

# Re-cast Country to category; presumably the concat/merge above does not keep that dtype - verify.
train_df['Country'] = train_df['Country'].astype('category')
test_df['Country'] = test_df['Country'].astype('category')

  df = df.drop(["level_0","level_1"],axis=0)


In [11]:
# Columns fed to the models; commented-out entries are deliberately excluded.
features = [
#    'id',
#    'year',
#    'month',
#    'day',
    'Country',
#    'City',
#    'cluster',
    'lat',
    'lon',
    'co_cnt',
    'co_min',
    'co_mid',
    'co_max',
    'co_var',
    'o3_cnt',
    'o3_min',
    'o3_mid',
    'o3_max',
    'o3_var',
    'so2_cnt',
    'so2_min',
    'so2_mid',
    'so2_max',
    'so2_var',
    'no2_cnt',
    'no2_min',
    'no2_mid',
    'no2_max',
    'no2_var',
    'temperature_cnt',
    'temperature_min',
    'temperature_mid',
    'temperature_max',
    'temperature_var',
    'humidity_cnt',
    'humidity_min',
    'humidity_mid',
    'humidity_max',
    'humidity_var',
    'pressure_cnt',
    'pressure_min',
    'pressure_mid',
    'pressure_max',
    'pressure_var',
    'ws_cnt',
    'ws_min',
    'ws_mid',
    'ws_max',
    'ws_var',
    'dew_cnt',
    'dew_min',
    'dew_mid',
    'dew_max',
    'dew_var',
    #'pm25_mid',
    
    # additional feature
    #'hemisphere', # has no effect at all.
    'totaldate',
    #'country_average_pm25_mid', # has little effect.
    #'country_max_pm25_mid',
    #'country_min_pm25_mid',
    #'country_max_co_mid',
    #'country_min_co_mid',
    
    # aggregate columns returned by add_country_month_aggregation
    *group_df_columns,

]
mlflow.log_param('features', features)

In [12]:
# Feature matrix and target (pm25_mid) used for training.
X = train_df[features]
Y = train_df['pm25_mid']

In [13]:
X.shape

(195941, 193)

## To use or not use pretrained model

In [14]:
# Optionally restore previously pickled fold models instead of training.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
if Const.do_use_saved_lgbmmodel:
    modelname = 'model_id_bbd6408b112a45a383ef7543aae38945.pickle'
    path = f'models/lgbm/{modelname}'
    with open(path, 'rb') as f:
        models_lgbm = pickle.load(f)

if Const.do_use_saved_tabnetmodel:
    # modelname = 'model_id_7f4d511735a7449fb9a4b16e35be8773.pickle'
    modelname = 'model_id_d466589a2fb3443eb2bbb83be454db84.pickle'
    path = f'models/tabnet/{modelname}'
    with open(path, 'rb') as f:
        models_tabnet = pickle.load(f)

---
## training with LightGBM

In [15]:
# Hyper-parameter tuning
def opt_lgbm(trial):
    '''
    Optuna objective: mean K-fold RMSE of a LightGBM regressor on the
    module-level ``X`` / ``Y``.

    Tuned parameters: num_leaves, bagging_fraction, feature_fraction,
    lambda_l1, lambda_l2. Everything else is fixed.

    Args:
        trial(optuna.trial.Trial): trial used to sample the hyper-parameters.
    Returns:
        float: RMSE averaged over ``Const.hypara_cv_n_splits_lgbm`` folds.

    Ref:
        https://datadriven-rnd.com/lightgbm/
        https://kiroka-camp.com/lightgbm-optuna
        https://meknowledge.jpn.org/2021/05/28/lightgbm-optuna-tuning/
        https://knknkn.hatenablog.com/entry/2021/06/29/125226#learning_rate
        https://knknkn.hatenablog.com/entry/2021/06/14/160302
    '''
    mlflow.log_param('Const.hypara_cv_n_splits_lgbm', Const.hypara_cv_n_splits_lgbm)
    # The sampling order is kept so that a seeded sampler draws the same values.
    num_leaves = trial.suggest_int('num_leaves', 2, 1024)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.4, 1.0)
    feature_fraction = trial.suggest_float('feature_fraction', 0.4, 1.0)
    # suggest_float(log=True) is the non-deprecated spelling of suggest_loguniform.
    lambda_l1 = trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True)
    lambda_l2 = trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True)
    # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq > 0;
    # bagging_freq is not set here - confirm whether it should be.
    param = {
        'objective': 'regression',
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'num_leaves': num_leaves,
        'bagging_fraction': bagging_fraction,
        'feature_fraction': feature_fraction,
        'n_estimators': 10000,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
    }
    scores = []
    # gkf = GroupKFold(n_splits=Const.hypara_cv_n_splits_lgbm)
    # NOTE(review): this KFold is not shuffled, whereas training_with_lgbm shuffles
    # its folds - confirm the difference is intended.
    gkf = KFold(n_splits=Const.hypara_cv_n_splits_lgbm)
    #for train_idx, valid_idx in gkf.split(X, Y, groups=train_df['City']):
    for train_idx, valid_idx in gkf.split(X):
        hypara_estimator = lgb.LGBMRegressor(**param)
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]
        hypara_estimator.fit(
            x_train, y_train,
            eval_set = [(x_train, y_train), (x_valid, y_valid)],
            early_stopping_rounds = 10,
            feature_name = features,
            verbose=1000
        )
        y_pred = hypara_estimator.predict(x_valid)
        scores.append(np.sqrt(metrics.mean_squared_error(y_valid, y_pred)))
    return np.array(scores).mean()

def tuning_hypara_lgbm(n_trials=100, seed=0):
    '''
    Run an Optuna random search over ``opt_lgbm`` and return the best parameters.

    Args:
        n_trials(int): number of trials to run (default 100, the previous fixed value).
        seed(int): seed of the ``RandomSampler`` (default 0, the previous fixed value).
    Returns:
        dict: ``study.best_params`` of the minimising trial.
    '''
    sampler = optuna.samplers.RandomSampler(seed=seed)
    study = optuna.create_study(sampler=sampler, direction='minimize')
    study.optimize(opt_lgbm, n_trials=n_trials)
    return study.best_params
    
# This part could perhaps be moved further down.
# Run the hyper-parameter search only when enabled, and record the result in mlflow.
mlflow.log_param('Const.do_tuning_hypara_lgbm', Const.do_tuning_hypara_lgbm)
if Const.do_tuning_hypara_lgbm:
    bestparam = tuning_hypara_lgbm()
    print('bestparam =', bestparam)
    mlflow.log_param('bestparam', bestparam)

In [16]:
# Choose the LightGBM parameters: a recorded best set when enabled, otherwise None (defaults).
mlflow.log_param('Const.do_use_bestparam_lgbm', Const.do_use_bestparam_lgbm)
if Const.do_use_bestparam_lgbm:
    #bestparam = {'num_leaves': 503, 'bagging_fraction': 0.9936458663706589, 'feature_fraction': 0.43918252429106813, 'lambda_l1': 0.11197213132290361, 'lambda_l2': 3.940818887528048e-06}
    #bestparam = {'num_leaves': 379, 'bagging_fraction': 0.8925959379087611, 'feature_fraction': 0.4582607654758368, 'lambda_l1': 0.34793984646334997, 'lambda_l2': 7.326316408987164e-08}
    
    # Switched to KFold. Before the Country/month aggregate features were added.
    #bestparam = {'num_leaves': 327, 'bagging_fraction': 0.800446227978209, 'feature_fraction': 0.4790787174426353, 'lambda_l1': 0.027986258230304234, 'lambda_l2': 4.0239709475948286e-06}

    # Hyper-parameter search after adding the Country/month aggregate features. 100 trials were
    # requested but the kernel went down at trial 35; the best value among those was selected.
    bestparam = {'num_leaves': 379, 'bagging_fraction': 0.8925959379087611, 'feature_fraction': 0.4582607654758368, 'lambda_l1': 0.34793984646334997, 'lambda_l2': 7.326316408987164e-08}
    pass
else:
    bestparam = None

In [17]:
print(bestparam)

None


In [18]:
def training_with_lgbm(X, Y, bestparam=None):
    '''
    Train one LightGBM regressor per shuffled K-fold split.

    Args:
        X(pandas.DataFrame): feature matrix; its rows are assumed to be in the
            same order as the module-level ``train_df``.
        Y(pandas.Series): target values aligned with ``X``.
        bestparam(dict, optional): tuned hyper-parameters. Used only when
            ``Const.do_use_bestparam_lgbm`` is set; they are layered on top of
            the base parameters, so objective, learning rate and number of
            estimators stay the same as during tuning instead of silently
            falling back to the LightGBM defaults.
    Returns:
        models(list): fitted model of every fold.
        cvscores(list): validation RMSE of every fold.
        vis_valid_df(pandas.DataFrame): id / date / location columns of the
            validation rows with the true and predicted ``pm25_mid``.
    '''
    mlflow.log_param('Const.cv_n_splits_lgbm', Const.cv_n_splits_lgbm)

    param = {
        'objective': 'regression',
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'n_estimators': 10000,
        'lambda_l1': 1.0,
        'lambda_l2': 0.0,
    }
    if Const.do_use_bestparam_lgbm and bestparam is not None:
        param = {**param, **bestparam}
        mlflow.log_param('bestparam', bestparam)
    else:
        mlflow.log_param('param', param)

    meta_cols = ['id', 'year', 'month', 'day', 'Country', 'City']
    models = []
    cvscores = []
    valid_frames = []

    #gkf = GroupKFold(n_splits=Const.cv_n_splits_lgbm)
    gkf = KFold(n_splits=Const.cv_n_splits_lgbm, random_state=42, shuffle=True)
    #for n_fold, (train_idx, valid_idx) in enumerate(gkf.split(X, Y, groups=train_df['City'])):
    for n_fold, (train_idx, valid_idx) in enumerate(gkf.split(X)):
        print('n_fold =', n_fold)

        model = lgb.LGBMRegressor(**param)
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]

        # NOTE(review): LightGBM >= 4.0 replaced the verbose / early_stopping_rounds
        # fit() keywords with callbacks - confirm the installed version.
        model.fit(
            x_train, y_train,
            eval_set = [(x_train, y_train), (x_valid, y_valid)],
            verbose=1000,
            early_stopping_rounds=10,
        )

        y_valid_predict = model.predict(x_valid)
        cvscore = np.sqrt(metrics.mean_squared_error(y_valid, y_valid_predict))

        # KFold yields positions, so the metadata rows are selected with iloc
        # (label-based loc is only right while the index is 0..n-1).
        valid_df = train_df.iloc[valid_idx][meta_cols].copy()
        valid_df['pm25_mid'] = y_valid.to_numpy()
        valid_df['pm25_mid_predict'] = y_valid_predict

        valid_frames.append(valid_df)
        models.append(model)
        cvscores.append(cvscore)

    # Concatenate once; growing a frame inside the loop is quadratic and
    # degrades the dtypes through the empty starting frame.
    vis_valid_df = pd.concat(valid_frames)
    return models, cvscores, vis_valid_df

In [19]:
X['Country']

0             Australia
1             Australia
2             Australia
3             Australia
4             Australia
              ...      
195936    United States
195937    United States
195938    United States
195939          Vietnam
195940          Vietnam
Name: Country, Length: 195941, dtype: category
Categories (30, object): ['Australia', 'Belgium', 'Bosnia and Herzegovina', 'Brazil', ..., 'Turkey', 'United Kingdom', 'United States', 'Vietnam']

In [20]:
# Train the LightGBM fold models and log every fold's RMSE plus their mean to mlflow.
mlflow.log_param('Const.do_training_lgbm', Const.do_training_lgbm)
if Const.do_training_lgbm:
    models_lgbm, cvscores, vis_valid_df = training_with_lgbm(X, Y, bestparam)
    average = np.array(cvscores).mean()
    print('cvscores_lgbm =', cvscores)
    print('average =', average)
    for i in range(len(cvscores)):
        mlflow.log_metric(f'cvscores_lgbm{i}', cvscores[i])
    mlflow.log_metric('average', average)
    
    # Results of earlier runs:
    #average = 22.12377600122617 folds = 20 learning_rate = 0.1
    #average = 22.023999283119117 folds = 10 learning_rate = 0.05 <- used as the baseline.
    #average = 22.183219604203924 folds = 20 learning_rate = 0.05
    # average = 21.895404967428846 fold= 10 learning_rate = 0.05 with hyper-parameter tuning.

n_fold = 0




Training until validation scores don't improve for 10 rounds
[1000]	training's rmse: 17.0204	valid_1's rmse: 19.4133
Early stopping, best iteration is:
[1128]	training's rmse: 16.7401	valid_1's rmse: 19.3706
n_fold = 1




Training until validation scores don't improve for 10 rounds
[1000]	training's rmse: 17.0136	valid_1's rmse: 19.4991
Early stopping, best iteration is:
[1094]	training's rmse: 16.8072	valid_1's rmse: 19.4601
n_fold = 2




Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[988]	training's rmse: 17.0429	valid_1's rmse: 19.4097
n_fold = 3




Training until validation scores don't improve for 10 rounds
[1000]	training's rmse: 17.0384	valid_1's rmse: 19.3859
Early stopping, best iteration is:
[1300]	training's rmse: 16.3841	valid_1's rmse: 19.2753
n_fold = 4




Training until validation scores don't improve for 10 rounds
[1000]	training's rmse: 17.0205	valid_1's rmse: 19.2818
Early stopping, best iteration is:
[1593]	training's rmse: 15.8253	valid_1's rmse: 19.1087
n_fold = 5




Training until validation scores don't improve for 10 rounds
[1000]	training's rmse: 17.0038	valid_1's rmse: 19.6445
Early stopping, best iteration is:
[1290]	training's rmse: 16.3832	valid_1's rmse: 19.5635
n_fold = 6




Training until validation scores don't improve for 10 rounds
[1000]	training's rmse: 17.025	valid_1's rmse: 19.4003
Early stopping, best iteration is:
[1106]	training's rmse: 16.7828	valid_1's rmse: 19.3574
n_fold = 7




Training until validation scores don't improve for 10 rounds
[1000]	training's rmse: 16.9943	valid_1's rmse: 19.6692
Early stopping, best iteration is:
[1414]	training's rmse: 16.1469	valid_1's rmse: 19.528
n_fold = 8




Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[646]	training's rmse: 17.9574	valid_1's rmse: 19.5464
n_fold = 9




Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[903]	training's rmse: 17.2703	valid_1's rmse: 19.4685
cvscores_lgbm = [19.370598761366786, 19.460137896887712, 19.409679971584257, 19.27526875106481, 19.108663827559635, 19.563459633336613, 19.35737174025195, 19.527986871310404, 19.546391352612783, 19.468511503757963]
average = 19.408807030973286


In [21]:
# Save the fold models with pickle; the file name carries the mlflow run_id.
if Const.do_save_lgbmmodel:
    savepath = f'models/lgbm/model_id_{run_id}.pickle'
    with open(savepath, 'wb') as f:
        pickle.dump(models_lgbm , f)

## 可視化

In [22]:
class Visualization:
    '''
    Plot helpers for inspecting validation results and feature importance.
    '''

    def vis_result_varidation(self, df, Country, City):
        '''
        Visualise the validation result: for the given country and city,
        plot the true ``pm25_mid`` against the validation prediction.

        Args:
            df(pandas.DataFrame): frame to plot; needs the ``Country``, ``City``,
                ``pm25_mid`` and ``pm25_mid_predict`` columns.
            Country(string): country name
            City(string): city name
        '''
        cond = (df['Country'] == Country) & (df['City'] == City)
        df = df[cond]
        fig, ax = plt.subplots(1, 1, figsize=(30, 8))
        ax.set_title(f'{Country}, {City}')
        # Label both curves so truth and prediction can be told apart.
        ax.plot(df['pm25_mid'], label='pm25_mid')
        ax.plot(df['pm25_mid_predict'], label='pm25_mid_predict')
        ax.set_xlabel('row index')
        ax.set_ylabel('pm25_mid')
        ax.legend()
        plt.show()

    def vis_feature_importance(self, models):
        '''
        Plot the feature importance of every model, one figure per model.

        Args:
            models(list): fitted models accepted by ``lgb.plot_importance``.
        '''
        for model in models:
            lgb.plot_importance(model, figsize=(30, 40))
            plt.show()


In [23]:
v = Visualization()
# v.vis_result_varidation(vis_valid_df, 'Australia', 'Brisbane')
# v.vis_result_varidation(vis_valid_df, 'China', 'Beijing')
# v.vis_result_varidation(vis_valid_df, 'Japan', 'Yokohama')
# v.vis_feature_importance(models_lgbm)

## pseudo labeling

In [24]:
def pseudo_labeling(models, n_repeat=1, ):
    '''
    Run pseudo labeling and build new models from the result.

    NOTE: this function is unfinished. It averages the fold predictions on the
    test set, displays the intermediate frames and then stops at ``assert False``;
    the statements after the assert are never reached, ``n_repeat`` is not used
    and nothing is returned yet.
    
    Args:
        models(list): models before pseudo labeling.
        n_repeat(int): number of times pseudo labeling is to be repeated.
    Returns:
        models(list): models trained on the pseudo-labelled frame (intended; not implemented).

    '''
    # Plain inference: average the fold predictions to get y_pred.
    y_pred = 0
    for i in range(len(models)):
        x_test = test_df[features]
        y_pred += models[i].predict(x_test)
    y_pred /= len(models)    
    
    # Append x_test and y_pred to the training data.
    
    print('[y_pred]')
    display(y_pred)
    
    print('train_df[features]')
    display(train_df[features])
    print()
    print('test_df[features]')
    display(test_df[features])
    print()
    print('X')
    
    X = pd.DataFrame()
    X = pd.concat([train_df[features], test_df[features]]) # a reset_index is needed.
    
    display(X)
    

    # Deliberate stop: everything below is work in progress.
    assert False

    Y = pd.DataFrame()
    Y = pd.concat(train_df['pm25_mid'], )
    
    Y = train_df['pm25_mid']
    
    # Run CV on the newly built training set to create the models.
    # (Is a hyper-parameter search needed here as well?)
    
    
    # Repeat this n_repeat times.
    # Return models.
    

# pseudo_labeling(models)
    

## predict with LightGBM

In [25]:
def predict_lgbm(models, x_test):
    '''
    Predict with the models trained by LightGBM and average over the folds.

    Args:
        models(list): fitted fold models, each exposing ``predict``.
        x_test(pandas.DataFrame): features to predict on. This argument is now
            actually used; it was previously overwritten with the global
            ``test_df[features]`` inside the loop.
    Returns:
        numpy.ndarray: mean of the per-model predictions.
    Raises:
        ValueError: if ``models`` is empty.
    '''
    if len(models) == 0:
        raise ValueError('predict_lgbm needs at least one model')
    y_pred = 0
    for model in models:
        y_pred += model.predict(x_test)
    y_pred /= len(models)
    return y_pred

In [26]:
# Predict on the test features with the LightGBM fold models when enabled.
mlflow.log_param('Const.do_predict_lgbm', Const.do_predict_lgbm)
if Const.do_predict_lgbm:
    x_test = test_df[features]
    y_pred_lgbm = predict_lgbm(models_lgbm, x_test)

---
## training with TabNet

In [27]:
# train_df = tabnet_preprocessing(train_df)
# test_df = tabnet_preprocessing(test_df)
# X = train_df[features]
# Y = train_df['pm25_mid']

# train
# Copy the feature slice before encoding: label encoding assigns columns, and doing
# that on a slice of train_df raises SettingWithCopyWarning.
X = train_df[features].copy()
X = tabnet_preprocessing(X)
Y = train_df['pm25_mid']


Country


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = l_enc.fit_transform(df[col].values)


In [28]:
def opt_tabnet(trial):
    '''
    Placeholder for the TabNet Optuna objective; not implemented (returns None).
    '''
    pass

In [29]:
mlflow.log_param('Const.do_use_bestparam_tabnet', Const.do_use_bestparam_tabnet)
# Branch on the TabNet flag (the LightGBM flag was tested here by mistake).
if Const.do_use_bestparam_tabnet:
    # No tuned TabNet parameters have been recorded yet.
    bestparam = None
else:
    bestparam = None

In [30]:
def pretraining(X, Y):
    '''
    Pretraining: fit a self-supervised ``TabNetPretrainer`` on a 70/30 split of ``X``.

    Args:
        X(pandas.DataFrame): numeric feature matrix.
        Y: targets; they only take part in the split and are not used for fitting.
    Returns:
        TabNetPretrainer: the fitted pretrainer.
    '''
    x_train, x_valid, _, _ = train_test_split(X, Y, test_size=0.3, random_state=0)

    pretrainer = TabNetPretrainer(
        optimizer_fn=torch.optim.Adam,
        optimizer_params={'lr': 2e-2},
        mask_type='entmax',  # "sparsemax"
    )
    pretrainer.fit(
        X_train=x_train.values.astype(np.float64),
        eval_set=[x_valid.values.astype(np.float64)],
        pretraining_ratio=0.8,
    )
    return pretrainer


In [31]:
# Run the self-supervised TabNet pretraining only when enabled.
mlflow.log_param('Const.do_pretraining', Const.do_pretraining)
if Const.do_pretraining:
    unsupervised_model = pretraining(X, Y)

In [32]:
def training_with_tabnet(X, Y, bestparam=None, from_unsupervised=None):
    '''
    Train one TabNet regressor per shuffled K-fold split.

    Args:
        X(pandas.DataFrame): numeric (already label-encoded) feature matrix; its
            rows are assumed to be in the same order as the module-level ``train_df``.
        Y(pandas.Series): target values aligned with ``X``.
        bestparam(dict, optional): keyword arguments for ``TabNetRegressor``. Used
            only when ``Const.do_use_bestparam_tabnet`` is set; None gives the
            library defaults in that case.
        from_unsupervised(optional): pretrained model handed to ``fit``; None
            trains from scratch. This argument is now actually used; it was
            previously ignored in favour of the global ``unsupervised_model``.
    Returns:
        models(list): fitted model of every fold.
        cvscores(list): validation RMSE of every fold.
        vis_valid_df(pandas.DataFrame): id / date / location columns of the
            validation rows with the true and predicted ``pm25_mid``.
    '''
    meta_cols = ['id', 'year', 'month', 'day', 'Country', 'City']
    models = []
    cvscores = []
    valid_frames = []

    mlflow.log_param('Const.cv_n_splits_tabnet', Const.cv_n_splits_tabnet)
    # gkf = GroupKFold(n_splits=Const.cv_n_splits_tabnet)
    gkf = KFold(n_splits=Const.cv_n_splits_tabnet, random_state=40, shuffle=True)
    #for n_fold, (train_idx, valid_idx) in enumerate(gkf.split(X, Y, groups=train_df['City'])):
    for n_fold, (train_idx, valid_idx) in enumerate(gkf.split(X)):
        print('n_fold =', n_fold)

        if Const.do_use_bestparam_tabnet:
            model = TabNetRegressor(**(bestparam or {}))
        else:
            tabnet_params = {
                'optimizer_fn': torch.optim.Adam,
                'optimizer_params': {'lr': 2e-2},
                'n_steps': 3,
                'scheduler_params': {'step_size': 10, 'gamma': 0.9},
                'scheduler_fn': torch.optim.lr_scheduler.StepLR,
            }
            mlflow.log_param('tabnet_params', tabnet_params)
            model = TabNetRegressor(**tabnet_params)

        x_train = X.iloc[train_idx].values.astype(np.float64)
        x_valid = X.iloc[valid_idx].values.astype(np.float64)
        # The targets are reshaped to 2-D (n, 1) before fitting.
        y_train = Y.iloc[train_idx].values.reshape(-1, 1).astype(np.float64)
        y_valid = Y.iloc[valid_idx].values.reshape(-1, 1).astype(np.float64)

        model.fit(
            x_train, y_train,
            eval_set = [(x_train, y_train), (x_valid, y_valid)],
            eval_metric=['rmse'],
            from_unsupervised=from_unsupervised
        )

        y_valid_predict = model.predict(x_valid)
        cvscore = np.sqrt(metrics.mean_squared_error(y_valid, y_valid_predict))

        # KFold yields positions, so the metadata rows are selected with iloc
        # (label-based loc is only right while the index is 0..n-1).
        valid_df = train_df.iloc[valid_idx][meta_cols].copy()
        # Flatten to 1-D before storing as columns.
        valid_df['pm25_mid'] = y_valid.ravel()
        valid_df['pm25_mid_predict'] = np.asarray(y_valid_predict).ravel()

        valid_frames.append(valid_df)
        models.append(model)
        cvscores.append(cvscore)

    # Concatenate once instead of growing a frame inside the loop.
    vis_valid_df = pd.concat(valid_frames)
    return models, cvscores, vis_valid_df

In [33]:
mlflow.log_param('Const.do_training_tabnet', Const.do_training_tabnet)
if Const.do_training_tabnet:
    # Hand over the pretrained model only when pretraining ran in this session.
    pretrained = unsupervised_model if Const.do_pretraining else None
    models_tabnet, cvscores, vis_valid_df = training_with_tabnet(X, Y, bestparam, pretrained)
    average = np.array(cvscores).mean()
    print('cvscores_tabnet =', cvscores)
    print('average =', average)
    for i in range(len(cvscores)):
        mlflow.log_metric(f'cvscores_tabnet{i}', cvscores[i])
    # Model-specific key, like the per-fold metrics, so this mean does not share
    # the plain 'average' key with another model trained in the same run.
    mlflow.log_metric('average_tabnet', average)
    

In [34]:
# Feature importance with TabNet
def feature_importance_tabnet(models):
    '''
    Display the ``feature_importances_`` array of every model in ``models``.
    '''
    for importances in (model.feature_importances_ for model in models):
        display(importances)

# feature_importance_tabnet(models_tabnet)

In [35]:
# Save the fold models with pickle; the file name carries the mlflow run_id.
if Const.do_save_tabnetmodel:
    savepath = f'models/tabnet/model_id_{run_id}.pickle'
    with open(savepath, 'wb') as f:
        pickle.dump(models_tabnet , f)

## predict with TabNet

In [36]:
def predict_tabnet(models, x_test):
    '''
    Predict with the trained TabNet fold models and average the results.

    Args:
        models(list): one model per fold
        x_test(numpy.array): data to predict on; must be a numpy array with dtype=np.float64.
    Returns:
        Mean of the per-model predictions.
    '''
    total = 0
    for model in models:
        total += model.predict(x_test)
    total /= len(models)
    return total

In [37]:
# Predict on the test features with the TabNet fold models when enabled.
mlflow.log_param('Const.do_predict_tabnet', Const.do_predict_tabnet)
if Const.do_predict_tabnet:
    # NOTE(review): the test frame is label-encoded in its own call, separately from the
    # training frame; the codes only agree if both contain the same category values - verify.
    x_test = tabnet_preprocessing(test_df[features]).values.astype(np.float64)
    y_pred_tabnet = predict_tabnet(models_tabnet, x_test)
    display(y_pred_tabnet)

In [38]:
# y_pred_tabnet
# array([[21.556034],
#        [45.753437],
#        [33.175117],
#        ...,
#        [86.51942 ],
#        [36.285927],
#        [41.747646]], dtype=float32)

---
## Ensemble

In [39]:
def ensemble(y_pred_lgbm=None, y_pred_tabnet=None, weight_lgbm=None, weight_tabnet=None):
    '''
    Combine the LightGBM and TabNet predictions.

    With both predictions the result is the weighted sum
    ``weight_lgbm*y_pred_lgbm + weight_tabnet*y_pred_tabnet``; with only one of
    them that prediction is returned as is (weights are ignored). TabNet
    predictions are flattened to 1-D first.

    Args:
        y_pred_lgbm(numpy.ndarray, optional): LightGBM predictions.
        y_pred_tabnet(numpy.ndarray, optional): TabNet predictions.
        weight_lgbm(float, optional): weight of the LightGBM predictions.
        weight_tabnet(float, optional): weight of the TabNet predictions.
    Returns:
        numpy.ndarray: the combined prediction.
    Raises:
        ValueError: if no prediction is given, or if both predictions are given
            without both weights.
    '''
    has_lgbm = y_pred_lgbm is not None
    has_tabnet = y_pred_tabnet is not None
    if not has_lgbm and not has_tabnet:
        raise ValueError('ensemble needs y_pred_lgbm and/or y_pred_tabnet')

    if has_tabnet:
        y_pred_tabnet = y_pred_tabnet.reshape(-1)

    if has_lgbm and has_tabnet:
        if weight_lgbm is None or weight_tabnet is None:
            raise ValueError('weight_lgbm and weight_tabnet are required when both predictions are given')
        return weight_lgbm*y_pred_lgbm + weight_tabnet*y_pred_tabnet
    return y_pred_lgbm if has_lgbm else y_pred_tabnet

In [40]:
# Blend whichever predictions were produced; a missing model is passed as None.
if Const.do_predict_lgbm or Const.do_predict_tabnet:
    mlflow.log_param('Const.weight_lgbm', Const.weight_lgbm)
    mlflow.log_param('Const.weight_tabnet', Const.weight_tabnet)
    y_pred_ensemble = ensemble(
        y_pred_lgbm = y_pred_lgbm, 
        y_pred_tabnet = y_pred_tabnet,
        weight_lgbm = Const.weight_lgbm,
        weight_tabnet = Const.weight_tabnet
    )
    display(y_pred_ensemble)

array([18.37494306, 36.42463268, 26.50811286, ..., 65.03478816,
       35.72041752, 38.73329297])

In [41]:
Const.do_predict_lgbm

True

## submission

In [42]:
if Const.do_submit:
    # Without any prediction there is nothing to submit; fail with a clear message
    # instead of a NameError on y_pred_ensemble / save_filename.
    if not (Const.do_predict_lgbm or Const.do_predict_tabnet):
        raise RuntimeError('Const.do_submit requires Const.do_predict_lgbm or Const.do_predict_tabnet')

    submission_df = pd.DataFrame({'id': test_df['id'], 'pm25_mid': y_pred_ensemble})
    display(submission_df)

    # Name the file after the model(s) that produced the prediction.
    if Const.do_predict_lgbm and Const.do_predict_tabnet:
        model_tag = 'ensemble'
    elif Const.do_predict_lgbm:
        model_tag = 'lgbm'
    else:
        model_tag = 'tabnet'
    save_filename = f'02_forSubmission/prediction_{model_tag}_No_{run_id}.csv'

    # Written without header and index, then attached to the mlflow run.
    submission_df.to_csv(save_filename, header=False, index=False)
    mlflow.log_artifact(save_filename)

Unnamed: 0,id,pm25_mid
195941,195942,18.374943
195942,195943,36.424633
195943,195944,26.508113
195944,195945,63.742432
195945,195946,129.244098
...,...,...
249445,249446,70.990942
249446,249447,97.478673
249447,249448,65.034788
249448,249449,35.720418


In [43]:
mlflow.end_run()

## memo

- `Country`及び`City`をcategoricalにして特徴量に入れる場合。
    - cvscores = [31.168381258519616, 23.62248117843837, 25.045370727445352, 24.926670022160398, 26.691971608354752]
    - average = 26.290974958983696
- `Country`を特徴量に入れ、`City`は特徴量に入れない場合。
    - cvscores = [24.50825768668205, 21.316029448606567, 22.10093074317413, 21.912565563021566, 22.08565642883814]
    - average = 22.384687974064487
- `Country`及び`City`を共に特徴量に入れない場合。
    - cvscores = [25.003064401106673, 22.049348169514104, 21.934097643562655, 22.29729700536064, 22.38822735801613]
    - average = 22.73440691551204

- `City`とcnt系を全て削除した場合。
    - cvscores = [24.23733196068929, 21.123747159455178, 22.13491161397876, 21.832330076220718, 22.229381039283844]
    - average = 22.31154036992556
- 更に、`year`を`int64`から`category`に変換した。
    - cvscores = [24.36687527029628, 21.106846159060602, 22.053054338444102, 21.826410103971394, 22.33575695373084]
    - average = 22.337788565100645


## ideas & todos

- それぞれのcntをどうやって使うか？
- monthとdayは通算日を使ったほうがいい？
- yearはint64とせず、categoricalとしたほうがいい。-> あんまり変わらん。
- cntは消さない方がいいかも。
- 年を通算日とした。
- pseudo labelingの勉強と、お試し。
- pytorch lightningの結果とensemble
- cvスコアが22を切り出すと、MLflowで実験管理をする。
- Feature importanceの確認
- optunaでハイパラのベイズ最適化
- tabnetを使ってみる。
- tabnetが使えそうであれば、ensemble
- ケッペンの気候区分の特徴量を入れてみる。

In [44]:
print(y_pred_lgbm)
print(y_pred_tabnet)
print(y_pred_ensemble)

[18.37494306 36.42463268 26.50811286 ... 65.03478816 35.72041752
 38.73329297]
None
[18.37494306 36.42463268 26.50811286 ... 65.03478816 35.72041752
 38.73329297]
