In [1]:
# masked pseudo labeling [preprocess]

In [2]:
from google.colab import drive, files
import os

# Mount Google Drive so the project directory is reachable from this runtime.
drive.mount('/content/drive')  # mount Google Drive
COLAB = "/content/drive/MyDrive/studentcup-2021-spring"  # project path on Colaboratory (change as needed)
# Make the project directory the working directory; later cells use paths relative to it.
os.chdir(COLAB)
# Install the extra packages that are imported further down (category_encoders, xfeat).
!pip install --quiet category_encoders
!pip install --quiet xfeat

Mounted at /content/drive
[K     |████████████████████████████████| 81kB 3.4MB/s 
[K     |████████████████████████████████| 296kB 8.0MB/s 
[K     |████████████████████████████████| 81kB 4.9MB/s 
[K     |████████████████████████████████| 1.2MB 8.5MB/s 
[K     |████████████████████████████████| 112kB 22.4MB/s 
[K     |████████████████████████████████| 143kB 13.0MB/s 
[K     |████████████████████████████████| 51kB 4.1MB/s 
[K     |████████████████████████████████| 81kB 6.2MB/s 
[?25h  Building wheel for ml-metrics (setup.py) ... [?25l[?25hdone
  Building wheel for alembic (setup.py) ... [?25l[?25hdone
  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone


In [3]:
import requests
import os

# make config
# Project sub-directories, all rooted at COLAB.
OUTPUT = os.path.join(COLAB, 'output')
INPUT = os.path.join(COLAB, 'input')
SUBMISSION = os.path.join(COLAB, 'submission')
# Everything written by this experiment lives under OUTPUT/EXP_NAME.
EXP_NAME = "run2"
EXP = os.path.join(OUTPUT, EXP_NAME)
PREDS = os.path.join(EXP, "preds")
TRAINED = os.path.join(EXP, "trained")
FEATURE = os.path.join(EXP, "feature")
REPORTS = os.path.join(EXP, "reports")

# make experiments environment
# Directories created below when missing. INPUT is not in the list, so it is never created here.
dirs = [OUTPUT,
        SUBMISSION,
        FEATURE,
        EXP,
        PREDS,
        TRAINED,
        REPORTS]

# Create each missing directory (including parents) and report it.
for v in dirs:
    if not os.path.isdir(v):
        print(f"making {v}")
        os.makedirs(v)

making /content/drive/MyDrive/studentcup-2021-spring/output/run2/feature
making /content/drive/MyDrive/studentcup-2021-spring/output/run2/preds
making /content/drive/MyDrive/studentcup-2021-spring/output/run2/trained
making /content/drive/MyDrive/studentcup-2021-spring/output/run2/reports


In [4]:
import datetime
import logging
import random

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
import joblib
from matplotlib_venn import venn2
from sklearn import model_selection
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

import itertools

from lightgbm import LGBMModel
import category_encoders as ce
import xfeat

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras import layers as L
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



In [5]:
# Fix the random seeds
def seed_everything(seed: int):
    """Seed every random number generator used by this notebook.

    Sets ``PYTHONHASHSEED`` in the process environment and seeds Python's
    ``random`` module, NumPy's global generator and TensorFlow.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)


seed_everything(46)

In [6]:
# Helper class for saving and loading models / arbitrary objects
class Util:
    """Thin joblib-based persistence helpers."""

    @classmethod
    def dump(cls, value, path):
        """Serialise ``value`` to ``path`` (compressed), creating the parent directory if needed."""
        parent_dir = os.path.dirname(path)
        os.makedirs(parent_dir, exist_ok=True)
        joblib.dump(value, path, compress=True)

    @classmethod
    def load(cls, path):
        """Load and return the object stored at ``path``.

        NOTE: joblib files are pickle-based; only load files you trust.
        """
        restored = joblib.load(path)
        return restored

# Logging helper class
class Logger:
    """Logger that writes timestamped messages to the console and to a file.

    Messages go to a stream handler and to ``<path>/Experiment.log``.
    The underlying ``logging`` logger is looked up by ``path``, so creating
    several ``Logger`` objects for the same path shares one logger.

    Args:
        path: Existing directory in which ``Experiment.log`` is written; also
            used as the name of the underlying logger.
    """

    def __init__(self, path):
        self.general_logger = logging.getLogger(path)
        # Build the handlers only when the logger has none yet. Creating the
        # FileHandler unconditionally would open (and never close) a new file
        # handle every time the class is instantiated again for the same path.
        if len(self.general_logger.handlers) == 0:
            stream_handler = logging.StreamHandler()
            file_general_handler = logging.FileHandler(os.path.join(path, 'Experiment.log'))
            self.general_logger.addHandler(stream_handler)
            self.general_logger.addHandler(file_general_handler)
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current time."""
        # display time
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

# Set up the logger; its log file is written inside the REPORTS directory.
logger = Logger(REPORTS)


## load data - pseudo labeling -

In [7]:
# Read the input CSV files from the INPUT directory.
train = pd.read_csv(INPUT+"/train.csv")
test = pd.read_csv(INPUT+"/test.csv")
sample_sub = pd.read_csv(INPUT+"/sample_submit.csv")
genre_labels = pd.read_csv(INPUT+"/genre_labels.csv")

In [8]:
# pseudo labeling augmentation
# Minimum predicted-class probability a test row needs to be added to train as a pseudo-labelled row.
threshold = 0.7

# Predictions saved under the run1 output directory (path is relative to the working directory).
path_1 =  "output/run1/preds"
preds_1 = Util.load(path_1 + "/preds.pkl")

# NOTE(review): assumes preds_1 has one row per test row, in test order, with one column per genre id - confirm.
pseudo = test.copy()
pseudo["genre"] = np.argmax(preds_1, axis=1)
pseudo["proba"] = np.max(preds_1, axis=1)

# flag used for cross pseudo labeling: 1 = pseudo-labelled test row, 0 = original training row
pseudo["flag"] = 1
train["flag"] = 0

# Keep only the confident rows and restrict them to train's columns.
pseudo = pseudo[pseudo["proba"] >= threshold][train.columns]

# augment
# NOTE(review): this rebinds `train`; re-running the cell without reloading the CSVs
# would reset every flag to 0 and append the pseudo rows a second time.
old = len(train)
train = pd.concat([train, pseudo]).reset_index(drop=True)
new = len(train)

logger.info(f"Psudo Labeling : {old} -> {new}")

[2021-05-09 05:13:27] - Psudo Labeling : 4046 -> 6624


## feature engineering

In [9]:
class GroupingEngine: 
    """Group-wise aggregation features keyed on a single column.

    ``agg_methods`` may mix ordinary aggregations (strings or callables passed
    to pandas ``groupby(...).agg``) with the two special row-level transforms
    ``"val-mean"`` and ``"z-score"``, which are handled in ``ex_transform``.
    Generated columns are named ``agg_<method>_<column>_grpby_<group_key>``.
    """

    def __init__(self, group_key, group_values, agg_methods):
        """Store the configuration and split ``agg_methods`` into the two kinds.

        Args:
            group_key: Name of the column to group by.
            group_values: List of column names to aggregate.
            agg_methods: List of aggregation names / callables, optionally
                including "val-mean" and "z-score".
        """
        self.group_key = group_key
        self.group_values = group_values  # debug

        # Methods that need the row's own value and therefore cannot be a plain groupby aggregation.
        ex_trans_methods = ["val-mean", "z-score"]
        self.ex_trans_methods = [m for m in agg_methods if m in ex_trans_methods]
        self.agg_methods = [m for m in agg_methods if m not in self.ex_trans_methods]
        # Per-group aggregation table built by fit(); stays None when there are no ordinary methods.
        self.df = None

    def fit(self, input_df, y=None):
        """Compute the per-group aggregation table from ``input_df`` and store it in ``self.df``.

        Does nothing when only "val-mean" / "z-score" were requested. ``y`` is unused.
        """
        if not self.agg_methods:
            return
            
        new_df = []
        for agg_method in self.agg_methods:

            for col in self.group_values:
                # Callables contribute their __name__ to the column name.
                if callable(agg_method):
                    agg_method_name = agg_method.__name__
                else:
                    agg_method_name = agg_method

                new_col = f"agg_{agg_method_name}_{col}_grpby_{self.group_key}"
                df_agg = (input_df[[col] + [self.group_key]].groupby(self.group_key)[[col]].agg(agg_method))
                df_agg.columns = [new_col]
                new_df.append(df_agg)
        # One row per group key, one column per (method, value column) pair.
        self.df = pd.concat(new_df, axis=1).reset_index()

    def transform(self, input_df):
        """Return the aggregation features for each row of ``input_df``.

        The fitted table is left-joined on the group key, the special
        transforms are appended, and the group key column itself is dropped
        from the result.
        """
        if self.agg_methods:
            output_df = pd.merge(input_df[[self.group_key]], self.df, on=self.group_key, how="left")
        else:
            output_df = input_df[[self.group_key]].copy()

        if len(self.ex_trans_methods) != 0:
            output_df = self.ex_transform(input_df, output_df)
        output_df.drop(self.group_key, axis=1, inplace=True)
        return output_df

    def ex_transform(self, df1, df2):
        """
        Add the "val-mean" and/or "z-score" features to ``df2``.

        The group statistics are recomputed here from ``df1`` (the frame passed
        to ``transform``), not taken from the table built in ``fit``.

        df1: input_df
        df2: output_df
        return: output_df (added ex transformed features)
        """
        # NOTE(review): assumes xfeat.aggregation returns the row-aligned frame first and
        # names its columns exactly as _get_col does - confirm against the xfeat version in use.

        if "val-mean" in self.ex_trans_methods:
            _agg_df = xfeat.aggregation(df1, 
                                        group_key=self.group_key,
                                        group_values=self.group_values, 
                                        agg_methods=["mean"])[0]
            # Deviation of each value from its group mean (assigned positionally via .values).
            df2[self._get_col("val-mean")] = df1[self.group_values].values - _agg_df[self._get_col("mean")].values

        if "z-score" in self.ex_trans_methods:
            _agg_df = xfeat.aggregation(df1, 
                                        group_key=self.group_key,
                                        group_values=self.group_values, 
                                        agg_methods=["mean", "std"])[0]
            # 1e-8 keeps the division finite when a group's std is zero.
            df2[self._get_col("z-score")] = ((df1[self.group_values].values - _agg_df[self._get_col("mean")].values) 
                                                / (_agg_df[self._get_col("std")].values + 1e-8))

        return df2

    def _get_col(self, method):
        """Return the generated column names for ``method``, one per value column."""
        return [f"agg_{method}_{group_val}_grpby_{self.group_key}" for group_val in self.group_values]

    def fit_transform(self, input_df, y=None):
        """Fit on ``input_df`` and return its transformed features."""
        self.fit(input_df, y=y)
        return self.transform(input_df)



class TargetEncodingEngine:
    """
    Target encoder for categorical columns with K-fold out-of-fold support.

    refer to https://github.com/nyk510/atmacup10

    ``create_mapping`` returns out-of-fold encodings for the fitting data and
    stores a full-data category -> target-mean mapping that ``transform``
    applies to any frame. Categories unseen during fitting are encoded with
    the global target mean.

    Args:
        use_columns: List of column names to encode.
        cv: Iterable of ``(train_positions, valid_positions)`` pairs, e.g. the
            result of ``KFold.split``. Positions are row positions, not labels.
    """

    def __init__(self, use_columns, cv):

        self.mapping_df_ = None
        self.y_mean_ = None
        self.use_columns = use_columns
        # Materialise the folds: `cv` may be a one-shot generator and is
        # iterated once per encoded column.
        self.cv = list(cv)
        self.n_fold = len(self.cv)

    def create_mapping(self, input_df, y):
        """Fit the mappings and return the out-of-fold encodings.

        Args:
            input_df: Frame containing ``use_columns``.
            y: Target values, one per row of ``input_df``.

        Returns:
            DataFrame (RangeIndex) with one out-of-fold encoded column per
            entry of ``use_columns``, named like the source column.
        """
        self.mapping_df_ = {}
        self.y_mean_ = np.mean(y)

        out_df = pd.DataFrame()
        # Fold indices are positional, so work on positionally indexed copies.
        # Label-based lookup would fail (or misalign) for a non-Range index.
        target = pd.Series(np.asarray(y))

        for col_name in self.use_columns:
            x = input_df[col_name].reset_index(drop=True)
            keys = x.unique()

            # `np.float` was removed from NumPy; the builtin float is equivalent.
            oof = np.zeros(len(x), dtype=float)

            for idx_train, idx_valid in self.cv:
                fold_means = target.iloc[idx_train].groupby(x.iloc[idx_train]).mean()
                # Categories absent from this training fold get the mean of the fold's category means.
                fold_means = fold_means.reindex(keys)
                fold_means = fold_means.fillna(fold_means.mean())
                oof[idx_valid] = x.iloc[idx_valid].map(fold_means.to_dict()).to_numpy()

            out_df[col_name] = oof

            # Full-data mapping used by transform().
            self.mapping_df_[col_name] = target.groupby(x).mean()

        return out_df

    def fit(self, input_df: pd.DataFrame, y=None, **kwargs) -> None:
        """Fit the full-data mappings; the out-of-fold encodings are discarded."""
        _ = self.create_mapping(input_df, y=y)

    def transform(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """Encode ``use_columns`` of ``input_df`` with the fitted full-data mapping.

        Returns:
            DataFrame with one ``TE_<column>`` column per encoded column;
            unseen categories are filled with the global target mean.
        """
        out_df = pd.DataFrame()

        for c in self.use_columns:
            out_df[c] = input_df[c].map(self.mapping_df_[c]).fillna(self.y_mean_)

        return out_df.add_prefix('TE_') 

    
    def fit_transform(self, input_df, y=None):
        """Fit on ``input_df`` and return ``transform(input_df)``.

        NOTE(review): this applies the full-data mapping to the fitting rows,
        not the out-of-fold values from ``create_mapping`` - confirm that is intended.
        """
        self.fit(input_df, y=y)
        return self.transform(input_df)   

In [10]:
# 実際に前処理をする関数を定義
def get_numerical_features(input_df):
    """Return a copy of the raw numeric columns, each renamed with an ``lgb__`` prefix."""
    # numeric features used as-is
    raw_numeric = (
        'popularity',
        'duration_ms',
        'acousticness',
        'positiveness',
        'danceability',
        'loudness',
        'energy',
        'liveness',
        'speechiness',
        'instrumentalness',
    )
    selected = input_df[list(raw_numeric)].copy()
    return selected.add_prefix("lgb__")


def get_ce_features(input_df):
    """Count-encode ``region`` and ``pop10region``.

    ``pop10region`` is the first character of the popularity value (as a
    string zero-padded to width 2) concatenated with ``region``.
    Output columns carry the ``lgb__CE_`` prefix.
    """
    # count-encoded features
    padded_popularity = input_df["popularity"].astype(str).str.zfill(2)
    leading_char = pd.Series([text[0] for text in padded_popularity])
    work_df = input_df.copy()
    work_df["pop10region"] = leading_char + input_df["region"]

    encode_columns = ["region", "pop10region"]
    counted = ce.CountEncoder().fit_transform(work_df[encode_columns])
    return counted.add_prefix("CE_").add_prefix("lgb__")


def get_oe_features(input_df):
    """Ordinal-encode ``region`` and ``pop10region``.

    ``pop10region`` is the first character of the popularity value (as a
    string zero-padded to width 2) concatenated with ``region``.
    Output columns carry the ``lgb__OE_`` prefix.
    """
    # ordinal-encoded (label-encoded) features
    padded_popularity = input_df["popularity"].astype(str).str.zfill(2)
    leading_char = pd.Series([text[0] for text in padded_popularity])
    work_df = input_df.copy()
    work_df["pop10region"] = leading_char + input_df["region"]

    encode_columns = ["region", "pop10region"]
    ordinal = ce.OrdinalEncoder().fit_transform(work_df[encode_columns])
    return ordinal.add_prefix("OE_").add_prefix("lgb__")


def get_tmpo_features(input_df):
    """Split the ``"low-high"`` tempo string into numeric bounds plus their difference.

    Returns columns ``lgb__tempo_low``, ``lgb__tempo_high`` and ``lgb__diff_tempo``.
    """
    # tempo-related features
    bounds = input_df["tempo"].str.split("-").apply(pd.Series).astype(float)
    bounds.columns = ["tempo_low", "tempo_high"]
    width = bounds["tempo_high"] - bounds["tempo_low"]
    features = bounds.assign(diff_tempo=width)
    return features.add_prefix("lgb__")


def get_binned_popularity_features(input_df):
    """Split popularity into its first and second character as integer columns.

    The value is converted to a string zero-padded to width 2; for values
    0-99 the two characters are the tens and ones digits.
    Returns columns ``lgb__popularity10`` and ``lgb__popularity01``.
    """
    # tens-digit and ones-digit features of popularity
    padded = input_df["popularity"].astype(str).str.zfill(2)
    digit_rows = []
    for text in padded:
        digit_rows.append([text[0], text[1]])
    digits = pd.DataFrame(digit_rows, columns=["popularity10", "popularity01"])
    return digits.astype(int).add_prefix("lgb__")

# used when building the aggregation features
def max_min(x):
    """Return the range of ``x`` (maximum minus minimum)."""
    highest, lowest = x.max(), x.min()
    return highest - lowest

def q75_q25(x):
    """Return the interquartile range of ``x`` (75th minus 25th percentile)."""
    upper_quartile = x.quantile(0.75)
    lower_quartile = x.quantile(0.25)
    return upper_quartile - lower_quartile


def get_agg_region_features(input_df):
    """Aggregate the numeric columns per ``region`` and return them with an ``lgb__`` prefix."""
    # aggregation features keyed on region
    tempo_features = get_tmpo_features(input_df)
    source_df = pd.concat([tempo_features, input_df], axis=1)

    value_columns = [
        'popularity',
        'duration_ms',
        'acousticness',
        'positiveness',
        'danceability',
        'loudness',
        'energy',
        'liveness',
        'speechiness',
        'instrumentalness',
    ]
    methods = ["min", "mean", "max", max_min, "z-score", "var", "skew", pd.DataFrame.kurt]

    engine = GroupingEngine(group_key="region",
                            group_values=value_columns,
                            agg_methods=methods)
    aggregated = engine.fit_transform(source_df)
    return aggregated.add_prefix("lgb__")


def get_agg_pop10region_features(input_df):
    """Aggregate the numeric columns per ``pop10region`` and return them with an ``lgb__`` prefix.

    ``pop10region`` is the first character of the popularity value (as a
    string zero-padded to width 2) concatenated with ``region``.
    """
    padded_popularity = input_df["popularity"].astype(str).str.zfill(2)
    leading_char = pd.Series([text[0] for text in padded_popularity])

    tempo_features = get_tmpo_features(input_df)
    source_df = pd.concat([input_df, tempo_features], axis=1)
    source_df["pop10region"] = leading_char + input_df["region"]

    value_columns = [
        'popularity',
        'duration_ms',
        'acousticness',
        'positiveness',
        'danceability',
        'loudness',
        'energy',
        'liveness',
        'speechiness',
        'instrumentalness',
    ]
    methods = ["min", "mean", "max", max_min, "z-score", "var", "skew", pd.DataFrame.kurt]

    engine = GroupingEngine(group_key="pop10region",
                            group_values=value_columns,
                            agg_methods=methods)
    aggregated = engine.fit_transform(source_df)
    return aggregated.add_prefix("lgb__")

def get_num_nan_features(input_df):
    """Count the missing feature values in each row.

    The target column ``genre`` and the pseudo-labelling marker ``flag`` are
    excluded when present. In the combined train+test frame built by this
    notebook both are missing on every test row, so counting them would shift
    the feature between train and test rows instead of describing the song.

    Args:
        input_df: Frame of raw columns; ``genre`` / ``flag`` are optional.

    Returns:
        DataFrame with the single integer column ``lgb__num_nan``.
    """
    non_feature_columns = ["genre", "flag"]
    feature_df = input_df.drop(columns=non_feature_columns, errors="ignore")
    output_df = pd.DataFrame()
    output_df["num_nan"] = feature_df.isnull().sum(axis=1)
    return output_df.add_prefix("lgb__")


def get_target_encode_features(input_df):
    """Build per-genre target-encoding features for ``region`` and ``pop10region``.

    For each genre a binary indicator target is target-encoded. The encoders
    are fitted on the rows whose ``genre`` is not null and then applied to
    every row of ``input_df``. Output columns are named
    ``lgb__TE_<column>=<genre name>``.
    """
    kf = model_selection.KFold(n_splits=10, random_state=2021, shuffle=True)
    # First character of the popularity value as a string zero-padded to width 2.
    tmp = input_df["popularity"].astype(str).str.zfill(2)
    tmp = pd.Series([i[0] for i in tmp])
    _input_df = input_df.copy()
    _input_df["pop10region"] = tmp + input_df["region"]

    # Only rows with a known genre are used to fit the encoders.
    train_df = _input_df[_input_df["genre"].notnull()]
    train_y = ce.OneHotEncoder().fit_transform(train_df["genre"].astype(str))
    genre = ['country',
             'electronic',
             'folk',
             'hip-hop',
             'jazz',
             'latin',
             'classic',
             'other-light-music',
             'pop',
             'religious',
             'rock']
    # NOTE(review): the names are assigned positionally to whatever column order the
    # one-hot encoder emits - confirm that order matches the genre label ids.
    train_y.columns = genre
    out_lst = []
    for col in genre:
        _y = train_y[col]
        # A fresh split generator is created for every genre; the encoder materialises it.
        encoder = TargetEncodingEngine(use_columns=["region", "pop10region"],
                                       cv=kf.split(train_df, _y))
        encoder.fit(input_df=train_df, y=_y)

        out_df = encoder.transform(_input_df).add_suffix(f"={col}")
        out_lst.append(out_df)
    
    output_df = pd.concat(out_lst, axis=1)
    return output_df.add_prefix("lgb__")


In [11]:
# kNN features (nagiss's features)
def get_knn_numerical_features(input_df):
    """Numeric features for the kNN model.

    Missing values are replaced by the mean of the row's ``region`` group,
    all columns are standardised, and an extra ``popularity8`` column holds
    the standardised popularity multiplied by 8. Columns get the ``kNN__`` prefix.
    """
    cols = ['popularity',
            'duration_ms',
            'acousticness',
            'positiveness',
            'danceability',
            'loudness',
            'energy',
            'liveness',
            'speechiness',
            'instrumentalness']

    def fill_with_group_mean(x):
        return x.fillna(x.mean())

    with_region = input_df[cols + ["region"]]
    imputed = with_region.groupby('region').transform(fill_with_group_mean)

    standardised = StandardScaler().fit_transform(imputed)
    output_df = pd.DataFrame(standardised, columns=cols)
    output_df["popularity8"] = output_df["popularity"] * 8
    return output_df.add_prefix("kNN__")


def get_knn_ohe_features(input_df):
    """One-hot encode ``region`` and scale the indicator columns by 100.

    Output columns carry the ``kNN__OHE_`` prefix.
    """
    # one-hot encoded features
    region_frame = input_df[["region"]]
    onehot = ce.OneHotEncoder().fit_transform(region_frame).add_prefix("OHE_")
    scaled = onehot * 100
    return scaled.add_prefix("kNN__")


def get_knn_tmpo_features(input_df):
    """Tempo bounds for the kNN model.

    The ``"low-high"`` tempo string is split into two floats, missing values
    are replaced by the mean of the row's ``region`` group, and the result is
    transformed with ``log1p`` and multiplied by 0.01.
    Returns columns ``kNN__tempo_low`` and ``kNN__tempo_high``.
    """
    # tempo-related features
    bounds = input_df["tempo"].str.split("-").apply(pd.Series).astype(float)
    bounds.columns = ["tempo_low", "tempo_high"]

    def fill_with_group_mean(x):
        return x.fillna(x.mean())

    bounds["region"] = input_df["region"].copy()
    imputed = bounds.groupby('region').transform(fill_with_group_mean)

    logged = np.log1p(imputed)
    return logged.add_prefix("kNN__") * 0.01


def get_knn_num_nan_features(input_df):
    """Count the missing feature values in each row, scaled by 100 for the kNN model.

    The target column ``genre`` and the pseudo-labelling marker ``flag`` are
    excluded when present. In the combined train+test frame built by this
    notebook ``flag`` is missing on every test row, so counting it would add
    a constant offset (100 after scaling) to all test rows. Absent columns
    are ignored instead of raising ``KeyError``.

    Args:
        input_df: Frame of raw columns; ``genre`` / ``flag`` are optional.

    Returns:
        DataFrame with the single column ``kNN__num_nan``.
    """
    non_feature_columns = ["genre", "flag"]
    feature_df = input_df.drop(columns=non_feature_columns, errors="ignore")
    output_df = pd.DataFrame()
    output_df["num_nan"] = feature_df.isnull().sum(axis=1)
    return output_df.add_prefix("kNN__") * 100


def get_knn_target_encode_features(input_df):
    """Build per-genre target-encoding features of ``pop10region`` for the kNN model.

    For each genre a binary indicator target is target-encoded. The encoders
    are fitted on the rows whose ``genre`` is not null and then applied to
    every row of ``input_df``; remaining NaNs are replaced by 0. Output
    columns are named ``kNN__TE_pop10region=<genre name>``.
    """
    kf = model_selection.KFold(n_splits=10, random_state=2021, shuffle=True)
    # First character of the popularity value as a string zero-padded to width 2.
    tmp = input_df["popularity"].astype(str).str.zfill(2)
    tmp = pd.Series([i[0] for i in tmp])
    _input_df = input_df.copy()
    _input_df["pop10region"] = tmp + input_df["region"]

    # Only rows with a known genre are used to fit the encoders.
    train_df = _input_df[_input_df["genre"].notnull()]
    train_y = ce.OneHotEncoder().fit_transform(train_df["genre"].astype(str))
    genre = ['country',
             'electronic',
             'folk',
             'hip-hop',
             'jazz',
             'latin',
             'classic',
             'other-light-music',
             'pop',
             'religious',
             'rock']
    # NOTE(review): the names are assigned positionally to whatever column order the
    # one-hot encoder emits - confirm that order matches the genre label ids.
    train_y.columns = genre
    out_lst = []
    for col in genre:
        _y = train_y[col]
        # A fresh split generator is created for every genre; the encoder materialises it.
        encoder = TargetEncodingEngine(use_columns=["pop10region"],
                                       cv=kf.split(train_df, _y))
        encoder.fit(input_df=train_df, y=_y)

        out_df = encoder.transform(_input_df).add_suffix(f"={col}")
        out_lst.append(out_df)
    
    output_df = pd.concat(out_lst, axis=1).fillna(0)
    return output_df.add_prefix("kNN__")

In [12]:
# MLP
def get_mlp_numerical_features(input_df):
    """Return the raw numeric columns with NaN replaced by 0, prefixed with ``MLP__``."""
    # numeric features used as-is
    raw_numeric = (
        'popularity',
        'duration_ms',
        'acousticness',
        'positiveness',
        'danceability',
        'loudness',
        'energy',
        'liveness',
        'speechiness',
        'instrumentalness',
    )
    selected = input_df[list(raw_numeric)]
    zero_filled = selected.fillna(0).copy()
    return zero_filled.add_prefix("MLP__")


def get_mlp_ce_features(input_df):
    """Count-encode ``region`` (after replacing NaN with 0) for the MLP model.

    Output columns carry the ``MLP__CE_`` prefix.
    """
    # count-encoded features
    binned_popularity = get_binned_popularity_features(input_df)
    # The binned popularity columns are joined here, but only "region" is encoded below.
    merged_df = pd.concat([input_df, binned_popularity], axis=1).fillna(0)

    encode_columns = ["region"]
    counted = ce.CountEncoder().fit_transform(merged_df[encode_columns])
    return counted.add_prefix("CE_").add_prefix("MLP__")


def get_mlp_ohe_features(input_df):
    """One-hot encode ``region`` and the popularity-prefix x region cross for the MLP model.

    The cross column is the first character of the popularity value (as a
    string zero-padded to width 2) concatenated with the unfilled ``region``.
    Output columns carry the ``MLP__OHE_`` prefix.
    """
    # one-hot encoded features
    padded_popularity = input_df["popularity"].astype(str).str.zfill(2)
    leading_char = pd.Series([text[0] for text in padded_popularity])
    filled_df = input_df.fillna(0).copy()
    filled_df["pop10regeion"] = leading_char + input_df["region"]

    encode_columns = ["region", "pop10regeion"]
    onehot = ce.OneHotEncoder().fit_transform(filled_df[encode_columns])
    return onehot.add_prefix("OHE_").add_prefix("MLP__")


def get_mlp_tmpo_features(input_df):
    """Tempo features for the MLP model, with every missing value replaced by 0.

    The ``"low-high"`` tempo string is split into ``tempo_low`` and
    ``tempo_high``, and ``diff_tempo`` is their difference. The zero-fill is
    applied after ``diff_tempo`` is computed, so that column cannot carry NaN
    into the MLP input either.

    Args:
        input_df: Frame with a string column ``tempo``.

    Returns:
        DataFrame with columns ``MLP__tempo_low``, ``MLP__tempo_high`` and
        ``MLP__diff_tempo``, free of NaN.
    """
    # tempo-related features
    _df = input_df["tempo"].str.split("-").apply(pd.Series).astype(float)
    _df.columns = ["tempo_low", "tempo_high"]
    output_df = _df.copy()
    output_df["diff_tempo"] = _df["tempo_high"] - _df["tempo_low"]
    # Fill last: filling only the bounds would leave NaN in diff_tempo.
    return output_df.fillna(0).add_prefix("MLP__")


def get_mlp_binned_popularity_features(input_df):
    """Split popularity into its first and second character as integer columns.

    The value is converted to a string zero-padded to width 2; for values
    0-99 the two characters are the tens and ones digits.
    Returns columns ``MLP__popularity10`` and ``MLP__popularity01``.
    """
    # tens-digit and ones-digit features of popularity
    padded = input_df["popularity"].astype(str).str.zfill(2)
    digit_rows = []
    for text in padded:
        digit_rows.append([text[0], text[1]])
    digits = pd.DataFrame(digit_rows, columns=["popularity10", "popularity01"])
    return digits.astype(int).add_prefix("MLP__")

# used when building the aggregation features
# (same name and behaviour as the max_min defined earlier in this notebook)
def max_min(x):
    """Return the range of ``x`` (maximum minus minimum)."""
    highest, lowest = x.max(), x.min()
    return highest - lowest

def q75_q25(x):
    """Return the interquartile range of ``x`` (75th minus 25th percentile)."""
    upper_quartile = x.quantile(0.75)
    lower_quartile = x.quantile(0.25)
    return upper_quartile - lower_quartile


def get_mlp_agg_region_features(input_df):
    """Per-``region`` z-score features of the numeric columns for the MLP model.

    NaN in the result is replaced by 0; columns carry the ``MLP__`` prefix.
    """
    # aggregation features keyed on region
    tempo_features = get_mlp_tmpo_features(input_df)
    source_df = pd.concat([tempo_features, input_df], axis=1)

    value_columns = [
        'popularity',
        'duration_ms',
        'acousticness',
        'positiveness',
        'danceability',
        'loudness',
        'energy',
        'liveness',
        'speechiness',
        'instrumentalness',
    ]
    methods = ["z-score"]

    engine = GroupingEngine(group_key="region",
                            group_values=value_columns,
                            agg_methods=methods)
    z_scores = engine.fit_transform(source_df).fillna(0)
    return z_scores.add_prefix("MLP__")


def get_mlp_agg_popularity10_region_features(input_df):
    """Z-score features of the numeric columns per popularity-prefix x region group.

    The group key is the first character of the popularity value (as a string
    zero-padded to width 2) concatenated with ``region``. NaN in the result is
    replaced by 0; columns carry the ``MLP__`` prefix.
    """
    padded_popularity = input_df["popularity"].astype(str).str.zfill(2)
    leading_char = pd.Series([text[0] for text in padded_popularity])

    tempo_features = get_mlp_tmpo_features(input_df)
    source_df = pd.concat([input_df, tempo_features], axis=1)
    source_df["pop10regeion"] = leading_char + input_df["region"]

    value_columns = [
        'duration_ms',
        'acousticness',
        'positiveness',
        'danceability',
        'loudness',
        'energy',
        'liveness',
        'speechiness',
        'instrumentalness',
    ]
    methods = ["z-score"]

    engine = GroupingEngine(group_key="pop10regeion",
                            group_values=value_columns,
                            agg_methods=methods)
    z_scores = engine.fit_transform(source_df).fillna(0)
    return z_scores.add_prefix("MLP__")


def get_mlp_target_encode_features(input_df):
    """Build per-genre target-encoding features for the MLP model.

    For each genre a binary indicator target is target-encoded on ``region``
    and ``pop10region``. The encoders are fitted on the rows whose ``genre``
    is not null and then applied to every row of ``input_df``; remaining NaNs
    are replaced by 0. Output columns are named ``MLP__TE_<column>=<genre name>``.
    """
    kf = model_selection.KFold(n_splits=10, random_state=2021, shuffle=True)
    # First character of the popularity value as a string zero-padded to width 2.
    tmp = input_df["popularity"].astype(str).str.zfill(2)
    tmp = pd.Series([i[0] for i in tmp])
    _input_df = input_df.copy()
    _input_df["pop10region"] = tmp + input_df["region"]

    # Only rows with a known genre are used to fit the encoders.
    train_df = _input_df[_input_df["genre"].notnull()]
    train_y = ce.OneHotEncoder().fit_transform(train_df["genre"].astype(str))
    genre = ['country',
             'electronic',
             'folk',
             'hip-hop',
             'jazz',
             'latin',
             'classic',
             'other-light-music',
             'pop',
             'religious',
             'rock']
    # NOTE(review): the names are assigned positionally to whatever column order the
    # one-hot encoder emits - confirm that order matches the genre label ids.
    train_y.columns = genre
    out_lst = []
    for col in genre:
        _y = train_y[col]
        # A fresh split generator is created for every genre; the encoder materialises it.
        encoder = TargetEncodingEngine(use_columns=["region", "pop10region"],
                                       cv=kf.split(train_df, _y))
        encoder.fit(input_df=train_df, y=_y)

        out_df = encoder.transform(_input_df).add_suffix(f"={col}")
        out_lst.append(out_df)
    
    output_df = pd.concat(out_lst, axis=1).fillna(0)
    return output_df.add_prefix("MLP__")


def get_cross_pseudo_features(input_df):
    """Return a copy of the ``index`` and ``flag`` columns, unprefixed."""
    bookkeeping_columns = ["index", "flag"]
    selected = input_df[bookkeeping_columns]
    return selected.copy()

In [13]:
# 上で作った関数を実行し、train, test それぞれで前処理を施す関数を定義: get_train_data, get_test_data
def preprocess(input_df, funcs, task="train"):
    """Run every feature function on ``input_df`` and concatenate the results column-wise.

    Each function's output is cached as ``<task>_<function name>.pkl`` inside
    the FEATURE directory; when that file already exists it is loaded instead
    of calling the function again.

    Args:
        input_df: Frame passed to every feature function.
        funcs: Feature functions, applied in order.
        task: Prefix used in the cache file names.

    Returns:
        DataFrame with the columns of all feature frames side by side.
    """
    def _load_or_build(func):
        cache_path = os.path.join(FEATURE, f"{task}_{func.__name__}.pkl")
        if not os.path.isfile(cache_path):
            built = func(input_df)
            Util.dump(built, cache_path)
            return built
        return Util.load(cache_path)

    feature_frames = [_load_or_build(func) for func in funcs]
    return pd.concat(feature_frames, axis=1)


def get_train_data(train, test):
    """Compute all features on train+test and return the rows belonging to ``train``."""
    # whole_funcs: feature functions applied to the union of train and test
    whole_funcs = [
        get_numerical_features,
        get_tmpo_features,
        get_binned_popularity_features,
        get_ce_features,
        get_oe_features,
        get_agg_region_features,
        get_agg_pop10region_features,
        get_num_nan_features,
        get_target_encode_features,
        get_knn_numerical_features,
        get_knn_ohe_features,
        get_knn_tmpo_features,
        get_knn_num_nan_features,
        get_knn_target_encode_features,
        get_mlp_numerical_features,
        get_mlp_tmpo_features,
        get_mlp_binned_popularity_features,
        get_mlp_ce_features,
        get_mlp_ohe_features,
        get_mlp_agg_region_features,
        get_mlp_agg_popularity10_region_features,
        get_mlp_target_encode_features,
        get_cross_pseudo_features,
    ]

    n_train = len(train)
    stacked = pd.concat([train, test]).reset_index(drop=True)
    # preprocessing with the whole-set feature functions
    features = preprocess(stacked, whole_funcs, task="whole_")

    # train rows come first in the stacked frame
    return features.iloc[:n_train]


def get_test_data(train, test):
    """Compute all features on train+test and return the rows belonging to ``test``, re-indexed from 0."""
    # whole_funcs: feature functions applied to the union of train and test
    whole_funcs = [
        get_numerical_features,
        get_tmpo_features,
        get_binned_popularity_features,
        get_ce_features,
        get_oe_features,
        get_agg_region_features,
        get_agg_pop10region_features,
        get_num_nan_features,
        get_target_encode_features,
        get_knn_numerical_features,
        get_knn_ohe_features,
        get_knn_tmpo_features,
        get_knn_num_nan_features,
        get_knn_target_encode_features,
        get_mlp_numerical_features,
        get_mlp_tmpo_features,
        get_mlp_binned_popularity_features,
        get_mlp_ce_features,
        get_mlp_ohe_features,
        get_mlp_agg_region_features,
        get_mlp_agg_popularity10_region_features,
        get_mlp_target_encode_features,
        get_cross_pseudo_features,
    ]

    n_train = len(train)
    stacked = pd.concat([train, test]).reset_index(drop=True)
    # preprocessing with the whole-set feature functions
    features = preprocess(stacked, whole_funcs, task="whole_")

    # test rows follow the train rows in the stacked frame
    test_rows = features.iloc[n_train:]
    return test_rows.reset_index(drop=True)

In [14]:
# get features
# Both helpers run the same feature functions on train+test and return the train / test slice.
train_x = get_train_data(train, test)
test_x = get_test_data(train, test)
# Target labels (pseudo-labelled rows included, since they were appended to train).
train_y = train["genre"]

print(train_x.shape)

# Sanity check: train and test feature matrices must have the same number of columns.
if train_x.shape[1] != test_x.shape[1]:
    raise Exception("Not much number of features")


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead


is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead



(6624, 464)
