<a href="https://colab.research.google.com/github/namwootree/Portfolio/blob/main/Competition/Kaggle/Scrabble%20Player%20Rating/CV_Stacking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting

# install

In [None]:
!pip install textstat

In [None]:
!pip install category_encoders

In [None]:
!pip install optuna

In [None]:
!pip install catboost

# Library

In [None]:
import os

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import textstat
import category_encoders as ce
import optuna

from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import cross_validate, KFold, RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR

import catboost
from catboost import CatBoostRegressor
from catboost import Pool, cv
import lightgbm as lgb
from lightgbm import LGBMRegressor, early_stopping, Dataset
import xgboost as xgb
from xgboost import XGBRegressor

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

# Load Data

## Google Drive Mount

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Unzip File

In [None]:
!unzip -qq '/content/drive/MyDrive/머신러닝 엔지니어링/Kaggle/Scrabble Player Rating/data/scrabble-player-rating.zip'

## Load Train / Test / Games / Turns Data

### Preprocessing Turns Data

In [None]:
def create_turn_features(df):
    """Aggregate the per-turn log into one feature row per (game_id, nickname).

    Parameters
    ----------
    df : pandas.DataFrame
        Turn-level rows. The columns read here are ``game_id``, ``nickname``,
        ``rack``, ``move``, ``location``, ``points`` and ``turn_type``.

    Returns
    -------
    pandas.DataFrame
        One row per (game_id, nickname) with flattened column names such as
        ``points_mean``, ``move_len_max`` or ``turn_type_<value>_sum``.

    Notes
    -----
    The input frame is copied first, so the caller's frame is left untouched.
    """
    df = df.copy()

    # --- tile related features ---
    df["rack_len"] = df["rack"].str.len()               # number of tiles on the rack
    df["rack_len_less_than_7"] = df["rack_len"] < 7     # NaN compares False, as before
    df["move_len"] = df["move"].str.len()               # measured BEFORE filling, so missing moves stay NaN
    df["move"] = df["move"].fillna("None")
    # number of "difficult" words according to textstat
    df["difficult_word"] = df["move"].apply(textstat.difficult_words)

    # --- kind of action taken on the turn ---
    df["turn_type"] = df["turn_type"].fillna("None")
    turn_type_unique = df["turn_type"].unique()
    df = pd.get_dummies(df, columns=["turn_type"])
    dummy_features = [f"turn_type_{value}" for value in turn_type_unique]

    # --- board position ---
    # NOTE: x / y are derived but not part of the aggregation below.
    # Digits of the location (raw string: "\d" in a plain literal is an invalid escape).
    df["y"] = df["location"].str.extract(r"(\d+)", expand=False).fillna("0").astype(int)

    # Column letter A..O -> 1..15; anything missing or unmapped becomes 0.
    char_map = {letter: number for number, letter in enumerate("ABCDEFGHIJKLMNO", start=1)}
    df["x"] = (
        df["location"].str.extract("([A-Z])", expand=False).map(char_map).fillna(0).astype(int)
    )

    # 1 when the location string starts with a digit, otherwise 0.
    df["direction_of_play"] = df["location"].apply(lambda x: 1 if str(x)[0].isdigit() else 0)

    # Count of "." characters plus lower-case characters in the move string.
    df["curr_board_pieces_used"] = df["move"].apply(
        lambda x: str(x).count(".") + sum(int(c.islower()) for c in str(x))
    )

    # --- per (game, player) aggregation ---
    agg_func = {feature: "sum" for feature in dummy_features}
    agg_func.update({
        "points": ["mean", "max"],
        "move_len": ["mean", "max"],
        "difficult_word": ["mean", "sum"],
        "curr_board_pieces_used": "mean",
        "direction_of_play": "mean",
        "rack_len_less_than_7": "sum",
    })

    turns_grouped = df.groupby(["game_id", "nickname"], as_index=False).agg(agg_func)

    # Flatten the (column, statistic) MultiIndex into "column_statistic";
    # the two grouping keys keep their plain names.
    turns_grouped.columns = [
        col[0] if col[0] in ["game_id", "nickname"] else "_".join(col)
        for col in turns_grouped.columns.to_flat_index()
    ]

    print('DONE : create_turn_features')

    return turns_grouped

### Create Train / Test Data

In [None]:
def load_data(bot_names=None, cat_features=None, root_dir='/content/'):
    """Read the competition CSVs and build one row per human player per game.

    Parameters
    ----------
    bot_names : list of str, optional
        Nicknames treated as bots. Defaults to
        ``["BetterBot", "STEEBot", "HastyBot", "MasterBot"]``.
    cat_features : list of str, optional
        Columns converted to pandas ``category`` dtype. Defaults to none.
    root_dir : str, optional
        Directory holding ``train.csv``, ``test.csv``, ``turns.csv`` and
        ``games.csv``. Defaults to ``'/content/'`` (the previously hard-coded path).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``train`` and ``test`` frames indexed by ``game_id``.
    """
    # None sentinels instead of mutable list defaults.
    if bot_names is None:
        bot_names = ["BetterBot", "STEEBot", "HastyBot", "MasterBot"]
    if cat_features is None:
        cat_features = []

    train_raw = pd.read_csv(os.path.join(root_dir, "train.csv"))
    test_raw = pd.read_csv(os.path.join(root_dir, "test.csv"))
    turns = pd.read_csv(os.path.join(root_dir, "turns.csv"))
    games = pd.read_csv(os.path.join(root_dir, "games.csv"))

    # Train and test are processed together and split again at the end.
    df = pd.concat([train_raw, test_raw])

    # Attach the aggregated turn features of each (game, player).
    turns_fe_df = create_turn_features(turns)
    df = df.merge(turns_fe_df, how="left", on=["game_id", "nickname"])

    # Bot side of every game: its score, rating and name, keyed by game_id.
    is_bot = df["nickname"].isin(bot_names)
    bot_df = df.loc[is_bot, ["game_id", "score", "rating", "nickname"]].rename(
        columns={"score": "bot_score", "rating": "bot_rating", "nickname": "bot_name"}
    )

    # Human side, with the bot columns and the game metadata joined on
    # (inner joins: games without a bot row or a games.csv row are dropped).
    df = df[~is_bot]
    df = df.merge(bot_df, on="game_id")
    df = df.merge(games, on="game_id")
    df["created_at"] = pd.to_datetime(df["created_at"])

    # Outlier removal: drop nicknames that have more than one row and whose
    # rating equals exactly 1500 on every one of those rows.
    # NOTE(review): presumably 1500 is a default/unchanged rating - confirm.
    users_at_1500 = df.loc[df["rating"] == 1500, "nickname"]
    per_user = (
        df[df["nickname"].isin(users_at_1500)]
        .groupby("nickname")["rating"]
        .agg(n_games="size", n_at_1500=lambda ratings: (ratings == 1500).sum())
    )
    share_at_1500 = per_user["n_at_1500"] / per_user["n_games"]
    anomalous_users = per_user[(share_at_1500 >= 1.0) & (per_user["n_games"] > 1)].index
    df = df[~df["nickname"].isin(anomalous_users)]

    # Declare categorical columns.
    for name in cat_features:
        df[name] = df[name].astype("category")

    # Split back using the game ids of the original files.
    train = df[df["game_id"].isin(train_raw["game_id"])].set_index("game_id")
    test = df[df["game_id"].isin(test_raw["game_id"])].set_index("game_id")

    print('\nDONE : load_data\n')

    return train, test

In [None]:
cat_features = ["nickname","bot_name", 
                "time_control_name", "first",
                "game_end_reason", "winner",
                "lexicon", "rating_mode"]

In [None]:
train, test = load_data(cat_features = cat_features)

In [None]:
train.head()

In [None]:
test.head()

# Preprocessing

## create_cumm_player_features_overall

In [None]:
def create_cumm_player_features_overall(df):
    """Per-player running statistics computed from the player's EARLIER games only.

    For every row (one game of one player) the features summarise the games
    the same ``nickname`` played before it in ``created_at`` order, because the
    current game's outcome is not known before it is played.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``nickname``, ``created_at``, ``score``, ``winner`` and
        ``game_duration_seconds``. ``winner`` must be convertible to float.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df`` (sorted by index) with the columns
        ``cumm_avg_player_score``, ``cumm_player_wins``,
        ``cumm_avg_player_win_ratio`` and ``cumm_avg_game_duration_seconds``.
        A player's first game, and any undefined value, is 0.
    """
    feature_cols = [
        "cumm_avg_player_score",
        "cumm_player_wins",
        "cumm_avg_player_win_ratio",
        "cumm_avg_game_duration_seconds",
    ]

    df = df[["nickname", "created_at", "score", "winner", "game_duration_seconds"]]
    df = df.sort_values(by="created_at")

    # Work on float copies so categorical/integer inputs behave like the
    # float conversion that Series.expanding() applies.
    frame = df.assign(
        score=df["score"].astype(float),
        winner=df["winner"].astype(float),
        game_duration_seconds=df["game_duration_seconds"].astype(float),
    )
    by_player = frame.groupby("nickname", sort=False, observed=True)

    def prior(column, stat):
        # Expanding statistic over the player's games, shifted by one so each
        # row only sees earlier games (NaN on the first game).
        return by_player[column].transform(
            lambda s: getattr(s.expanding(min_periods=1), stat)().shift(1)
        )

    wins = prior("winner", "sum")

    out = pd.DataFrame(index=df.index)
    out["cumm_avg_player_score"] = prior("score", "mean").to_numpy()
    # winner is summed as-is (per the original note: win 1, draw 0, loss -1).
    out["cumm_player_wins"] = wins.to_numpy()
    out["cumm_avg_player_win_ratio"] = (wins / prior("winner", "count")).to_numpy()
    # min_periods=1 like every sibling feature; the former min_periods=2 zeroed
    # the value for a player's second game although one earlier game existed.
    out["cumm_avg_game_duration_seconds"] = prior("game_duration_seconds", "mean").to_numpy()

    out = out.fillna(0).sort_index()

    print('DONE : create_cumm_player_features_overall')

    return out[feature_cols]

## create_cumm_player_features_bot

In [None]:
def create_cumm_player_features_bot(df):
    """Per-player, per-opponent-bot running statistics from EARLIER games only.

    For each ``bot_name`` found in ``df`` four columns are produced:
    ``cumm_avg_player_score_<bot>``, ``cumm_player_wins_<bot>``,
    ``cumm_avg_player_win_ratio_<bot>`` and
    ``cumm_avg_game_duration_seconds_<bot>``. A row only carries values in the
    columns of the bot it was played against; every other column is 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``nickname``, ``created_at``, ``score``, ``winner``,
        ``bot_name`` and ``game_duration_seconds``.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df`` (sorted by index), feature columns in
        alphabetical order. First games and undefined values are 0.
    """
    base_cols = ["nickname", "created_at", "score", "winner", "bot_name", "game_duration_seconds"]
    df = df[base_cols].sort_values(by="created_at")

    # Float copies so categorical/integer inputs aggregate like floats.
    frame = df.assign(
        score=df["score"].astype(float),
        winner=df["winner"].astype(float),
        game_duration_seconds=df["game_duration_seconds"].astype(float),
    )
    by_pair = frame.groupby(["nickname", "bot_name"], sort=False, observed=True)

    def prior(column, stat):
        # Expanding statistic within one (player, bot) pairing, shifted by one
        # so that only earlier games are visible (NaN on the first game).
        return by_pair[column].transform(
            lambda s: getattr(s.expanding(min_periods=1), stat)().shift(1)
        )

    wins = prior("winner", "sum")
    stats = {
        "cumm_avg_player_score_": prior("score", "mean"),
        # winner is summed as-is (per the original note: win 1, draw 0, loss -1)
        "cumm_player_wins_": wins,
        # Fix: the ratio is prior wins / prior games. The previous code divided
        # the still-zero ratio column by the count, so it was always 0.
        "cumm_avg_player_win_ratio_": wins / prior("winner", "count"),
        "cumm_avg_game_duration_seconds_": prior("game_duration_seconds", "mean"),
    }

    out = pd.DataFrame(index=df.index)
    for bot_name in df["bot_name"].unique():
        against_bot = (df["bot_name"] == bot_name).to_numpy()
        for prefix, values in stats.items():
            out[prefix + str(bot_name)] = np.where(against_bot, values, 0.0)

    # Fill undefined values with 0 and restore index (game id) order.
    out = out.fillna(0).sort_index()

    print('DONE : create_cumm_player_features_bot')

    return out[sorted(out.columns)]

## create_cumm_player_features_lexicon

In [None]:
def create_cumm_player_features_lexicon(df):
    """Per-player, per-lexicon running statistics from EARLIER games only.

    For each ``lexicon`` value found in ``df`` four columns are produced:
    ``cumm_avg_player_score_<lexicon>``, ``cumm_player_wins_<lexicon>``,
    ``cumm_avg_player_win_ratio_<lexicon>`` and
    ``cumm_avg_game_duration_seconds_<lexicon>``. A row only carries values in
    the columns of the lexicon its game used; every other column is 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``nickname``, ``created_at``, ``score``, ``winner``,
        ``lexicon`` and ``game_duration_seconds``.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df`` (sorted by index), feature columns in
        alphabetical order. First games and undefined values are 0.
    """
    base_cols = ["nickname", "created_at", "score", "winner", "lexicon", "game_duration_seconds"]
    df = df[base_cols].sort_values(by="created_at")

    # Float copies so categorical/integer inputs aggregate like floats.
    frame = df.assign(
        score=df["score"].astype(float),
        winner=df["winner"].astype(float),
        game_duration_seconds=df["game_duration_seconds"].astype(float),
    )
    by_pair = frame.groupby(["nickname", "lexicon"], sort=False, observed=True)

    def prior(column, stat):
        # Expanding statistic within one (player, lexicon) pairing, shifted by
        # one so that only earlier games are visible (NaN on the first game).
        return by_pair[column].transform(
            lambda s: getattr(s.expanding(min_periods=1), stat)().shift(1)
        )

    wins = prior("winner", "sum")
    stats = {
        "cumm_avg_player_score_": prior("score", "mean"),
        # winner is summed as-is (per the original note: win 1, draw 0, loss -1)
        "cumm_player_wins_": wins,
        # Fix: the ratio is prior wins / prior games. The previous code divided
        # the still-zero ratio column by the count, so it was always 0.
        "cumm_avg_player_win_ratio_": wins / prior("winner", "count"),
        "cumm_avg_game_duration_seconds_": prior("game_duration_seconds", "mean"),
    }

    out = pd.DataFrame(index=df.index)
    for lexicon in df["lexicon"].unique():
        uses_lexicon = (df["lexicon"] == lexicon).to_numpy()
        for prefix, values in stats.items():
            out[prefix + str(lexicon)] = np.where(uses_lexicon, values, 0.0)

    # Fill undefined values with 0 and restore index (game id) order.
    out = out.fillna(0).sort_index()

    print('DONE : create_cumm_player_features_lexicon')

    return out[sorted(out.columns)]

## create_cumm_player_game_features

In [None]:
def create_cumm_player_game_features(df):
    """Count, per player, how often each game setting occurred in EARLIER games.

    ``bot_name``, ``rating_mode``, ``lexicon`` and ``game_end_reason`` are
    one-hot encoded; for every encoded column ``<name>`` the result holds
    ``cumm_<name>_counts`` = number of the player's previous games (in
    ``created_at`` order) where that indicator was set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``nickname``, ``created_at`` and the four encoded columns.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df`` (sorted by index), ``cumm_*_counts`` columns in
        alphabetical order, float dtype.
    """
    base_cols = ["nickname", "created_at", "bot_name", "rating_mode", "lexicon", "game_end_reason"]
    encoded_cols = ["bot_name", "rating_mode", "lexicon", "game_end_reason"]

    df = df[base_cols]

    encoder = ce.OneHotEncoder(cols=encoded_cols, use_cat_names=True)
    one_hot = encoder.fit_transform(df[encoded_cols])
    # Read the indicator names from the encoded frame rather than from
    # encoder.get_feature_names(), which newer category_encoders releases deprecate.
    feature_names = list(one_hot.columns)

    df = df.join(one_hot).sort_values(by="created_at")

    # Running total per player minus the current row = total over earlier games.
    running = df.groupby("nickname", sort=False, observed=True)[feature_names].cumsum()
    prior_counts = pd.DataFrame(
        running.to_numpy(dtype=float) - df[feature_names].to_numpy(dtype=float),
        index=df.index,
        columns=["cumm_" + str(name) + "_counts" for name in feature_names],
    )

    prior_counts = prior_counts.fillna(0).sort_index()

    print('DONE : create_cumm_player_game_features')

    return prior_counts[sorted(prior_counts.columns)]

## create_cumm_bot_features

In [None]:
def create_cumm_bot_features(df):
    """Running bot score / bot rating averages per (player, bot) pairing.

    For each ``bot_name`` found in ``df`` two columns are produced:

    * ``cumm_avg_bot_score_<bot>`` - mean ``bot_score`` over the pairing's
      EARLIER games (0 for the first game), since the current score is not
      known before the game is played;
    * ``cumm_avg_bot_rating_<bot>`` - mean ``bot_rating`` over the pairing's
      games up to AND including the current one (the original note states the
      bot rating is known before the game).

    Rows played against a different bot hold 0 in a bot's columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``nickname``, ``created_at``, ``bot_name``,
        ``bot_score`` and ``bot_rating``.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df`` (sorted by index), feature columns in
        alphabetical order.
    """
    base_cols = ["nickname", "created_at", "bot_name", "bot_score", "bot_rating"]
    df = df[base_cols].sort_values(by="created_at")

    frame = df.assign(
        bot_score=df["bot_score"].astype(float),
        bot_rating=df["bot_rating"].astype(float),
    )
    # One pass over (player, bot) groups instead of a full-frame mask per pair.
    by_pair = frame.groupby(["nickname", "bot_name"], sort=False, observed=True)

    stats = {
        # earlier games only -> shift by one
        "cumm_avg_bot_score_": by_pair["bot_score"].transform(
            lambda s: s.expanding(min_periods=1).mean().shift(1)
        ),
        # current game included -> no shift
        "cumm_avg_bot_rating_": by_pair["bot_rating"].transform(
            lambda s: s.expanding(min_periods=1).mean()
        ),
    }

    out = pd.DataFrame(index=df.index)
    for bot_name in df["bot_name"].unique():
        against_bot = (df["bot_name"] == bot_name).to_numpy()
        for prefix, values in stats.items():
            out[prefix + str(bot_name)] = np.where(against_bot, values, 0.0)

    out = out.fillna(0).sort_index()

    print('DONE : create_cumm_bot_features')

    return out[sorted(out.columns)]

## create_cumm_turns_features

In [None]:
def create_cumm_turns_features(df):
    """Per-player running mean of the aggregated turn features over EARLIER games.

    For every column ``<name>`` in the fixed ``turn_features`` list the result
    holds ``cumm_<name>_average``: the mean of that column over the player's
    previous games in ``created_at`` order (0 for a player's first game and
    wherever the mean is undefined).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``nickname``, ``created_at`` and every column named in
        ``turn_features`` below (a missing one raises ``KeyError``).

    Returns
    -------
    pandas.DataFrame
        Same index as ``df`` (sorted by index), feature columns in
        alphabetical order.
    """
    turn_features = ['turn_type_Play_sum',
       'turn_type_End_sum', 'turn_type_Exchange_sum', 'turn_type_Pass_sum',
       'turn_type_Timeout_sum', 'turn_type_Challenge_sum',
       'turn_type_Six-Zero Rule_sum', 'turn_type_None_sum', 'points_mean',
       'points_max', 'move_len_mean', 'move_len_max', 'difficult_word_mean',
       'difficult_word_sum', 'curr_board_pieces_used_mean',
       'direction_of_play_mean', 'rack_len_less_than_7_sum']

    df = df[["nickname", "created_at"] + turn_features].sort_values(by="created_at")

    # Float copy so integer/boolean sums average like floats.
    frame = df.astype({name: float for name in turn_features})

    # One grouped pass instead of a full-frame mask per player and feature;
    # the shift keeps the current game out of its own average.
    prior_means = frame.groupby("nickname", sort=False, observed=True)[turn_features].transform(
        lambda s: s.expanding(min_periods=1).mean().shift(1)
    )

    out = pd.DataFrame(
        prior_means.to_numpy(dtype=float),
        index=df.index,
        columns=["cumm_" + str(name) + "_average" for name in turn_features],
    )
    out = out.fillna(0).sort_index()

    print('DONE : create_cumm_turns_features')

    return out[sorted(out.columns)]

## Finalize Features

In [None]:
# 모든 전처리 기법 사용
def create_features(df, df_test=None):
    """Build the model feature matrix from the cumulative feature builders.

    Train and (optional) test rows are concatenated first so that every
    running statistic is computed over one combined game history, then split
    again by index.

    Note: ``create_cumm_player_features_bot`` is not part of this pipeline;
    the lexicon, game-setting, overall, bot and turn builders are.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows, indexed by game id.
    df_test : pandas.DataFrame, optional
        Test rows, indexed by game id.

    Returns
    -------
    pandas.DataFrame or (pandas.DataFrame, pandas.DataFrame)
        ``X`` alone when ``df_test`` is None, otherwise ``(X, X_test)``.
    """
    X_raw = df.copy()

    if df_test is not None:
        # Combine train and test so the running features see both.
        X_raw = pd.concat([X_raw, df_test.copy()])

    # The feature frames are joined on the shared index.
    X = create_cumm_player_features_lexicon(X_raw)
    X = X.join(create_cumm_player_game_features(X_raw))
    X = X.join(create_cumm_player_features_overall(X_raw))
    X = X.join(create_cumm_bot_features(X_raw))
    X = X.join(create_cumm_turns_features(X_raw))

    if df_test is None:
        return X

    # Split back: test rows in df_test's order, the remaining rows are train.
    X_test = X.loc[df_test.index, :]
    X_train = X.drop(index=df_test.index)
    return X_train, X_test

In [None]:
X, X_test = create_features(train, test)
# The feature builders return rows sorted by index (game id), so select the
# targets in X's row order instead of relying on train's original order.
y = train.loc[X.index, 'rating'].copy()

# Modeling

In [None]:
path_stacking_data = '/content/drive/MyDrive/머신러닝 엔지니어링/Kaggle/Scrabble Player Rating/for_stacking_data/'

## LGBM

### Optuna

In [None]:
def objective(trial, X, y):
    """Optuna objective for LightGBM: 5-fold cross-validated MSE of one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    X, y
        Feature matrix and target passed to ``lgb.Dataset``.

    Returns
    -------
    float
        Mean validation L2 (MSE) at the early-stopped iteration.
    """
    # Search space.
    param = {
        "objective": "regression",
        "verbosity": -1,
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-10, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-10, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 60),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 500),
    }

    # Early stopping goes through the callback API: the verbose_eval /
    # early_stopping_rounds keywords are no longer accepted by LightGBM 4.
    # A fixed fold seed scores every trial on the same splits.
    lgbcv = lgb.cv(
        param,
        lgb.Dataset(X, label=y),
        folds=KFold(n_splits=5, shuffle=True, random_state=42),
        num_boost_round=50000,
        callbacks=[lgb.early_stopping(200, verbose=False)],
    )

    # The key is 'l2-mean' in LightGBM 3.x and 'valid l2-mean' in 4.x.
    metric_key = next(key for key in lgbcv if key.endswith('l2-mean'))
    cv_score = lgbcv[metric_key][-1]  # MSE at the best iteration

    return cv_score

In [None]:
optuna.logging.set_verbosity(optuna.logging.WARNING) 
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial, X, y),
               timeout=1800,
               n_trials=150,
               n_jobs=1,
               show_progress_bar=True) 

In [None]:
print(study.best_params)

In [None]:
print(study.best_value**0.5)

### CV Stacking

In [None]:
def get_stacking_base_datasets(X, y, df_test, n_splits, n_repeats, params=None):
    """Out-of-fold LightGBM predictions for stacking (repeated K-fold).

    Parameters
    ----------
    X, y : pandas.DataFrame, pandas.Series
        Training features / target (indexed positionally with ``.iloc``).
    df_test : pandas.DataFrame
        Test features; predicted by every fold model.
    n_splits, n_repeats : int
        ``RepeatedKFold`` configuration; ``n_splits * n_repeats`` models are trained.
    params : dict, optional
        LightGBM parameters. Defaults to the global ``study.best_params``, so
        the LightGBM Optuna study must be the current ``study``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(n_train, 1)`` out-of-fold predictions averaged over the repeats and
        ``(n_test, 1)`` test predictions averaged over all models.
    """
    if params is None:
        params = study.best_params

    n_models = n_splits * n_repeats

    # Every training row is validated once per repeat: accumulate, then average.
    oof_sum = np.zeros(X.shape[0])
    # One column per trained model (was n_splits, which overflowed on model n_splits+1).
    test_pred = np.zeros((df_test.shape[0], n_models))

    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats)

    lgb_params = {
        'objective': 'regression',
        'verbosity': -1,
        **params
    }

    for fold_idx, (train_index, valid_index) in enumerate(tqdm(rkf.split(X, y), total=n_models)):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

        print(f'\nNumber fold : {fold_idx}\n')
        model = lgb.train(lgb_params,
                          lgb_train,
                          num_boost_round=50000,
                          valid_sets=[lgb_eval],
                          callbacks=[lgb.early_stopping(100, verbose=False)])

        y_pred = model.predict(X_valid)

        MSE = mean_squared_error(y_valid, y_pred)
        print(f'\n{fold_idx} 번 Model Vaild MSE : {MSE}\n')

        oof_sum[valid_index] += y_pred
        test_pred[:, fold_idx] = model.predict(df_test)

    train_fold_pred = (oof_sum / n_repeats).reshape(-1, 1)
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [None]:
stacking_train, stacking_test = get_stacking_base_datasets(X, y, X_test, 5, 3)

In [None]:
LGBM_Stacking_train = pd.DataFrame(stacking_train)
# Assign the target positionally: y is indexed by game id while this frame has
# a 0..n-1 index, so label alignment would attach the wrong rows or NaN.
LGBM_Stacking_train['target'] = y.to_numpy()

LGBM_Stacking_test = pd.DataFrame(stacking_test)

In [None]:
LGBM_Stacking_train.dropna(inplace=True)

In [None]:
LGBM_Stacking_train.to_csv(path_stacking_data+'LGBM_Stacking_train.csv')
LGBM_Stacking_test.to_csv(path_stacking_data+'LGBM_Stacking_test.csv')

## CatBoost

### Optuna

In [None]:
def objective(trial, X, y):
    """Optuna objective for CatBoost: 5-fold cross-validated RMSE of one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    X, y
        Feature matrix and target wrapped in a ``catboost.Pool``.

    Returns
    -------
    float
        Best (lowest) mean test RMSE over the boosting iterations.
    """
    cv_dataset = Pool(data=X, label=y)

    # Search space.
    param = {
        'loss_function': 'RMSE',
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 16),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.4, 1.0),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 3e-5),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 500),
        "max_bin": trial.suggest_int("max_bin", 100, 500)
    }

    # Call catboost.cv through the module: the bare name `cv` is rebound later
    # in the notebook by `from xgboost import cv`. A fixed fold seed scores
    # every trial on the same splits.
    scores = catboost.cv(cv_dataset,
                         param,
                         folds=KFold(n_splits=5, shuffle=True, random_state=42),
                         verbose_eval=False,
                         early_stopping_rounds=200,
                         num_boost_round=50000)

    # Take the minimum rather than the last row, so iterations logged after
    # the best one cannot inflate the score.
    cv_score = float(scores['test-RMSE-mean'].min())

    return cv_score

In [None]:
optuna.logging.set_verbosity(optuna.logging.WARNING) 
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial, X, y),
               timeout=1800,
               n_trials=100,
               n_jobs=1,
               show_progress_bar=True) 

In [None]:
print(study.best_params)

In [None]:
# The CatBoost objective already returns RMSE, so no square root is taken here.
print(study.best_value)

### CV Stacking

In [None]:
def get_stacking_base_datasets(X, y, df_test, n_splits, n_repeats, params=None):
    """Out-of-fold CatBoost predictions for stacking (repeated K-fold).

    Parameters
    ----------
    X, y : pandas.DataFrame, pandas.Series
        Training features / target (indexed positionally with ``.iloc``).
    df_test : pandas.DataFrame
        Test features; predicted by every fold model.
    n_splits, n_repeats : int
        ``RepeatedKFold`` configuration; ``n_splits * n_repeats`` models are trained.
    params : dict, optional
        CatBoost parameters. Defaults to the global ``study.best_params``, so
        the CatBoost Optuna study must be the current ``study``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(n_train, 1)`` out-of-fold predictions averaged over the repeats and
        ``(n_test, 1)`` test predictions averaged over all models.
    """
    if params is None:
        params = study.best_params

    n_models = n_splits * n_repeats

    # Every training row is validated once per repeat: accumulate, then average.
    oof_sum = np.zeros(X.shape[0])
    # One column per trained model (was n_splits, which overflowed on model n_splits+1).
    test_pred = np.zeros((df_test.shape[0], n_models))

    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats)

    # 'RMSE' matches the loss used during the Optuna search; 'regression' is a
    # LightGBM objective name, not a CatBoost loss function.
    cat_params = {
        'loss_function': 'RMSE',
        'n_estimators': 50000,
        **params
    }

    for fold_idx, (train_index, valid_index) in enumerate(tqdm(rkf.split(X, y), total=n_models)):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        print(f'\nNumber fold : {fold_idx}\n')

        model = CatBoostRegressor(**cat_params)

        model.fit(X_train, y_train,
                  eval_set=(X_valid, y_valid),
                  early_stopping_rounds=100,
                  use_best_model=True,
                  verbose=100)

        y_pred = model.predict(X_valid)

        MSE = mean_squared_error(y_valid, y_pred)
        print(f'\n{fold_idx} 번 Model Vaild MSE : {MSE}\n')

        oof_sum[valid_index] += y_pred
        test_pred[:, fold_idx] = model.predict(df_test)

    train_fold_pred = (oof_sum / n_repeats).reshape(-1, 1)
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [None]:
stacking_train, stacking_test = get_stacking_base_datasets(X, y, X_test, 5, 3)

In [None]:
CAT_Stacking_train = pd.DataFrame(stacking_train)
# Assign the target positionally: y is indexed by game id while this frame has
# a 0..n-1 index, so label alignment would attach the wrong rows or NaN.
CAT_Stacking_train['target'] = y.to_numpy()

CAT_Stacking_test = pd.DataFrame(stacking_test)

In [None]:
CAT_Stacking_train.dropna(inplace=True)

In [None]:
CAT_Stacking_train.to_csv(path_stacking_data+'CAT_Stacking_train.csv')
CAT_Stacking_test.to_csv(path_stacking_data+'CAT_Stacking_test.csv')

## XGBoost

### Optuna

In [None]:
from xgboost import cv
def objective(trial, X, y):
    """Optuna objective for XGBoost: 5-fold cross-validated RMSE of one trial.

    Requires ``import xgboost as xgb`` in the notebook's import cell.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    X, y
        Feature matrix and target wrapped in an ``xgb.DMatrix``.

    Returns
    -------
    float
        Mean test RMSE in the last row of the ``xgb.cv`` result.
    """
    dtrain_matrix = xgb.DMatrix(X, label=y)

    # Search space. The number of rounds is set by num_boost_round below;
    # 'n_estimators' is a scikit-learn wrapper argument and is not a booster parameter.
    param = {
        'max_depth': trial.suggest_int('max_depth', 8, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 500),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }

    # A fixed fold seed scores every trial on the same splits.
    xgbcv = xgb.cv(
        params=param,
        dtrain=dtrain_matrix,
        num_boost_round=50000,
        folds=KFold(n_splits=5, shuffle=True, random_state=42),
        verbose_eval=False,
        metrics='rmse',
        early_stopping_rounds=200,
    )

    cv_score = float(xgbcv['test-rmse-mean'].iloc[-1])

    return cv_score

In [None]:
optuna.logging.set_verbosity(optuna.logging.WARNING) 
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial, X, y),
               timeout=1800,
               n_trials=100,
               n_jobs=1,
               show_progress_bar=True) 

In [None]:
print(study.best_params)

In [None]:
# The XGBoost objective already returns RMSE, so no square root is taken here.
print(study.best_value)

### CV Stacking

In [None]:
def get_stacking_base_datasets(X, y, df_test, n_splits, n_repeats, params=None):
    """Out-of-fold XGBoost predictions for stacking (repeated K-fold).

    Requires ``from xgboost import XGBRegressor`` in the notebook's import cell.

    Parameters
    ----------
    X, y : pandas.DataFrame, pandas.Series
        Training features / target (indexed positionally with ``.iloc``).
    df_test : pandas.DataFrame
        Test features; predicted by every fold model.
    n_splits, n_repeats : int
        ``RepeatedKFold`` configuration; ``n_splits * n_repeats`` models are trained.
    params : dict, optional
        XGBoost parameters. Defaults to the global ``study.best_params``, so
        the XGBoost Optuna study must be the current ``study``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(n_train, 1)`` out-of-fold predictions averaged over the repeats and
        ``(n_test, 1)`` test predictions averaged over all models.
    """
    if params is None:
        params = study.best_params

    n_models = n_splits * n_repeats

    # Every training row is validated once per repeat: accumulate, then average.
    oof_sum = np.zeros(X.shape[0])
    # One column per trained model (was n_splits, which overflowed on model n_splits+1).
    test_pred = np.zeros((df_test.shape[0], n_models))

    rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats)

    # Early stopping is configured on the estimator (xgboost >= 1.6).
    # 'verbose' and fit(use_best_model=...) belong to CatBoost, not XGBoost.
    xgb_params = {
        'n_estimators': 50000,
        'early_stopping_rounds': 100,
        **params
    }

    for fold_idx, (train_index, valid_index) in enumerate(tqdm(rkf.split(X, y), total=n_models)):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        print(f'\nNumber fold : {fold_idx}\n')

        model = XGBRegressor(**xgb_params)

        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  verbose=100)

        y_pred = model.predict(X_valid)

        MSE = mean_squared_error(y_valid, y_pred)
        print(f'\n{fold_idx} 번 Model Vaild MSE : {MSE}\n')

        oof_sum[valid_index] += y_pred
        test_pred[:, fold_idx] = model.predict(df_test)

    train_fold_pred = (oof_sum / n_repeats).reshape(-1, 1)
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [None]:
stacking_train, stacking_test = get_stacking_base_datasets(X, y, X_test, 5, 3)

In [None]:
XGB_Stacking_train = pd.DataFrame(stacking_train)
# Assign the target positionally: y is indexed by game id while this frame has
# a 0..n-1 index, so label alignment would attach the wrong rows or NaN.
XGB_Stacking_train['target'] = y.to_numpy()

XGB_Stacking_test = pd.DataFrame(stacking_test)

In [None]:
XGB_Stacking_train.dropna(inplace=True)

In [None]:
# Write to XGB_* files; the CAT_* names used before overwrote the CatBoost outputs.
XGB_Stacking_train.to_csv(path_stacking_data+'XGB_Stacking_train.csv')
XGB_Stacking_test.to_csv(path_stacking_data+'XGB_Stacking_test.csv')

## Meta Model

In [None]:
# The stacking frames were built from NumPy arrays, so the prediction column
# is labelled with the integer 0 (the string '0' raises KeyError).
dict_train = {
    'LGBM':LGBM_Stacking_train[0],
    'CAT':CAT_Stacking_train[0],
    'XGB':XGB_Stacking_train[0],
    'target':LGBM_Stacking_train['target']
}

train = pd.DataFrame(dict_train)

In [None]:
# The stacking frames were built from NumPy arrays, so the prediction column
# is labelled with the integer 0 (the string '0' raises KeyError).
dict_test = {
    'LGBM':LGBM_Stacking_test[0],
    'CAT':CAT_Stacking_test[0],
    'XGB':XGB_Stacking_test[0]
}

test = pd.DataFrame(dict_test)

In [None]:
X = train.drop(columns='target')
y = train['target']

In [None]:
svr = LinearSVR(max_iter= 1000000, verbose=1)

svr.fit(X, y)

In [None]:
# Predict from the three meta-features only, so the cell can be re-run after
# 'rating' has been added to `test`.
meta_features = ['LGBM', 'CAT', 'XGB']
test['rating'] = svr.predict(test[meta_features])

# The base-model test predictions follow X_test's row order, so label each
# prediction with that frame's index (game id) instead of a 0..n-1 counter.
# NOTE(review): assumes X_test is still the feature frame returned by create_features.
submission = pd.Series(test['rating'].to_numpy(),
                       index=pd.Index(X_test.index, name='game_id'),
                       name='rating')

save_path = '/content/drive/MyDrive/머신러닝 엔지니어링/Kaggle/Scrabble Player Rating/'
submission.to_csv(save_path+"stacking_LGBM_CAT.csv")