# Categorical Boost
#### cv score :  0.5542533081285443 (5 folds)

In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
import sklearn
from Functions import prepro
import warnings

import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
warnings.filterwarnings('ignore')
from catboost import Pool, CatBoostClassifier, CatBoost


random.seed(0)

pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

In [2]:
train_df = pd.read_csv("../../data/Processed/train2.csv")
test_df = pd.read_csv("../../data/Processed/test2.csv")
print(train_df.shape)
print(test_df.shape)

(66125, 192)
(28340, 191)


In [3]:
y = train_df["y"].values
train_df = train_df.drop("y", axis=1)

In [4]:
# add disconnection info

train_df = prepro.add_disconnection(train_df)
test_df = prepro.add_disconnection(test_df)

In [5]:
# add info about numeric column
# 武器データの中のnum_colsを持つ特徴について、チーム内の統計量を計算、levelは結構効いてる

num_cols = ["level", "range", "rapid-", "atack", "ink-sub", "fav-main", "good-special"]

#train_df = prepro.flat(train_df, num_cols)
#test_df = prepro.flat(test_df, num_cols)

train_df = prepro.add_numeric_info(train_df, num_cols)
test_df = prepro.add_numeric_info(test_df, num_cols)

# 欠損値埋め先にやろう
欠損値がある列

rank, weapon, level ⇨　回線切れ or ナワバリ

weaponの欠損に並んでそれに関係ある列、A3, A4, B3, B4 ⇨ 回線切れ

level, weaponが消えていたら回線切れ

In [6]:
# rankの欠損値を埋める
train_df, test_df = prepro.fillna_rank(train_df, test_df)

#そのほかの欠損値を埋める
train_df, test_df = prepro.fillna(train_df, test_df)

In [7]:
print(train_df.isnull().sum().sum())
print(test_df.isnull().sum().sum())

0
0


# 特徴量エンジニアリング

In [8]:
# count weapon
train_df, test_df = prepro.count_weapon(train_df, test_df)

# count weapon by mode
train_df, test_df = prepro.count_weapon_by_mode(train_df, test_df)


# 水増し, A1も統計量に含めた特徴を作る場合は水ましより先にやる
train_df, y = prepro.mizumashi(train_df, y)

# is_nawabari
train_df, test_df = prepro.is_nawabari(train_df, test_df)

# match rank
train_df, test_df = prepro.match_rank(train_df, test_df)

In [9]:
# add team info

train_df,  test_df = prepro.addTeamInfo(train_df, test_df, cols=["special", "subweapon", "category1", "category2", "mainweapon"])

(160590, 295)
special-A
special-B
subweapon-A
subweapon-B
category1-A
category1-B
category2-A
category2-B
mainweapon-A
mainweapon-B
(160590, 489)
complete


In [10]:
# categorize team

categorize_col = ["category1", "category2", "subweapon", "special", "mainweapon"]
for col in categorize_col:
    print(col)
    train_df_, test_df_ = prepro.categorize_team(train_df, test_df, col)

category1
category2
subweapon
special
mainweapon


In [11]:
# make input

drop_cols = ["id", "lobby", "lobby-mode",  "period", "game-ver", "A1-weapon", "A2-weapon", "A3-weapon", "A4-weapon", \
              "B1-weapon", "B2-weapon", "B3-weapon", "B4-weapon"]

X, test_X = prepro.make_input(train_df, test_df, drop_cols, categorical_encode=False, scaler=False, verbose=False)

categorical_features_indices = np.where(X.dtypes == "object")[0]

complete


# 学習

In [13]:
# 全データを5つに分割
random.seed(random.randint(0, 10000))
SIZE = X.shape[0]
K = 5
cat_tgtenc_cols = ["mode", "stage", "team-category1-A", "team-category1-B"]
#cat_tgtenc_cols = ["mode", "stage"]

#folds = prepro.make_kfolds(SIZE, K)
folds = prepro.make_stratified_kfolds(X, y, K)

print(len(folds))
for i, fold in enumerate(folds):
    print("fold ", i+1, " size is ", len(fold))
    
    
if SIZE != len(set(sum(folds, []))):
    print("error is occuring in spliting")
else :
    print("successfully split")
    
params = {
    "loss_function" : "Logloss",
    "eval_metric" : "Logloss",
    "iterations":2000,
    "learning_rate" : 0.05,
    "use_best_model": True,
    "random_seed":1,

    

}

THRESHOLD = 0.50
models = []
cv_scores = []
temp = 0
train_pred = []
valid_ys = []

all_indices = sum(folds, [])
for i in range(K):
    valid_indices = folds[i]
    train_indices = list(set(all_indices) - set(valid_indices))
    # print("train ", len(train_indices), " , valid ", len(valid_indices))
    

    train_X = X.iloc[train_indices]
    try:
        train_y = y.iloc[train_indices]
    except:
        train_y = y[train_indices]
    valid_X = X.iloc[valid_indices]
    try:
        valid_y = y.iloc[valid_indices]
    except:
        valid_y = y[valid_indices]
    
    
    
    
    train_data = Pool(train_X, train_y, cat_features=categorical_features_indices)
    valid_data = Pool(valid_X, valid_y, cat_features=categorical_features_indices)
    
    model = CatBoostClassifier(**params)

    
    model.fit(
        train_data,
        eval_set=valid_data,
        early_stopping_rounds=10,
        verbose=10,
        use_best_model=True,
    )
    
    pred = model.predict(valid_X)
    # pred = np.where(pred < THRESHOLD, 0, 1)
    train_pred.append(pred)
    temp += np.sum(pred) 
    
    score = accuracy_score(pred, valid_y)
    
    models.append(model)
    cv_scores.append(score)
    
    
print("cv score : ", np.mean(cv_scores))    
print("cv ratio : ", temp / SIZE)

5
fold  1  size is  26450
fold  2  size is  26450
fold  3  size is  26450
fold  4  size is  26450
fold  5  size is  26450
successfully split
0:	learn: 0.6927700	test: 0.6928371	best: 0.6928371 (0)	total: 458ms	remaining: 15m 15s
10:	learn: 0.6901541	test: 0.6907902	best: 0.6907902 (10)	total: 4.19s	remaining: 12m 38s
20:	learn: 0.6881926	test: 0.6892783	best: 0.6892783 (20)	total: 7.91s	remaining: 12m 25s
30:	learn: 0.6869520	test: 0.6884270	best: 0.6884270 (30)	total: 11.6s	remaining: 12m 15s
40:	learn: 0.6860099	test: 0.6877634	best: 0.6877634 (40)	total: 15.5s	remaining: 12m 20s
50:	learn: 0.6850199	test: 0.6870361	best: 0.6870361 (50)	total: 19.4s	remaining: 12m 21s
60:	learn: 0.6841980	test: 0.6866032	best: 0.6866032 (60)	total: 23.4s	remaining: 12m 24s
70:	learn: 0.6833917	test: 0.6861501	best: 0.6861501 (70)	total: 27.2s	remaining: 12m 19s
80:	learn: 0.6827682	test: 0.6858206	best: 0.6858206 (80)	total: 31.4s	remaining: 12m 23s
90:	learn: 0.6820403	test: 0.6854540	best: 0.685454

70:	learn: 0.6833656	test: 0.6852318	best: 0.6852318 (70)	total: 28.9s	remaining: 13m 4s
80:	learn: 0.6827081	test: 0.6849625	best: 0.6849616 (79)	total: 33s	remaining: 13m 2s
90:	learn: 0.6820904	test: 0.6845986	best: 0.6845986 (90)	total: 37.3s	remaining: 13m 3s
100:	learn: 0.6815853	test: 0.6843791	best: 0.6843791 (100)	total: 41.6s	remaining: 13m 2s
110:	learn: 0.6810443	test: 0.6841788	best: 0.6841788 (110)	total: 46s	remaining: 13m 2s
120:	learn: 0.6804525	test: 0.6838673	best: 0.6838673 (120)	total: 50.5s	remaining: 13m 3s
130:	learn: 0.6799926	test: 0.6836386	best: 0.6836386 (130)	total: 54.4s	remaining: 12m 56s
140:	learn: 0.6795653	test: 0.6834621	best: 0.6834621 (140)	total: 58.5s	remaining: 12m 50s
150:	learn: 0.6789550	test: 0.6831909	best: 0.6831909 (150)	total: 1m 2s	remaining: 12m 50s
160:	learn: 0.6784971	test: 0.6829651	best: 0.6829651 (160)	total: 1m 7s	remaining: 12m 47s
170:	learn: 0.6780482	test: 0.6828175	best: 0.6828175 (170)	total: 1m 11s	remaining: 12m 41s
180

180:	learn: 0.6773021	test: 0.6832118	best: 0.6832089 (179)	total: 1m 20s	remaining: 13m 30s
190:	learn: 0.6768973	test: 0.6830942	best: 0.6830942 (190)	total: 1m 25s	remaining: 13m 28s
200:	learn: 0.6764122	test: 0.6829711	best: 0.6829711 (200)	total: 1m 29s	remaining: 13m 23s
210:	learn: 0.6759279	test: 0.6828183	best: 0.6828183 (210)	total: 1m 34s	remaining: 13m 19s
220:	learn: 0.6754345	test: 0.6826794	best: 0.6826794 (220)	total: 1m 38s	remaining: 13m 13s
230:	learn: 0.6749938	test: 0.6825749	best: 0.6825749 (230)	total: 1m 43s	remaining: 13m 10s
240:	learn: 0.6745001	test: 0.6825014	best: 0.6825014 (240)	total: 1m 47s	remaining: 13m 7s
250:	learn: 0.6738758	test: 0.6823561	best: 0.6823561 (250)	total: 1m 52s	remaining: 13m 4s
260:	learn: 0.6732260	test: 0.6822069	best: 0.6822069 (260)	total: 1m 56s	remaining: 12m 57s
270:	learn: 0.6726787	test: 0.6821932	best: 0.6821918 (266)	total: 2m 1s	remaining: 12m 53s
280:	learn: 0.6720800	test: 0.6820868	best: 0.6820868 (280)	total: 2m 5s	

In [None]:
preds = []
for i in range(K):
    model = models[i]
    pred = model.predict(test_X, prediction_type='Probability')[:,1]
    preds.append(pred)
    print(np.sum(pred) / pred.shape[0])
    
preds = np.array(preds)
preds = np.mean(preds, axis=0)
print(np.sum(preds) / preds.shape[0])


temp = pd.DataFrame({"pred":pred})
temp.hist(bins=20)

In [None]:
preds = np.where(preds < THRESHOLD, 0, 1)
print(np.sum(preds) / preds.shape[0])

submit_df = pd.DataFrame({'y': preds})
submit_df.index.name = 'id'
#submit_df.to_csv('../Submissions/submission_cat_3_{}.csv'.format(K))

# モデル解釈

In [None]:
importance = pd.DataFrame(models[0].get_feature_importance(), index=X.columns, columns=['importance']).sort_values('importance', ascending=False)
display(importance)

In [None]:
train_df["pred"] = 0
train_df["y"] = y.values
for i in range(K):
    train_df["pred"].iloc[folds[i]] = train_pred[i]

In [None]:
result_df = train_df[train_df["y"] != train_df["pred"]]

In [None]:
print(result_df[result_df["mode"] == "nawabari"].shape[0]/result_df.shape[0])
print(result_df[result_df["mode"] == "hoko"].shape[0]/result_df.shape[0])
print(result_df[result_df["mode"] == "asari"].shape[0]/result_df.shape[0])
print(result_df[result_df["mode"] == "area"].shape[0]/result_df.shape[0])
print(result_df[result_df["mode"] == "yagura"].shape[0]/result_df.shape[0])

In [None]:
result_df[result_df["mode"] == "nawabari"]