In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

# One seed shared by NumPy's and Python's global RNGs so reruns are repeatable.
SEED = 1234
np.random.seed(SEED)
random.seed(SEED)

In [70]:
from sklearn.model_selection import train_test_split

# Read the labelled data and hold out 20% of its rows for validation.
train_data, val_data = train_test_split(
    pd.read_csv('data/train.csv', index_col=0),
    test_size=0.2,
    shuffle=True,
    random_state=1234,
)
# The test file is loaded as-is and used directly as the feature frame.
test_data = pd.read_csv('data/test.csv', index_col=0)

# Separate the target column `blueWins` from the features of each split.
y_train, X_train = train_data['blueWins'], train_data.drop(columns=['blueWins'])
y_val, X_val = val_data['blueWins'], val_data.drop(columns=['blueWins'])

X_test = test_data

In [71]:
from sklearn import preprocessing

# Fit the scaler on the training features only, then apply the same
# transformation to every split.
ss = preprocessing.StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    ss.transform(features) for features in (X_train, X_val, X_test)
)

## lgb

In [73]:
import optuna.integration.lightgbm as lgb
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.model_selection import train_test_split
import lightgbm as lgbn
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Wrap the training split in a LightGBM Dataset for the tuner.
lgb_train = lgb.Dataset(X_train, y_train)

# Base parameters handed to the tuner for the hyperparameter search.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'random_seed': 1234,
    'n_jobs': -1,
    'force_row_wise': True,
    # Without this, the warning
    # `No further splits with positive gain, best gain: -inf` is displayed.
    'verbose': -1,
}

# Shuffled 5-fold cross-validation used to score each candidate parameter set.
kf = KFold(n_splits=5, shuffle=True, random_state=1234)
tuner = lgb.LightGBMTunerCV(
    params,
    lgb_train,
    folds=kf,
    callbacks=[
        lgb.early_stopping(stopping_rounds=100, verbose=False),
        lgb.log_evaluation(False),
    ],
)

# Run the search, then keep the best parameter set it found.
tuner.run()
best_params = tuner.best_params

[32m[I 2023-01-26 14:54:04,190][0m A new study created in memory with name: no-name-6596fded-e5c6-4f0d-93e8-8676c8d30ae0[0m
feature_fraction, val_score: 0.871127:  14%|#4        | 1/7 [00:00<00:04,  1.44it/s][32m[I 2023-01-26 14:54:04,889][0m Trial 0 finished with value: 0.8711271852988014 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 0.8711271852988014.[0m
feature_fraction, val_score: 0.871127:  29%|##8       | 2/7 [00:01<00:03,  1.33it/s][32m[I 2023-01-26 14:54:05,682][0m Trial 1 finished with value: 0.8707726425746829 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 0.8711271852988014.[0m
feature_fraction, val_score: 0.871127:  43%|####2     | 3/7 [00:02<00:02,  1.36it/s][32m[I 2023-01-26 14:54:06,395][0m Trial 2 finished with value: 0.8711271852988014 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 0.8711271852988014.[0m
feature_fraction, val_score: 0.871127:  57%|#####7    | 4/7 [00:02<00:02,  1.44it

In [76]:
# Retrain with the tuned parameters, monitoring both the training and
# validation splits.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_val, y_val)

model = lgbn.train(
    best_params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(False)],
)

# Raw model outputs for the validation rows (thresholded in a later cell).
val_pred = model.predict(X_val)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[370]	training's auc: 0.896335	valid_1's auc: 0.871105


In [77]:
from sklearn.metrics import accuracy_score

def binary(pred_y, threshold=0.5):
    """Convert predicted scores into hard 0/1 class labels.

    Parameters
    ----------
    pred_y : array-like of float
        Predicted scores, one per sample.
    threshold : float, default 0.5
        Scores strictly greater than this value become 1; all others become 0.

    Returns
    -------
    numpy.ndarray
        A new integer array of 0/1 labels with the same shape as ``pred_y``.
        The input is not modified (the earlier element-wise loop overwrote
        the caller's array in place).
    """
    scores = np.asarray(pred_y)
    # Vectorised comparison; `astype(int)` maps False/True to 0/1.
    return (scores > threshold).astype(int)

# Threshold the validation scores into 0/1 labels, then show the accuracy
# against the true labels as the cell output.
val_pred = binary(val_pred)
accuracy_score(y_val,val_pred)

0.785625

In [22]:
# Predict scores for the test rows and write 0/1 labels to the submission file.
pred_y = model.predict(X_test)
# Threshold at 0.5, the same cut-off used for validation. Casting the raw
# scores with `int` truncates toward zero, so every score below 1.0 would be
# written as class 0.
test_data['y'] = (pred_y > 0.5).astype(int)
test_data['y'].to_csv('data/submit.csv', header=False)

## Logistic regression and SVM

In [79]:
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression that chooses C from three candidates via 4-fold CV.
clf = LogisticRegressionCV(
    Cs=[0.1, 1, 10],
    cv=4,
    random_state=1234,
)
clf.fit(X_train, y_train)

LogisticRegressionCV(Cs=[0.1, 1, 10], cv=4, random_state=1234)

In [80]:
# Predict class labels for the validation split with the current `clf`.
pred_val = clf.predict(X_val)

In [83]:
# Validation accuracy of `pred_val` (displayed as the cell output).
accuracy_score(y_val,pred_val)

0.713125

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# def param():
#     ret = {
#         'C':[1, 10, 100],
#         'kernel':['rbf', 'linear', 'poly'],
#         #'degree':np.arange(1, 6, 1),
#         #'gamma':np.linspace(0.01, 1.0, 50)
#         }
#     return ret

# clf = GridSearchCV(
#     SVC(), # 識別器
#     param(), # 最適化したいパラメータセット 
#     cv=4, # 交差検定の回数
#     verbose=2,
#     n_jobs=-1) # モデルの評価関数の指定
# clf.fit(X_train, y_train)

# Baseline RBF-kernel SVC with C=1, scored on the validation split.
clf = SVC(kernel='rbf', C=1).fit(X_train, y_train)
pred = clf.predict(X_val)
svc_val_accuracy = accuracy_score(y_val, pred)
print(svc_val_accuracy)



0.74125


In [None]:
# `clf` only exposes `best_estimator_` when it is a fitted search object
# (e.g. the commented-out GridSearchCV). With the plain SVC that is fitted
# in this notebook the attribute does not exist, so fall back to `clf`
# itself instead of raising AttributeError.
best = getattr(clf, 'best_estimator_', clf)
pred = best.predict(X_test)

In [54]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import optuna

def objective(trial):
    """Optuna objective: validation error rate of an SVC with sampled settings.

    Samples ``C`` and ``gamma`` as floats in [1e-5, 1] and a kernel from
    ``linear`` / ``rbf`` / ``sigmoid``, fits the SVC on the training split,
    and returns ``1.0 - accuracy`` measured on the validation split, so a
    smaller value is better.
    """
    # Keyword arguments are evaluated left to right, so the parameters are
    # requested from the trial in the order C, gamma, kernel.
    svc = SVC(
        C=trial.suggest_float('C', 1e-5, 1),
        gamma=trial.suggest_float('gamma', 1e-5, 1),
        kernel=trial.suggest_categorical('kernel', ['linear', 'rbf', 'sigmoid']),
    )
    svc.fit(X_train, y_train)

    val_accuracy = accuracy_score(y_val, svc.predict(X_val))
    return 1.0 - val_accuracy

In [None]:
# Seed the sampler with the notebook-wide SEED so the sequence of suggested
# trials is repeatable across kernel restarts. TPE is Optuna's default
# sampler, so only the seeding changes.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=100)