In [None]:
# Optional environment setup (left disabled): uncomment to install LightGBM / Optuna.
#!conda install -c conda-forge lightgbm
#!pip install optuna

In [None]:
# Mount Google Drive at /content/drive (Colab-only); the data paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import sys
import os
import re
import random

from time import time
from tqdm import tqdm

from contextlib import contextmanager
import lightgbm as lgb
#import optuna.integration.lightgbm as opt_lgb

In [None]:
# Raise pandas' display limits (columns / rows shown before truncation).
for _option, _limit in (('display.max_columns', 10000), ('display.max_rows', 200)):
    pd.set_option(_option, _limit)

In [None]:
def seed_everything(seed=1234):
    """Seed the random sources used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both Python's
    ``random`` module and NumPy's legacy global generator with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)

In [None]:
# Root directory of the competition files; change it to wherever your copy lives.
INPUT = "/content/drive/MyDrive/nishika/"

# Competition tables: training rows, test rows and the submission template.
train_df, test_df, submission_df = (
    pd.read_csv(os.path.join(INPUT, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Train/test tables produced by the SwinTransformerLarge384 + V2Large256 run (per the file name).
# Paths are built from INPUT so that relocating the data directory only requires editing INPUT.
df_1_tr = pd.read_csv(os.path.join(INPUT, "output", "train_SwinTransformerLarge384_V2Large256.csv"))
df_1_ts = pd.read_csv(os.path.join(INPUT, "output", "test_SwinTransformerLarge384_V2Large256.csv"))

In [None]:
# Train/test tables produced by the efb2 + rinnabase run (per the file name).
# Paths are built from INPUT so that relocating the data directory only requires editing INPUT.
df_2_tr = pd.read_csv(os.path.join(INPUT, "output", "train_efb2_rinnabase.csv"))
df_2_ts = pd.read_csv(os.path.join(INPUT, "output", "test_efb2_rinnabase.csv"))

In [None]:
# Keep only the columns from position 352 onward.
# NOTE(review): 352 is presumably the count of leading columns that are not wanted from this table — confirm.
# Re-running this cell strips a further 352 columns; reload the CSV before running it again.
df_2_tr = df_2_tr.iloc[:, 352:]

In [None]:
# Keep only the columns from position 352 onward (same cut as applied to the matching train table).
# Re-running this cell strips a further 352 columns; reload the CSV before running it again.
df_2_ts = df_2_ts.iloc[:, 352:]

In [None]:
# Train/test tables produced by the swinv2base256 + ginza510electra run (per the file name).
# Paths are built from INPUT so that relocating the data directory only requires editing INPUT.
df_3_tr = pd.read_csv(os.path.join(INPUT, "output", "train_swinv2base256_ginza510electra.csv"))
df_3_ts = pd.read_csv(os.path.join(INPUT, "output", "test_swinv2base256_ginza510electra.csv"))

In [None]:
# Train/test tables produced by the efv2b2 + resnet152 + jaginza run (per the file name).
# Paths are built from INPUT so that relocating the data directory only requires editing INPUT.
df_4_tr = pd.read_csv(os.path.join(INPUT, "output", "train_efv2b2_resnet152_jaginza.csv"))
df_4_ts = pd.read_csv(os.path.join(INPUT, "output", "test_efv2b2_resnet152_jaginza.csv"))

In [None]:
# Keep only the columns from position 865 onward.
# NOTE(review): 865 is presumably the count of leading columns that are not wanted from this table — confirm.
# Re-running this cell strips a further 865 columns; reload the CSV before running it again.
df_4_tr = df_4_tr.iloc[:, 865:]

In [None]:
# Keep only the columns from position 865 onward (same cut as applied to the matching train table).
# Re-running this cell strips a further 865 columns; reload the CSV before running it again.
df_4_ts = df_4_ts.iloc[:, 865:]

In [None]:
# Stage 1: place tables 1 and 2 side by side.
# Sources per the original note: swinL384, swinV2L256, Roberta-text, efb2.
df_tmp_tr_1 = pd.concat([df_1_tr, df_2_tr], axis="columns")
df_tmp_ts_1 = pd.concat([df_1_ts, df_2_ts], axis="columns")

In [None]:
# Stage 2: append table 3 (ginza510-electra-base-text, swinv2base256 per the original note).
# The commented-out lines are an alternative that skips table 2.
df_tmp_tr_2 = pd.concat([df_tmp_tr_1, df_3_tr], axis="columns")
#df_tmp_tr_2 = pd.concat([df_1_tr,df_3_tr],axis=1)
df_tmp_ts_2 = pd.concat([df_tmp_ts_1, df_3_ts], axis="columns")
#df_tmp_ts_2 = pd.concat([df_1_ts,df_3_ts],axis=1)

In [None]:
# Stage 3: append table 4 (efficientnetv2b2, resnet152, jaginza / ginza_electra_text per the original note).
df_tmp_tr_3 = pd.concat([df_tmp_tr_2, df_4_tr], axis="columns")
df_tmp_ts_3 = pd.concat([df_tmp_ts_2, df_4_ts], axis="columns")

In [None]:
# Strip the label and file-name columns from the stacked train table;
# both are re-attached from train_df in the next step.
train_tmp = df_tmp_tr_3.drop(columns=["is_laugh", "odai_photo_file_name"])

In [None]:
# Strip the file-name column from the stacked test table; it is re-attached from test_df in the next step.
test_tmp = df_tmp_ts_3.drop(columns=["odai_photo_file_name"])

In [None]:
# Put the stacked tables next to the original competition tables (rows are matched on the index).
train = pd.concat([train_tmp, train_df], axis="columns")
test = pd.concat([test_tmp, test_df], axis="columns")

In [None]:
# Add the character length of the "text" column as a feature, identically for train and test.
for _frame in (train, test):
    _frame["text_len"] = _frame["text"].str.len()

In [None]:
# Identifier / raw-text columns that must not be fed to the model.
non_feature_cols = ["odai_photo_file_name", "id", "text"]

# Target vector and feature matrices; the test table carries no "is_laugh" column to remove.
y = train["is_laugh"]
X = train.drop(columns=non_feature_cols + ["is_laugh"])
X_test = test.drop(columns=non_feature_cols)

In [None]:

# Remove feature columns whose values exactly repeat an earlier column.
# The duplicate mask is computed once, on the training matrix, and applied by
# position to BOTH matrices so train and test keep the same features in the
# same order. De-duplicating each frame independently can drop different
# columns from each one and misalign the model inputs.
# Selecting with a mask also preserves the column dtypes, which the
# transpose / transpose-back round trip does not.
assert X.shape[1] == X_test.shape[1], "train and test feature matrices differ in width"
keep_cols = ~X.T.duplicated().to_numpy()
X = X.loc[:, keep_cols]
X_test = X_test.loc[:, keep_cols]


# Model

In [None]:
# NOTE(review): KFold is already imported in the imports cell at the top; this re-import is redundant.
from sklearn.model_selection import KFold
# 5-fold splitter with shuffling; the fixed random_state keeps the fold assignment repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# LightGBM training parameters (consumed by lgb.train).
#
# Every setting is now given exactly once. The previous dict also contained
#   'colsample_bytree': .8   (an alias of feature_fraction) and
#   'reg_lambda': 5          (an alias of lambda_l2),
# which conflicted with the primary names below. Per the LightGBM parameter
# docs the primary name wins and the alias is ignored with a warning, so the
# values kept here are the ones that were actually in effect.
# 'importance_type' was dropped as well: it is an option of the scikit-learn
# wrapper / Booster.feature_importance, not a core training parameter.
params = {
    # Alias of num_iterations. NOTE: when present in params it takes
    # precedence over the num_boost_round argument given to lgb.train.
    "n_estimators": 20000,
    "objective": 'binary',
    "learning_rate": 0.001,
    # Regularisation
    'lambda_l1': 8.553323365580143,
    'lambda_l2': 0.00024389221237366783,
    # Tree shape
    'num_leaves': 240,
    "max_depth": 9,
    'min_child_samples': 20,
    # Column / row subsampling
    'feature_fraction': 0.6839999999999999,
    'bagging_fraction': 0.6378550454605715,
    'bagging_freq': 1,
    # Reproducibility and threading
    "random_state": 71,
    "n_jobs": -1,
    }

In [None]:
# Train one LightGBM model per CV fold and keep each model's test-set
# predictions. They are collected in a dict and turned into a DataFrame once,
# after the loop: this avoids re-copying the frame on every fold and gives
# every column a unique label (fold1 ... foldN) instead of N columns all
# labelled 0.
fold_preds = {}

for k, (tr_id, vl_id) in enumerate(kf.split(X, y)):
    print("="*50)
    print(f"               KFold{k+1}")
    print("="*50)

    # Positional train / validation split for this fold.
    X_train, X_val = X.iloc[tr_id, :], X.iloc[vl_id, :]
    y_train, y_val = y.iloc[tr_id], y.iloc[vl_id]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)

    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # were removed from lgb.train in LightGBM 4.0. On such versions pass
    # callbacks=[lgb.early_stopping(200), lgb.log_evaluation(1000)] instead.
    model_lgb = lgb.train(params=params,
                          train_set=lgb_train,
                          valid_sets=lgb_val,
                          num_boost_round=100000,
                          early_stopping_rounds=200,
                          verbose_eval=1000)

    # Predict the test set with the best iteration found by early stopping.
    fold_preds[f"fold{k+1}"] = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration)

# One row per test sample, one column per fold.
preds_lgb = pd.DataFrame(fold_preds)

               KFold1




Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's binary_logloss: 0.66267
[2000]	valid_0's binary_logloss: 0.6534
[3000]	valid_0's binary_logloss: 0.649697
[4000]	valid_0's binary_logloss: 0.647589
[5000]	valid_0's binary_logloss: 0.646081
[6000]	valid_0's binary_logloss: 0.645023
[7000]	valid_0's binary_logloss: 0.64428
[8000]	valid_0's binary_logloss: 0.643775
[9000]	valid_0's binary_logloss: 0.643349
[10000]	valid_0's binary_logloss: 0.643014
[11000]	valid_0's binary_logloss: 0.642874
Early stopping, best iteration is:
[10915]	valid_0's binary_logloss: 0.642865
               KFold2
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's binary_logloss: 0.662305
[2000]	valid_0's binary_logloss: 0.653207
[3000]	valid_0's binary_logloss: 0.649285
[4000]	valid_0's binary_logloss: 0.647021
[5000]	valid_0's binary_logloss: 0.645536
[6000]	valid_0's binary_logloss: 0.644488
[7000]	valid_0's binary_logloss: 0.643757
[8000]	valid_0's bin

In [None]:
# Inspect the per-fold test predictions (one column per fold).
preds_lgb

Unnamed: 0,0,0.1,0.2,0.3,0.4
0,0.461910,0.465743,0.487420,0.477820,0.507426
1,0.418687,0.361823,0.358026,0.432135,0.386081
2,0.351371,0.299335,0.275735,0.305091,0.265294
3,0.406725,0.357541,0.420258,0.404713,0.417875
4,0.792505,0.813922,0.846070,0.801515,0.823104
...,...,...,...,...,...
5995,0.336535,0.353546,0.319175,0.324341,0.284406
5996,0.756439,0.754828,0.763791,0.775818,0.763886
5997,0.527272,0.516126,0.489099,0.511575,0.468163
5998,0.608665,0.569634,0.599885,0.614017,0.582678


In [None]:
# Final prediction for each test row: the mean of the per-fold predictions.
label = preds_lgb.mean(axis="columns")
label

0       0.480064
1       0.391350
2       0.299365
3       0.401422
4       0.815423
          ...   
5995    0.323601
5996    0.762952
5997    0.502447
5998    0.594976
5999    0.655820
Length: 6000, dtype: float64

# Predict

In [None]:
# Write the averaged predictions into the submission template (pandas aligns them on the row index).
submission_df = submission_df.assign(is_laugh=label)

submission_df.head()

Unnamed: 0,id,is_laugh
0,rfdjcfsqq,0.480064
1,tsgqmfpef,0.39135
2,owjcthkz2,0.299365
3,rvgaocjyy,0.401422
4,uxtwu5i69,0.815423


In [None]:
# Save the submission next to the input data. The path is built from INPUT so that
# relocating the data directory only requires editing INPUT.
submission_df.to_csv(os.path.join(INPUT, "sub.csv"), index=False)