In [1]:
#!conda install -c conda-forge lightgbm
#!pip install optuna

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import sys
import os
import re
import random

from time import time
from tqdm import tqdm

from contextlib import contextmanager
import lightgbm as lgb
#import optuna.integration.lightgbm as opt_lgb

In [4]:
#上限表示数を拡張
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 200)

In [5]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

In [6]:
INPUT = "/content/drive/MyDrive/nishika/" # 所望のディレクトリに変更してください。
train_df = pd.read_csv(os.path.join(INPUT, "train.csv"))
test_df = pd.read_csv(os.path.join(INPUT, "test.csv"))
submission_df = pd.read_csv(os.path.join(INPUT, "sample_submission.csv"))

In [7]:
df_1_tr = pd.read_csv("/content/drive/MyDrive/nishika/output/train_SwinTransformerLarge384_V2Large256.csv")
df_1_ts = pd.read_csv("/content/drive/MyDrive/nishika/output/test_SwinTransformerLarge384_V2Large256.csv")

In [8]:
df_2_tr = pd.read_csv("/content/drive/MyDrive/nishika/output/train_efb2_rinnabase.csv")
df_2_ts = pd.read_csv("/content/drive/MyDrive/nishika/output/test_efb2_rinnabase.csv")

In [9]:
df_2_tr = df_2_tr.drop(df_2_tr.columns[0:352], axis=1)

In [10]:
df_2_ts = df_2_ts.drop(df_2_ts.columns[0:352], axis=1)

In [11]:
df_3_tr = pd.read_csv("/content/drive/MyDrive/nishika/output/train_swinv2base256_ginza510electra.csv")
df_3_ts = pd.read_csv("/content/drive/MyDrive/nishika/output/test_swinv2base256_ginza510electra.csv")

In [12]:
df_4_tr = pd.read_csv("/content/drive/MyDrive/nishika/output/train_efv2b2_resnet152_jaginza.csv")
df_4_ts = pd.read_csv("/content/drive/MyDrive/nishika/output/test_efv2b2_resnet152_jaginza.csv")

In [13]:
df_4_tr = df_4_tr.drop(df_4_tr.columns[0:865], axis=1)

In [14]:
df_4_ts = df_4_ts.drop(df_4_ts.columns[0:865], axis=1)

In [15]:
#swinL384,swinV2L256,Roberta-text,efb2
df_tmp_tr_1 = pd.concat([df_1_tr,df_2_tr],axis=1)
df_tmp_ts_1 = pd.concat([df_1_ts,df_2_ts],axis=1)

In [16]:
#ginza510-electra-base-text,swinv2base256
df_tmp_tr_2 = pd.concat([df_tmp_tr_1,df_3_tr],axis=1)
#df_tmp_tr_2 = pd.concat([df_1_tr,df_3_tr],axis=1)
df_tmp_ts_2 = pd.concat([df_tmp_ts_1,df_3_ts],axis=1)
#df_tmp_ts_2 = pd.concat([df_1_ts,df_3_ts],axis=1)

In [17]:
#efficientnetv2b2,resnet152,jaginza(ginza_electra_text)
df_tmp_tr_3 = pd.concat([df_tmp_tr_2,df_4_tr],axis=1)
df_tmp_ts_3 = pd.concat([df_tmp_ts_2,df_4_ts],axis=1)

In [18]:
train_tmp = df_tmp_tr_3.drop(["is_laugh","odai_photo_file_name"], axis=1)

In [19]:
test_tmp = df_tmp_ts_3.drop(["odai_photo_file_name"], axis=1)

In [20]:
train = pd.concat([train_tmp,train_df],axis=1)
test = pd.concat([test_tmp,test_df],axis=1)

In [21]:
train["text_len"] = train["text"].str.len()
test["text_len"] = test["text"].str.len()

In [22]:
y = train["is_laugh"]
X = train.drop(["odai_photo_file_name","is_laugh","id","text"], axis=1)

X_test = test.drop(["odai_photo_file_name","id","text"], axis=1)

In [23]:

X = X.T.drop_duplicates().T
X_test = X_test.T.drop_duplicates().T 


# Model

In [24]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [25]:
params = {  
    "n_estimators": 20000,
    "objective": 'binary',
    "learning_rate": 0.01,
    'lambda_l1': 8.553323365580143, 
    'lambda_l2': 0.00024389221237366783, 
    'num_leaves': 240, 
    'feature_fraction': 0.6839999999999999, 
    'bagging_fraction': 0.6378550454605715, 
    'bagging_freq': 1,
    "random_state": 71,
    "n_jobs": -1,
    "importance_type": "gain",
    'colsample_bytree': .8,
    "reg_lambda": 5,
    "max_depth":9,
    'min_child_samples': 20
    }

In [26]:
# 予測値を格納するdf
# df to store the predicted value
preds_lgb = pd.DataFrame()

for k, (tr_id, vl_id) in enumerate(kf.split(X, y)):
    print("="*50)
    print(f"               KFold{k+1}")
    print("="*50)
    
    X_train, X_val = X.iloc[tr_id, :], X.iloc[vl_id, :]
    y_train, y_val = y.iloc[tr_id], y.iloc[vl_id]
    
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)
    
    model_lgb = lgb.train(params=params,
                          train_set=lgb_train,
                          valid_sets=lgb_val,
                          num_boost_round=100000,
                          early_stopping_rounds=200,
                          verbose_eval=1000)
    
    pred_lgb = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration)
    pred_lgb = pd.DataFrame(pred_lgb)
    
    # 予測値を横に連結していく
    # Concatenate the predictions horizontally
    preds_lgb = pd.concat([preds_lgb, pred_lgb], axis=1)

               KFold1




Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's binary_logloss: 0.642963
Early stopping, best iteration is:
[1128]	valid_0's binary_logloss: 0.642689
               KFold2
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's binary_logloss: 0.642951
Early stopping, best iteration is:
[1378]	valid_0's binary_logloss: 0.642057
               KFold3
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's binary_logloss: 0.63749
Early stopping, best iteration is:
[1065]	valid_0's binary_logloss: 0.637069
               KFold4
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's binary_logloss: 0.643735
Early stopping, best iteration is:
[1466]	valid_0's binary_logloss: 0.64297
               KFold5
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's binary_logloss: 0.635652
Early stopping, best iteration is:
[1361]	valid_0's binary_logloss: 0.633888
    

In [27]:
preds_lgb

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8
0,0.423566,0.495273,0.568438,0.414313,0.463123,0.573604,0.427315,0.532491,0.503579
1,0.336148,0.378194,0.405104,0.327592,0.333974,0.408124,0.459992,0.403829,0.363239
2,0.331823,0.329790,0.348196,0.281929,0.280603,0.328949,0.320767,0.288006,0.224300
3,0.444603,0.407810,0.395718,0.377376,0.405860,0.372486,0.414796,0.359601,0.438708
4,0.832556,0.786828,0.797869,0.867819,0.800257,0.856682,0.783809,0.813682,0.825206
...,...,...,...,...,...,...,...,...,...
5995,0.339020,0.303307,0.294857,0.297898,0.317812,0.300478,0.391976,0.357798,0.250543
5996,0.771582,0.777806,0.788575,0.782329,0.758853,0.834688,0.741459,0.758652,0.762338
5997,0.526680,0.556843,0.482768,0.507028,0.508497,0.504150,0.509365,0.507823,0.457717
5998,0.609568,0.579668,0.588485,0.605833,0.555977,0.661311,0.579440,0.583233,0.616364


In [28]:
# 平均を計算して、テストデータに対する疑似ラベルとする
# Calculate the mean and use it as a pseudo labels for the test data

label = preds_lgb.mean(axis=1)
label

0       0.489078
1       0.379577
2       0.303818
3       0.401884
4       0.818301
          ...   
5995    0.317077
5996    0.775142
5997    0.506763
5998    0.597764
5999    0.648832
Length: 6000, dtype: float64

In [29]:
# もともとの学習データX, yにテストデータと疑似ラベルを縦に連結する。
# これを新たな学習データとする
# Concatenate the test data and pseudo labels to the original training data X, y.
# Make this the new training data.

X = pd.concat([X,X_test], axis=0).reset_index(drop=True)
y = pd.concat([y, label], axis=0).reset_index(drop=True)

print("X.shape: ", X.shape)
print("y.shape: ", y.shape)

X.shape:  (30962, 2861)
y.shape:  (30962,)


In [30]:
# 最終予測値を格納するdf
# df to store the final prediction
preds_lgb = pd.DataFrame()

for k, (tr_id, vl_id) in enumerate(kf.split(X, y)):
    print("="*50)
    print(f"               KFold{k+1}")
    print("="*50)
    
    X_train, X_val = X.iloc[tr_id, :], X.iloc[vl_id, :]
    y_train, y_val = y.iloc[tr_id], y.iloc[vl_id]
    
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)
    
    model_lgb = lgb.train(params=params,
                          train_set=lgb_train,
                          valid_sets=lgb_val,
                          num_boost_round=100000,
                          early_stopping_rounds=200,
                          verbose_eval=1000)
    
    pred_lgb = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration)
    pred_lgb = pd.DataFrame(pred_lgb)
    preds_lgb = pd.concat([preds_lgb, pred_lgb], axis=1)

               KFold1




Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's binary_logloss: 0.635025
Early stopping, best iteration is:
[807]	valid_0's binary_logloss: 0.634738
               KFold2
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[634]	valid_0's binary_logloss: 0.648207
               KFold3
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's binary_logloss: 0.646822
Early stopping, best iteration is:
[901]	valid_0's binary_logloss: 0.64624
               KFold4
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's binary_logloss: 0.635261
Early stopping, best iteration is:
[1134]	valid_0's binary_logloss: 0.634124
               KFold5
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[775]	valid_0's binary_logloss: 0.637234
               KFold6
Training until validation scores don't improve for 200 rounds.
[1000]

# Predict

In [31]:
#　予測値の平均を計算して、最終的な予測値とする
# Calculate the average of the predictions to get the final prediction.
pred = preds_lgb.mean(axis=1)
submission_df["is_laugh"] = pred

submission_df.head()

Unnamed: 0,id,is_laugh
0,rfdjcfsqq,0.739519
1,tsgqmfpef,0.703844
2,owjcthkz2,0.697094
3,rvgaocjyy,0.739904
4,uxtwu5i69,0.887635


In [32]:
submission_df.to_csv(('/content/drive/MyDrive/nishika/sub.csv'), index=False)