In [None]:
#!conda install -c conda-forge lightgbm
#!pip install optuna

In [None]:
# Mount Google Drive in the Colab runtime at /content/drive; the data paths
# used below are rooted under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import sys
import os
import re
import random

from time import time
from tqdm import tqdm

from contextlib import contextmanager
import lightgbm as lgb
#import optuna.integration.lightgbm as opt_lgb

In [None]:
# Raise pandas' display limits so wide / long frames are shown less truncated.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 200

In [None]:
def seed_everything(seed=1234):
    """Seed the global random number generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's legacy global RNG.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ.update(PYTHONHASHSEED=str(seed))
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything(42)

In [None]:
# Root directory of the competition data; change it to your own directory.
INPUT = "/content/drive/MyDrive/nishika/"

# Read the train table, the test table and the submission template from INPUT.
train_df, test_df, submission_df = (
    pd.read_csv(os.path.join(INPUT, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Train/test tables stored under <INPUT>/output for the
# SwinTransformerLarge384 / V2Large256 models (per the file names).
# The paths are derived from INPUT so that relocating the data directory only
# requires editing INPUT; the resulting path strings are unchanged.
df_1_tr = pd.read_csv(os.path.join(INPUT, "output", "train_SwinTransformerLarge384_V2Large256.csv"))
df_1_ts = pd.read_csv(os.path.join(INPUT, "output", "test_SwinTransformerLarge384_V2Large256.csv"))

In [None]:
# Train/test tables stored under <INPUT>/output for the efb2 / rinnabase
# models (per the file names).
# The paths are derived from INPUT so that relocating the data directory only
# requires editing INPUT; the resulting path strings are unchanged.
df_2_tr = pd.read_csv(os.path.join(INPUT, "output", "train_efb2_rinnabase.csv"))
df_2_ts = pd.read_csv(os.path.join(INPUT, "output", "test_efb2_rinnabase.csv"))

In [None]:
# Remove (by label) every column whose name appears among the first 352 columns.
# NOTE(review): presumably these leading columns are already supplied by
# another feature file - TODO confirm. Not idempotent: re-running this cell
# removes a further 352 columns.
df_2_tr = df_2_tr.drop(columns=df_2_tr.columns[:352])

In [None]:
# Same trim as for the train frame: remove (by label) every column whose name
# appears among the first 352 columns of the test frame.
# NOTE(review): not idempotent - re-running removes a further 352 columns.
df_2_ts = df_2_ts.drop(columns=df_2_ts.columns[:352])

In [None]:
# Train/test tables stored under <INPUT>/output for the swinv2base256 /
# ginza510electra models (per the file names).
# The paths are derived from INPUT so that relocating the data directory only
# requires editing INPUT; the resulting path strings are unchanged.
df_3_tr = pd.read_csv(os.path.join(INPUT, "output", "train_swinv2base256_ginza510electra.csv"))
df_3_ts = pd.read_csv(os.path.join(INPUT, "output", "test_swinv2base256_ginza510electra.csv"))

In [None]:
# Train/test tables stored under <INPUT>/output for the efv2b2 / resnet152 /
# jaginza models (per the file names).
# The paths are derived from INPUT so that relocating the data directory only
# requires editing INPUT; the resulting path strings are unchanged.
df_4_tr = pd.read_csv(os.path.join(INPUT, "output", "train_efv2b2_resnet152_jaginza.csv"))
df_4_ts = pd.read_csv(os.path.join(INPUT, "output", "test_efv2b2_resnet152_jaginza.csv"))

In [None]:
# Remove (by label) every column whose name appears among the first 865 columns.
# NOTE(review): presumably these leading columns are already supplied by
# another feature file - TODO confirm. Not idempotent: re-running this cell
# removes a further 865 columns.
df_4_tr = df_4_tr.drop(columns=df_4_tr.columns[:865])

In [None]:
# Same trim as for the train frame: remove (by label) every column whose name
# appears among the first 865 columns of the test frame.
# NOTE(review): not idempotent - re-running removes a further 865 columns.
df_4_ts = df_4_ts.drop(columns=df_4_ts.columns[:865])

In [None]:
# Stage 1: place frame 1 and the trimmed frame 2 side by side (column-wise).
# Sources noted by the author: swinL384, swinV2L256, Roberta-text, efb2
df_tmp_tr_1, df_tmp_ts_1 = (
    pd.concat(frames, axis="columns")
    for frames in ((df_1_tr, df_2_tr), (df_1_ts, df_2_ts))
)

In [None]:
# Stage 2: append frame 3 column-wise to the stage-1 result.
# Sources noted by the author: ginza510-electra-base-text, swinv2base256
# (An earlier variant, left disabled by the author, concatenated df_1 directly
# with df_3 and skipped df_2.)
df_tmp_tr_2, df_tmp_ts_2 = (
    pd.concat(frames, axis="columns")
    for frames in ((df_tmp_tr_1, df_3_tr), (df_tmp_ts_1, df_3_ts))
)

In [None]:
# Stage 3: append the trimmed frame 4 column-wise to the stage-2 result.
# Sources noted by the author: efficientnetv2b2, resnet152, jaginza (ginza_electra_text)
df_tmp_tr_3, df_tmp_ts_3 = (
    pd.concat(frames, axis="columns")
    for frames in ((df_tmp_tr_2, df_4_tr), (df_tmp_ts_2, df_4_ts))
)

In [None]:
# Strip the target and the image file name from the stacked train features;
# both are re-attached from train_df when the frames are joined.
train_tmp = df_tmp_tr_3.drop(columns=["is_laugh", "odai_photo_file_name"])

In [None]:
# Strip the image file name from the stacked test features; it is re-attached
# from test_df when the frames are joined.
test_tmp = df_tmp_ts_3.drop(columns=["odai_photo_file_name"])

In [None]:
# Attach the original competition columns to the stacked features, column-wise.
train, test = (
    pd.concat(frames, axis="columns")
    for frames in ((train_tmp, train_df), (test_tmp, test_df))
)

In [None]:
# Table with `id` and `is_laugh` columns read from <INPUT>/sub/sub70.csv
# (presumably an earlier submission's predictions - TODO confirm).
# The path is derived from INPUT so that relocating the data directory only
# requires editing INPUT; the resulting path string is unchanged.
label = pd.read_csv(os.path.join(INPUT, "sub", "sub70.csv"))

In [None]:
# Keep only the rows of `label` whose is_laugh value is below 0.15 or above 0.88.
# .copy() makes the result an independent frame, so the in-place relabelling in
# the next cell does not write into a slice of `label` (SettingWithCopyWarning).
append_label = label[(label["is_laugh"]<0.15)|(label["is_laugh"]>0.88)].copy()

In [None]:
# Binarise the retained scores: 1 where is_laugh >= 0.5, otherwise 0.
# The column stays floating point here; it is cast to int in a later cell.
append_label["is_laugh"] = (append_label["is_laugh"] >= 0.5).astype("float64")

In [None]:
# Inspect the rows that survived the threshold filter, with their 0/1 labels.
print(append_label)

             id  is_laugh
345   i65papeoe       0.0
1378  fkjzl3crg       1.0
1624  euaa8geth       1.0
2521  rautfoprm       1.0
2614  pash9zviv       0.0
4354  2yyerhxia       0.0
4750  kmuhck4jh       0.0
5423  dvbyjtzoh       0.0
5913  nohxspotb       1.0


In [None]:
# Materialise the row index of `test` as an "index" column; the later merge
# with the selected labels joins on that column.
# NOTE(review): not idempotent - re-running inserts a "level_0" column - and the
# "index" column stays in `test` afterwards.
test = test.reset_index(drop=False)

In [None]:
# Materialise the original row positions of the selected labels as an "index"
# column; the later merge with `test` joins on that column.
# NOTE(review): not idempotent - re-running inserts a "level_0" column.
append_label = append_label.reset_index(drop=False)

In [None]:
# Store the binarised label as an integer column.
append_label = append_label.astype({"is_laugh": int})

In [None]:
# Rename the label columns so they do not collide with the columns of `test`
# when the two frames are merged.
new_label = append_label.rename(columns=dict(id="ap_id", is_laugh="new_laugh"))

In [None]:
# Left-join the selected labels onto the test rows via the shared "index"
# column, so every labelled row picks up the columns of `test`.
new_X = new_label.merge(test, how="left", on="index")

In [None]:
# This cell used to rebuild `new_X` from `test` alone, which discarded the
# merge with the selected labels and removed the "text", "id" and
# "odai_photo_file_name" columns that the following cells still read or drop
# (KeyError on Restart & Run All). The merged frame is kept as is; this cell
# now only verifies that it has the expected shape.
assert {"new_laugh", "text"}.issubset(new_X.columns), \
    "new_X must be the merge of the selected labels with the test rows"

In [None]:
# Inspect the current state of new_X before the text-length feature is added.
print(new_X)

In [None]:
# Character count of the text, appended as the "text_len" column.
new_X = new_X.assign(text_len=new_X["text"].str.len())

In [None]:
# Restore the original column names of the label columns.
# NOTE(review): if the merged frame still carries the "id" column of `test`,
# this yields two columns labelled "id"; the next cell drops "id" by label.
new_X = new_X.rename(columns=dict(ap_id="id", new_laugh="is_laugh"))

In [None]:
# Remove the join key, the identifiers and the image file name so that only
# model inputs, the text and the label remain.
new_X = new_X.drop(columns=["index", "id", "odai_photo_file_name"])

In [None]:
# Character count of the text, appended as the "text_len" column to both frames.
train = train.assign(text_len=train["text"].str.len())
test = test.assign(text_len=test["text"].str.len())

In [None]:
# Target and feature matrix of the original training data.
y = train["is_laugh"]
X = train.drop(["id","odai_photo_file_name","is_laugh","text"], axis=1)

# Target and feature matrix of the extra rows taken from the test set.
new_y = new_X["is_laugh"]
new_X = new_X.drop(["is_laugh","text"],axis=1)

# Feature matrix to predict on.
X_test = test.drop(["id","odai_photo_file_name","text"], axis=1)
# `test` carries an "index" helper column (added by reset_index for the merge)
# that the training features do not have; remove it so that the columns used
# for prediction line up with the columns used for training.
if "index" not in X.columns:
    X_test = X_test.drop(columns=["index"], errors="ignore")

In [None]:
# Stack the extra labelled rows on top of the original training rows.
# The row index is not renumbered, so index labels can repeat; the training
# loop selects rows by position (iloc).
X = pd.concat((new_X, X), axis="index")
y = pd.concat((new_y, y), axis="index")

In [None]:

# Remove columns whose values are exact duplicates of an earlier column:
# transpose, drop duplicated rows, transpose back.
# NOTE(review): X and X_test are de-duplicated independently, so the two frames
# can end up with different column sets if a pair of columns is identical in
# one frame but not in the other - verify that the columns still match.
# NOTE(review): the transpose round-trip may change column dtypes when the
# frame mixes dtypes - confirm all columns are numeric at this point.
X = X.T.drop_duplicates().T
X_test = X_test.T.drop_duplicates().T 


In [None]:
from sklearn.model_selection import KFold

# Shuffled 5-fold splitter; the fixed random_state keeps the folds the same
# from run to run.
kf = KFold(5, shuffle=True, random_state=42)

In [None]:
# LightGBM training parameters passed to lgb.train().
#
# Removed from the previous revision because they were dead or conflicting:
#   * 'colsample_bytree' (.8) is an alias of 'feature_fraction'; LightGBM keeps
#     the value given under the primary name and ignores the alias with a warning.
#   * 'reg_lambda' (5) is an alias of 'lambda_l2'; same situation.
#   * 'importance_type' is an argument of the scikit-learn wrapper, not a
#     training parameter.
# The values that were effectively in use are unchanged.
params = {
    "objective": "binary",
    # Alias of num_iterations: caps the number of boosting rounds.
    "n_estimators": 20000,
    "learning_rate": 0.01,
    # Regularisation
    "lambda_l1": 8.553323365580143,
    "lambda_l2": 0.00024389221237366783,
    # Tree shape
    "num_leaves": 240,
    "max_depth": 9,
    "min_child_samples": 20,
    # Column / row subsampling
    "feature_fraction": 0.6839999999999999,
    "bagging_fraction": 0.6378550454605715,
    "bagging_freq": 1,
    # Reproducibility and threading
    "random_state": 71,
    "n_jobs": -1,
}

In [None]:
# Train one LightGBM model per fold and predict the test set with each of them.
# Predictions are collected per fold and assembled into `preds_lgb` once, at
# the end (one column per fold, one row per test sample).
fold_preds = {}

for k, (tr_id, vl_id) in enumerate(kf.split(X, y)):
    print("="*50)
    print(f"               KFold{k+1}")
    print("="*50)

    # Positional split of the features and the target for this fold.
    X_train, X_val = X.iloc[tr_id, :], X.iloc[vl_id, :]
    y_train, y_val = y.iloc[tr_id], y.iloc[vl_id]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)

    # Early stopping and periodic logging are configured through callbacks:
    # the `early_stopping_rounds` / `verbose_eval` keyword arguments were
    # removed from lgb.train() in LightGBM 4.0 (callbacks need LightGBM >= 3.3).
    # Note: an `n_estimators` entry in `params` takes precedence over
    # `num_boost_round`.
    model_lgb = lgb.train(params=params,
                          train_set=lgb_train,
                          valid_sets=[lgb_val],
                          num_boost_round=100000,
                          callbacks=[lgb.early_stopping(stopping_rounds=200),
                                     lgb.log_evaluation(period=1000)])

    # Predict the test set with the best iteration found by early stopping.
    fold_preds[f"fold_{k+1}"] = model_lgb.predict(
        X_test, num_iteration=model_lgb.best_iteration
    )

preds_lgb = pd.DataFrame(fold_preds)

               KFold1




Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's binary_logloss: 0.647683
Early stopping, best iteration is:
[982]	valid_0's binary_logloss: 0.647539
               KFold2
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's binary_logloss: 0.640475
Early stopping, best iteration is:
[1249]	valid_0's binary_logloss: 0.64042
               KFold3
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[776]	valid_0's binary_logloss: 0.642286
               KFold4
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's binary_logloss: 0.647472
Early stopping, best iteration is:
[836]	valid_0's binary_logloss: 0.647192
               KFold5
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's binary_logloss: 0.638598
Early stopping, best iteration is:
[1693]	valid_0's binary_logloss: 0.636884


In [None]:
# Display the test-set predictions of every fold (one column per fold).
preds_lgb

Unnamed: 0,0,0.1,0.2,0.3,0.4
0,0.357843,0.318573,0.387343,0.344656,0.267441
1,0.412803,0.392499,0.419610,0.435945,0.394618
2,0.458156,0.402896,0.456634,0.423544,0.420535
3,0.390378,0.381434,0.381609,0.411813,0.410107
4,0.418894,0.429795,0.362525,0.429921,0.371420
...,...,...,...,...,...
5995,0.425047,0.415782,0.447439,0.424421,0.403883
5996,0.450507,0.404503,0.438131,0.436234,0.342133
5997,0.394078,0.397645,0.410586,0.352476,0.348589
5998,0.455239,0.441332,0.484176,0.413892,0.402743


In [None]:
# Blend the folds: row-wise mean of the per-fold predictions.
# NOTE(review): this rebinds `label`, which earlier in the notebook held the
# table read from sub70.csv; re-running those earlier cells afterwards would
# operate on this Series instead.
label = preds_lgb.mean(axis="columns")
label

0       0.335171
1       0.411095
2       0.432353
3       0.395068
4       0.402511
          ...   
5995    0.423315
5996    0.414301
5997    0.380675
5998    0.439476
5999    0.402337
Length: 6000, dtype: float64

# Predict

In [None]:
# Write the blended predictions into the submission template.
# Assign by position after checking the length: an index-aligned assignment
# would silently produce NaN for every row whose index label does not match.
assert len(label) == len(submission_df), \
    "number of predictions must match the number of rows in the submission template"
submission_df["is_laugh"] = label.to_numpy()

submission_df.head()

Unnamed: 0,id,is_laugh
0,rfdjcfsqq,0.335171
1,tsgqmfpef,0.411095
2,owjcthkz2,0.432353
3,rvgaocjyy,0.395068
4,uxtwu5i69,0.402511


In [None]:
# Save the submission next to the input data. The path is derived from INPUT
# so that relocating the data directory only requires editing INPUT; the
# resulting path string is unchanged.
submission_df.to_csv(os.path.join(INPUT, 'sub.csv'), index=False)