In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem; later cells read their CSV
# inputs from paths below this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import sys
import os
import re
import random

from time import time
from tqdm import tqdm

from contextlib import contextmanager
import lightgbm as lgb

In [None]:
# Raise pandas' display limits so wide/long frames are not truncated as early.
for option_name, option_limit in (
    ('display.max_columns', 10000),
    ('display.max_rows', 200),
):
    pd.set_option(option_name, option_limit)

In [None]:
def seed_everything(seed=1234):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every source (default 1234).
    """
    # Environment values must be text, hence the str() conversion.
    # NOTE(review): hash randomisation is presumably fixed at interpreter
    # start-up, so this likely only influences child processes — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for reseed in (random.seed, np.random.seed):
        reseed(seed)


seed_everything(42)

In [None]:
# Root directory of the competition data; change it to the directory you use.
INPUT = "/content/drive/MyDrive/nishika/"

# Read the three competition tables from INPUT, in this order:
# training rows, test rows, and the sample submission template.
train_df, test_df, submission_df = (
    pd.read_csv(os.path.join(INPUT, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Precomputed feature tables (train / test) written to the "output" folder.
# The paths are built from INPUT so that changing INPUT relocates these reads
# too, instead of leaving them pinned to a hard-coded Drive location.
df_1_tr = pd.read_csv(os.path.join(INPUT, "output", "train_SwinTransformerLarge384_V2Large256.csv"))
df_1_ts = pd.read_csv(os.path.join(INPUT, "output", "test_SwinTransformerLarge384_V2Large256.csv"))

In [None]:
# Precomputed feature tables (train / test) written to the "output" folder.
# The paths are built from INPUT so that changing INPUT relocates these reads
# too, instead of leaving them pinned to a hard-coded Drive location.
df_2_tr = pd.read_csv(os.path.join(INPUT, "output", "train_efb2_rinnabase.csv"))
df_2_ts = pd.read_csv(os.path.join(INPUT, "output", "test_efb2_rinnabase.csv"))

In [None]:
# Discard the first 352 columns of the train table, keeping the rest.
# NOTE(review): the reason for 352 is not visible here — presumably those
# columns are already covered by another feature table; confirm.
# NOTE: the result is bound back to df_2_tr, so running this cell twice
# removes a further 352 columns.
df_2_tr_leading = df_2_tr.columns[:352]
df_2_tr = df_2_tr.drop(columns=df_2_tr_leading)

In [None]:
# Discard the first 352 columns of the test table, keeping the rest
# (the same cut that is applied to df_2_tr).
# NOTE: the result is bound back to df_2_ts, so running this cell twice
# removes a further 352 columns.
df_2_ts_leading = df_2_ts.columns[:352]
df_2_ts = df_2_ts.drop(columns=df_2_ts_leading)

In [None]:
# Precomputed feature tables (train / test) written to the "output" folder.
# The paths are built from INPUT so that changing INPUT relocates these reads
# too, instead of leaving them pinned to a hard-coded Drive location.
df_3_tr = pd.read_csv(os.path.join(INPUT, "output", "train_swinv2base256_ginza510electra.csv"))
df_3_ts = pd.read_csv(os.path.join(INPUT, "output", "test_swinv2base256_ginza510electra.csv"))

In [None]:
# Precomputed feature tables (train / test) written to the "output" folder.
# The paths are built from INPUT so that changing INPUT relocates these reads
# too, instead of leaving them pinned to a hard-coded Drive location.
df_4_tr = pd.read_csv(os.path.join(INPUT, "output", "train_efv2b2_resnet152_jaginza.csv"))
df_4_ts = pd.read_csv(os.path.join(INPUT, "output", "test_efv2b2_resnet152_jaginza.csv"))

In [None]:
# Discard the first 865 columns of the train table, keeping the rest.
# NOTE(review): the reason for 865 is not visible here — presumably those
# columns are already covered by another feature table; confirm.
# NOTE: the result is bound back to df_4_tr, so running this cell twice
# removes a further 865 columns.
df_4_tr_leading = df_4_tr.columns[:865]
df_4_tr = df_4_tr.drop(columns=df_4_tr_leading)

In [None]:
# Discard the first 865 columns of the test table, keeping the rest
# (the same cut that is applied to df_4_tr).
# NOTE: the result is bound back to df_4_ts, so running this cell twice
# removes a further 865 columns.
df_4_ts_leading = df_4_ts.columns[:865]
df_4_ts = df_4_ts.drop(columns=df_4_ts_leading)

In [None]:
# Author's label for this stage: swinL384, swinV2L256, Roberta-text, efb2.
# Place table 1 and table 2 side by side (rows are aligned on the index).
df_tmp_tr_1 = pd.concat(objs=(df_1_tr, df_2_tr), axis="columns")
df_tmp_ts_1 = pd.concat(objs=(df_1_ts, df_2_ts), axis="columns")

In [None]:
# Author's label for this stage: ginza510-electra-base-text, swinv2base256.
# Append table 3 to the right of the stage-1 result (aligned on the index).
df_tmp_tr_2 = pd.concat(objs=(df_tmp_tr_1, df_3_tr), axis="columns")
df_tmp_ts_2 = pd.concat(objs=(df_tmp_ts_1, df_3_ts), axis="columns")

In [None]:
# Author's label for this stage: efficientnetv2b2, resnet152,
# jaginza (ginza_electra_text).
# Append table 4 to the right of the stage-2 result (aligned on the index).
df_tmp_tr_3 = pd.concat(objs=(df_tmp_tr_2, df_4_tr), axis="columns")
df_tmp_ts_3 = pd.concat(objs=(df_tmp_ts_2, df_4_ts), axis="columns")

In [None]:
# Remove the target and the image file name from the combined train table;
# every column carrying one of these labels is dropped.
train_emb = df_tmp_tr_3.drop(columns=["is_laugh","odai_photo_file_name"])

In [None]:
# Remove the image file name from the combined test table; every column
# carrying this label is dropped.
test_emb = df_tmp_ts_3.drop(columns=["odai_photo_file_name"])

In [None]:
# Precomputed embedding tables (train / test) stored in the "embeded" folder.
# The paths are built from INPUT so that changing INPUT relocates these reads
# too, instead of leaving them pinned to a hard-coded Drive location.
df_5_tr = pd.read_csv(os.path.join(INPUT, "embeded", "embedding_train_rinna_cloob.csv"))
df_5_ts = pd.read_csv(os.path.join(INPUT, "embeded", "embedding_test_rinna_cloob.csv"))

In [None]:
# Preview the test embedding table without dumping the whole frame: with
# display.max_columns raised to 10000, print(df) emits every column in
# wrapped chunks. Show the dimensions and the first rows instead.
print(df_5_ts.shape)
df_5_ts.head()

      cloob-text0.1  cloob-text1.1  cloob-text2.1  cloob-text3.1  \
0          0.731016       0.591524       1.119229      -0.531281   
1          0.586286       0.510096       1.794982       1.084972   
2         -0.783172       2.442905       0.121229      -1.192899   
3         -0.888486       2.333045       0.877130       0.354861   
4         -0.780083       2.221503       1.497302       1.850881   
...             ...            ...            ...            ...   
5995      -0.537440      -0.608070      -0.245911       1.885879   
5996      -1.632397      -1.875804       2.901645       1.835690   
5997       0.717333       1.035625       1.122044       0.137252   
5998      -0.446729       1.990634      -1.432687       0.634622   
5999       0.817545      -1.570782       2.544377       0.903271   

      cloob-text4.1  cloob-text5.1  cloob-text6.1  cloob-text7.1  \
0          3.633608       0.102909      -2.184197       1.011907   
1          2.962992      -0.872106      -2.8705

In [None]:
# Strip the non-embedding columns from the embedding tables. The train table
# additionally carries the target column, which is removed as well.
df_5_meta_cols = ["Unnamed: 0","odai_photo_file_name","id", "text","img_path"]
df_5_tr = df_5_tr.drop(columns=df_5_meta_cols + ["is_laugh"])
df_5_ts = df_5_ts.drop(columns=df_5_meta_cols)

In [None]:
# Discard the first 1024 columns of each embedding table, keeping the rest.
# NOTE(review): the reason for 1024 is not visible here — confirm which
# embedding block these columns correspond to.
# NOTE: both results are bound back to the same names, so running this cell
# twice removes a further 1024 columns.
df_5_tr_leading = df_5_tr.columns[:1024]
df_5_tr = df_5_tr.drop(columns=df_5_tr_leading)
df_5_ts_leading = df_5_ts.columns[:1024]
df_5_ts = df_5_ts.drop(columns=df_5_ts_leading)

In [None]:
# Append the trimmed df_5 embedding columns to the right of the combined
# feature tables (rows are aligned on the index).
train_tmp = pd.concat(objs=(train_emb, df_5_tr), axis="columns")
test_tmp = pd.concat(objs=(test_emb, df_5_ts), axis="columns")

In [None]:
# Preview the combined train features without dumping the whole frame: with
# display.max_columns raised to 10000, print(df) emits every column in
# wrapped chunks. Show the dimensions and the first rows instead.
print(train_tmp.shape)
train_tmp.head()

In [None]:
# Attach the original competition columns to the right of the feature
# tables (rows are aligned on the index).
train = pd.concat(objs=(train_tmp, train_df), axis="columns")
test = pd.concat(objs=(test_tmp, test_df), axis="columns")

In [None]:
# Split the labelled rows into a training part and a validation part,
# stratified on the target column.
# The training part is bound to `train_part` instead of back to `train`:
# rebinding `train` would make this cell split an already-split frame (and
# silently shrink the training data) every time it is re-run.
NON_FEATURE_COLS = ["odai_photo_file_name","is_laugh","id","text"]

train_part, valid = train_test_split(train, test_size=0.2, random_state=42, stratify=train["is_laugh"])

train_y = train_part["is_laugh"]
train_x = train_part.drop(NON_FEATURE_COLS, axis=1)

valid_y = valid["is_laugh"]
valid_x = valid.drop(NON_FEATURE_COLS, axis=1)

# The test frame is not indexed for a target here; "img_path" is dropped
# from it in addition to the identifier/text columns.
test_x = test.drop(["odai_photo_file_name","id","text","img_path"], axis=1)

In [None]:
# Report (rows, columns) of the training and validation feature matrices.
for feature_matrix in (train_x, valid_x):
    print(feature_matrix.shape)

In [None]:
# Remove feature columns whose values duplicate an earlier column.
# Which columns count as duplicates is decided on the training features only
# and the same positional selection is then applied to every split. Detecting
# duplicates separately per split can remove different columns from each one
# (e.g. two columns that only coincide on the validation rows), leaving the
# model with mismatched feature sets.
if not (train_x.shape[1] == valid_x.shape[1] == test_x.shape[1]):
    raise ValueError(
        "train/valid/test feature matrices must have the same number of "
        "columns before duplicate removal: "
        f"{train_x.shape[1]}, {valid_x.shape[1]}, {test_x.shape[1]}"
    )

# True for every column that repeats an earlier column (first one is kept).
duplicate_column_mask = train_x.T.duplicated().to_numpy()
keep_column_mask = ~duplicate_column_mask

# Boolean column selection keeps the original dtypes, unlike a round-trip
# through two transposes.
train_x = train_x.loc[:, keep_column_mask]
valid_x = valid_x.loc[:, keep_column_mask]
test_x = test_x.loc[:, keep_column_mask]

# Model

In [None]:
# Boosting budget and logging cadence.
# The original params carried "n_estimators": 20000 alongside
# num_boost_round=10000; LightGBM lets the params alias win, so 20000 was the
# effective limit. It is stated once here to remove the ambiguity.
NUM_BOOST_ROUND = 20000
EARLY_STOPPING_ROUNDS = 100
LOG_PERIOD = 100

# Core LightGBM training parameters.
# "importance_type" is not a training parameter of lgb.train() (it only
# triggered an unknown-parameter warning); the importance plot passes its own
# importance_type explicitly.
lgbm_params = {
    "objective": 'binary',
    "learning_rate": 0.05,
    "num_leaves": 32,
    "random_state": 71,
    "n_jobs": -1,
    'colsample_bytree': .8,
    "reg_lambda": 5,
    "max_depth":5,
    }

lgtrain = lgb.Dataset(train_x, train_y)
# reference= makes the validation set reuse the training set's feature binning.
lgvalid = lgb.Dataset(valid_x, valid_y, reference=lgtrain)

# Early stopping and periodic logging are passed as callbacks: the
# early_stopping_rounds / verbose_eval keyword arguments are no longer
# accepted by lgb.train() in LightGBM 4.x.
lgb_clf = lgb.train(
    lgbm_params,
    lgtrain,
    num_boost_round=NUM_BOOST_ROUND,
    valid_sets=[lgtrain, lgvalid],
    valid_names=['train','valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS),
        lgb.log_evaluation(period=LOG_PERIOD),
    ],
)

In [None]:
# Visualise feature importance: the 50 highest-ranked features, by gain.
importance_ax = lgb.plot_importance(
    lgb_clf,
    importance_type='gain',
    max_num_features=50,
    figsize=(12,8),
)
plt.tight_layout()
plt.show()

In [None]:
# The evaluation metric is log loss, but accuracy is checked here as well.

# Model outputs on the validation features at the best (early-stopped) iteration.
val_pred = lgb_clf.predict(valid_x, num_iteration=lgb_clf.best_iteration)
# Turn the same outputs into hard 0/1 class labels. Reusing val_pred avoids a
# second predict() call that did not pass the same num_iteration.
val_pred_max = np.round(val_pred).astype(int)
# Fraction of validation rows whose rounded prediction matches the label.
accuracy = (valid_y == val_pred_max).mean()
print(accuracy)

In [None]:
# Confusion matrix of the validation labels against the rounded predictions.
_conf_options = {"normalize": None,}
_plot_options = {
        "cmap": "Blues",
        "annot": True,
        # seaborn's default annotation format ('.2g') prints counts of 100 or
        # more in scientific notation. Use an integer format for raw counts
        # and a fixed-point format if normalisation is switched on.
        "fmt": "d" if _conf_options["normalize"] is None else ".2f",
    }

conf = confusion_matrix(y_true=valid_y,
                        y_pred=val_pred_max,
                        **_conf_options)

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(conf, ax=ax, **_plot_options)
# Rows are the true labels, columns the predicted labels.
ax.set_ylabel("Label")
ax.set_xlabel("Predict")
plt.show()

# Predict

In [None]:
# Score the test features with the model at its best recorded iteration.
best_round = lgb_clf.best_iteration
test_pred = lgb_clf.predict(test_x, num_iteration=best_round)

In [None]:
# Write the test predictions into the submission template's target column
# and show the first rows as a sanity check.
# NOTE(review): assumes the test rows are in the same order as the sample
# submission — confirm.
submission_df = submission_df.assign(is_laugh=test_pred)
submission_df.head()

In [None]:
# Save the submission to the current working directory, without the index.
submission_df.to_csv(path_or_buf='sub.csv', index=False)

実際に提出して、スコアを確認してみましょう。  精度向上に向けて様々なアイディアがあるかと思いますので、ぜひいろいろとトライしていただければと思います！

- 異なる学習済みモデルでの特徴量化
- 画像の状況とボケての文章との乖離具合を測定する
- 説明文口調とセリフ口調の分類をしてみる。
- 画像に何が写っているかを検出し、特徴量に加えてみる（人が写っている。動物が写っている）


https://lab.mo-t.com/blog/kaggle-shopee
前処理については今回は気にしない。最終的にLightGBMにぶっこむ特徴量を増やして選別する方針でいく。
エンベッディングした結果は保存する。
最初は画像は画像、テキストはテキストでエンベッディングする。
画像のArcFaceちょっとめんどい、実装できそうならやる
knn(k=50)を使って、画像、テキストの分類自体はする。
https://www.kaggle.com/code/jyotmakadiya/shopee-groupkfolds-effnetb1-and-tfidf-or-bert

基本はエンベッディングしたものを使う。
プラスで、特徴量を足してみる。
乖離具合ってどうやって測定するのか
・長文？なんかキーになる話がある？画像と関係？わからん
・ボケのデザインパターン/ジョナサン・ハイト
・既知未知判定ってどうやってやるんだ・・・
・ポジネガ距離はおもろい。
あえて正しいこととは逆なことをして、許されるライン的な話。
→単純にえぐい言葉やえぐさを分類？なんかいいのないかな。
説明文とセリフ文→なんかいい楽な分類ないかな、なかったら頑張る。
画像に何が写っているか→うーん、手動しかないかも。