In [4]:
import os
from glob import glob
import re
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

# Bert
from sentence_transformers import SentenceTransformer

# Catboost for regression
import catboost

### Dataset

In [None]:
df = pd.read_csv('comedy-news-tg-dataset/marked/full_dataset.tsv', sep='\t')

In [86]:
# Categorical target
df['good_punch'] = (df.mark >= 5).astype(int)
df['good_punch'].value_counts()

0    2989
1     643
Name: good_punch, dtype: int64

In [87]:
# Is_valid is_test
df['is_valid'] = 0
df.loc[df.setup == 'Balenciaga выпустил кроссовки на каблуках почти за 100 тысяч рублей', 'is_valid'] = 1
df['is_test'] = 0
df.loc[df.setup == 'Клип на «Baby Shark» стал первым роликом на ютьюбе, набравшим 10 млрд просмотров', 'is_test'] = 1

In [88]:
# Length
df['punch_len'] = df.punch.apply(len)
df['setup_len'] = df.setup.apply(len)

### Generate embeddings

In [3]:
model = SentenceTransformer('models/DeepPavlov_rubert-base-cased-sentence/')



In [4]:
# Embeddings for punches
df['punch_embedding'] = df['punch'].progress_apply(model.encode)

100%|██████████| 3632/3632 [04:39<00:00, 12.98it/s]


In [5]:
# Embeddings for setups
setup_embed_df = pd.DataFrame(df.setup.unique(), columns=['setup'])
setup_embed_df['setup_embedding'] = setup_embed_df['setup'].progress_apply(model.encode)

100%|██████████| 18/18 [00:01<00:00, 12.26it/s]


In [6]:
df = df.merge(setup_embed_df)

In [89]:
# Split embeddings to columns
punch_embeddings = pd.DataFrame(df['punch_embedding'].tolist())
punch_embeddings.columns = [f'punch_emb_{c}' for c in punch_embeddings.columns]

setup_embeddings = pd.DataFrame(df['setup_embedding'].tolist())
setup_embeddings.columns = [f'setup_emb_{c}' for c in setup_embeddings.columns]

embeddings_df = pd.concat([punch_embeddings, setup_embeddings], axis=1)

# Contact to one df
res_df = pd.concat([df, embeddings_df], axis=1)

### Train

In [90]:
# columns
embeddings_columns = embeddings_df.columns.tolist()
len_columns = ['punch_len', 'setup_len']


train_columns = embeddings_columns + len_columns
# target_column = 'mark'
target_column = 'good_punch'
valid_test_columns = ['is_valid', 'is_test']

In [91]:
X_train = res_df.loc[(res_df.is_valid == 0) & (res_df.is_test == 0), train_columns]
y_train = res_df.loc[(res_df.is_valid == 0) & (res_df.is_test == 0), target_column]
train_pool = catboost.Pool(X_train, y_train)

X_valid = res_df.loc[res_df.is_valid == 1, train_columns]
y_valid = res_df.loc[res_df.is_valid == 1, target_column]
valid_pool = catboost.Pool(X_valid, y_valid)

X_test = res_df.loc[res_df.is_test == 1, train_columns]
y_test = res_df.loc[res_df.is_test == 1, target_column]
test_pool = catboost.Pool(X_test, y_test)

In [95]:
mark_model = catboost.CatBoostClassifier(iterations=3000)

In [97]:
mark_model.fit(
    train_pool,
    eval_set=valid_pool,
    verbose=100
)

Learning rate set to 0.026358
0:	learn: 0.6794310	test: 0.6810012	best: 0.6810012 (0)	total: 63.8ms	remaining: 3m 11s
100:	learn: 0.4120680	test: 0.5008359	best: 0.4994377 (95)	total: 4.22s	remaining: 2m 1s
200:	learn: 0.3582975	test: 0.5030154	best: 0.4994377 (95)	total: 8.31s	remaining: 1m 55s
300:	learn: 0.3135026	test: 0.5096922	best: 0.4994377 (95)	total: 12.5s	remaining: 1m 52s
400:	learn: 0.2703955	test: 0.5175338	best: 0.4994377 (95)	total: 16.8s	remaining: 1m 48s
500:	learn: 0.2304185	test: 0.5315628	best: 0.4994377 (95)	total: 21.1s	remaining: 1m 45s
600:	learn: 0.1968304	test: 0.5381262	best: 0.4994377 (95)	total: 25.4s	remaining: 1m 41s
700:	learn: 0.1690263	test: 0.5433157	best: 0.4994377 (95)	total: 29.8s	remaining: 1m 37s
800:	learn: 0.1460404	test: 0.5477343	best: 0.4994377 (95)	total: 34.2s	remaining: 1m 33s
900:	learn: 0.1262621	test: 0.5544544	best: 0.4994377 (95)	total: 38.6s	remaining: 1m 29s
1000:	learn: 0.1104314	test: 0.5640705	best: 0.4994377 (95)	total: 43s	re

<catboost.core.CatBoostClassifier at 0x1432e49a0>