In [None]:
import gc

import numpy as np
import pandas as pd
import spacy
import textdescriptives as td
import textstat
from catboost import CatBoostRegressor
from pandarallel import pandarallel
from scipy.stats import kendalltau, spearmanr
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from textacy.text_stats import basics, counts, diversity, readability
from tqdm import tqdm
from tqdm.notebook import tqdm
from transformers import pipeline

from poetry_ranking.utils import get_sentence_embedding_sbert

# Configure pandarallel (8 workers, progress bars) for the `.parallel_apply`
# calls used later in the notebook.
pandarallel.initialize(progress_bar=True, nb_workers=8)
# Shared spaCy pipeline; get_textacy_features() runs every text through it.
spacy_nlp = spacy.load("ru_core_news_lg")
# Switch textstat to Russian before any textstat metric is computed.
textstat.set_lang("ru")

Loading models...

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


OK
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.




In [None]:
def get_textstat_features(text: str):
    """Compute the textstat metrics for ``text``.

    Returns:
        dict mapping ``textstat_<metric>`` to the value textstat reported for
        that metric (e.g. ``textstat_gunning_fog``).
    """
    raw_metrics = dict(
        flesch_reading_ease=textstat.flesch_reading_ease(text),
        flesch_kincaid_grade=textstat.flesch_kincaid_grade(text),
        gunning_fog=textstat.gunning_fog(text),
        smog_index=textstat.smog_index(text),
        automated_readability_index=textstat.automated_readability_index(text),
        coleman_liau_index=textstat.coleman_liau_index(text),
        linsear_write_formula=textstat.linsear_write_formula(text),
        dale_chall_readability_score=textstat.dale_chall_readability_score(text),
        text_standard=textstat.text_standard(text),
        spache_readability=textstat.spache_readability(text),
        mcalpine_eflaw=textstat.mcalpine_eflaw(text),
        reading_time=textstat.reading_time(text, ms_per_char=14.69),
        syllable_count=textstat.syllable_count(text),
        lexicon_count=textstat.lexicon_count(text, removepunct=True),
        sentence_count=textstat.sentence_count(text),
        char_count=textstat.char_count(text, ignore_spaces=True),
        letter_count=textstat.letter_count(text, ignore_spaces=True),
        polysyllabcount=textstat.polysyllabcount(text),
        monosyllabcount=textstat.monosyllabcount(text),
        difficult_words=textstat.difficult_words(text),
    )

    # Namespace every metric so it cannot clash with columns from other extractors.
    prefixed = {}
    for metric_name, metric_value in raw_metrics.items():
        prefixed[f"textstat_{metric_name}"] = metric_value
    return prefixed


def get_textacy_features(text: str):
    """Compute textacy statistics for ``text``.

    The text is parsed with the module-level ``spacy_nlp`` pipeline. The
    result holds basic, readability and diversity statistics plus one entry
    per morphology / POS / tag / dependency count.

    Returns:
        dict whose keys all carry the ``textacy_`` prefix.
    """
    doc = spacy_nlp(text)

    features = {
        # Basic Statistics
        "n_sents": basics.n_sents(doc),
        "n_words": basics.n_words(doc),
        "n_unique_words": basics.n_unique_words(doc),
        "n_chars_per_word_mean": np.mean(basics.n_chars_per_word(doc)).item(),
        "n_chars": basics.n_chars(doc),
        "n_long_words": basics.n_long_words(doc),
        "n_syllables_per_word_mean": np.mean(basics.n_syllables_per_word(doc)).item(),
        "n_syllables": basics.n_syllables(doc),
        "n_monosyllable_words": basics.n_monosyllable_words(doc),
        "n_polysyllable_words": basics.n_polysyllable_words(doc),
        "entropy": basics.entropy(doc),
        # Readability Measures
        "readability_automated_readability_index": readability.automated_readability_index(
            doc
        ),
        "readability_coleman_liau_index": readability.coleman_liau_index(doc),
        "readability_flesch_kincaid_grade_level": readability.flesch_kincaid_grade_level(
            doc
        ),
        "readability_flesch_reading_ease": readability.flesch_reading_ease(doc),
        "readability_gunning_fog_index": readability.gunning_fog_index(doc),
        "readability_smog_index": readability.smog_index(doc),
        "readability_lix": readability.lix(doc),
        # Diversity Measures
        "diversity_ttr": diversity.ttr(doc),
        "diversity_log_ttr": diversity.log_ttr(doc),
        "diversity_segmented_ttr": diversity.segmented_ttr(doc),
        "diversity_mtld": diversity.mtld(doc),
        "diversity_hdd": diversity.hdd(doc).item(),
    }

    # Morphology counts are nested ({category: {value: count}}); flatten them
    # into one scalar feature per (category, value) pair.
    for category, value_counts in counts.morph(doc).items():
        for value, n_occurrences in value_counts.items():
            features[f"morph_counts_{category}_{value}"] = n_occurrences

    # POS, tag and dependency counts are flat ({label: count}) mappings.
    flat_counters = (
        ("pos_counts", counts.pos),
        ("tag_counts", counts.tag),
        ("dep_counts", counts.dep),
    )
    for group_name, counter in flat_counters:
        for label, n_occurrences in counter(doc).items():
            features[f"{group_name}_{label}"] = n_occurrences

    return {f"textacy_{name}": value for name, value in features.items()}


def make_features(df):
    """Return a copy of ``df`` extended with handcrafted text features.

    Features are computed from the ``output_text`` column and appended as new
    columns: textdescriptives metrics (``textdescriptives_`` prefix), then the
    outputs of ``get_textstat_features`` and ``get_textacy_features``.
    """
    df = df.copy()

    td_metrics = td.extract_metrics(
        text=df["output_text"], lang="ru", metrics=None
    ).add_prefix("textdescriptives_")
    df = pd.concat([df, td_metrics], axis=1)

    # Each extractor returns one dict per text; `.apply(pd.Series)` expands
    # those dicts into one column per key.
    for extractor in (get_textstat_features, get_textacy_features):
        expanded = df["output_text"].parallel_apply(extractor).apply(pd.Series)
        df = pd.concat([df, expanded], axis=1)

    return df

In [None]:
# Flip to True to rebuild the handcrafted text features from the raw data;
# otherwise the cached feature tables are read from disk.
regenerate = False

if not regenerate:
    df_train_features = pd.read_csv("data/text_features/df_train_text_features.zip")
    df_test_features = pd.read_csv("data/text_features/df_test_text_features.zip")

else:
    df_train = pd.read_csv("data/raw/poetry_data_train.zip")
    df_test = pd.read_csv("data/raw/poetry_data_test.zip")

    # Train split: compute the features, then cache them as a zipped CSV.
    df_train_features = make_features(df_train)
    df_train_features.to_csv(
        "data/text_features/df_train_text_features.zip",
        index=False,
        compression=dict(
            method="zip",
            compresslevel=9,
            archive_name="df_train_text_features.csv",
        ),
    )

    # Test split: same procedure.
    df_test_features = make_features(df_test)
    df_test_features.to_csv(
        "data/text_features/df_test_text_features.zip",
        index=False,
        compression=dict(
            method="zip",
            compresslevel=9,
            archive_name="df_test_text_features.csv",
        ),
    )

In [None]:
# Flip to True to recompute the sentence embeddings from the raw texts;
# otherwise the cached embedding tables are read from disk.
regenerate_embeddings = False
if not regenerate_embeddings:
    df_train_embeddings = pd.read_csv(
        "data/bert_embeddings/df_train_sbert_embeddings.zip"
    )
    df_test_embeddings = pd.read_csv("data/bert_embeddings/df_test_sbert_embeddings.zip")
else:
    tqdm.pandas()  # registers `progress_apply` on pandas objects

    # Keep only the text column, renamed to `sbert_embedding_output_text`.
    df_train_embeddings = pd.read_csv("data/raw/poetry_data_train.zip")[
        ["output_text"]
    ].add_prefix("sbert_embedding_")
    df_test_embeddings = pd.read_csv("data/raw/poetry_data_test.zip")[
        ["output_text"]
    ].add_prefix("sbert_embedding_")

    # Train split: embed every text (stored as nested lists) and cache to disk.
    df_train_embeddings["sbert_embeddings"] = df_train_embeddings[
        "sbert_embedding_output_text"
    ].progress_apply(lambda text: get_sentence_embedding_sbert(text).tolist())
    df_train_embeddings.to_csv(
        "data/bert_embeddings/df_train_sbert_embeddings.zip",
        index=False,
        compression=dict(
            method="zip",
            compresslevel=9,
            archive_name="df_train_sbert_embeddings.csv",
        ),
    )

    # Test split: same procedure.
    df_test_embeddings["sbert_embeddings"] = df_test_embeddings[
        "sbert_embedding_output_text"
    ].progress_apply(lambda text: get_sentence_embedding_sbert(text).tolist())
    df_test_embeddings.to_csv(
        "data/bert_embeddings/df_test_sbert_embeddings.zip",
        index=False,
        compression=dict(
            method="zip",
            compresslevel=9,
            archive_name="df_test_sbert_embeddings.csv",
        ),
    )

In [None]:
# Flip to True to re-run the sentiment classifier over the raw texts;
# otherwise the cached sentiment tables are read from disk.
regenerate_sentiment = False

if not regenerate_sentiment:
    df_train_sentiment = pd.read_csv("data/sentiment/df_train_sentiment.zip")
    df_test_sentiment = pd.read_csv("data/sentiment/df_test_sentiment.zip")
else:
    tqdm.pandas()  # registers `progress_apply` on pandas objects
    sentiment_model = pipeline(
        model="seara/rubert-tiny2-russian-sentiment",
        padding=True,
        truncation=True,
        device="cuda",
    )

    # Keep only the text column, renamed to `sentiment_output_text`.
    df_train_sentiment = pd.read_csv("data/raw/poetry_data_train.zip")[
        ["output_text"]
    ].add_prefix("sentiment_")
    df_test_sentiment = pd.read_csv("data/raw/poetry_data_test.zip")[
        ["output_text"]
    ].add_prefix("sentiment_")

    def get_sentiment(text):
        """Return the label of the first prediction the pipeline gives for ``text``."""
        first_prediction = sentiment_model(text)[0]
        return first_prediction["label"]

    # Train split: label every text and cache the result.
    df_train_sentiment["sentiment"] = df_train_sentiment[
        "sentiment_output_text"
    ].progress_apply(get_sentiment)

    df_train_sentiment.to_csv(
        "data/sentiment/df_train_sentiment.zip",
        index=False,
        compression=dict(
            method="zip",
            compresslevel=9,
            archive_name="df_train_sentiment.csv",
        ),
    )

    # Test split: same procedure.
    df_test_sentiment["sentiment"] = df_test_sentiment[
        "sentiment_output_text"
    ].progress_apply(get_sentiment)

    df_test_sentiment.to_csv(
        "data/sentiment/df_test_sentiment.zip",
        index=False,
        compression=dict(
            method="zip",
            compresslevel=9,
            archive_name="df_test_sentiment.csv",
        ),
    )

In [None]:
# Put the three per-split tables (text features, embeddings, sentiment) side by
# side. The concat is column-wise, so rows are matched on the index.
df_train = pd.concat(
    [df_train_features, df_train_embeddings, df_train_sentiment], axis="columns"
)
df_test = pd.concat(
    [df_test_features, df_test_embeddings, df_test_sentiment], axis="columns"
)

# The separate tables are no longer needed; drop the references and collect.
del df_train_features, df_train_embeddings, df_train_sentiment
del df_test_features, df_test_embeddings, df_test_sentiment

gc.collect()

228

In [None]:
def downcast_datatypes(df, cat_cols):
    """Shrink ``df`` in place by downcasting numeric columns.

    Float columns are downcast with ``pd.to_numeric(downcast="float")``,
    integer columns with ``downcast="integer"``, and every column named in
    ``cat_cols`` is converted to ``category``. Memory usage before and after
    is printed.

    Args:
        df: frame to shrink; it is modified in place.
        cat_cols: names of the columns to convert to ``category``.

    Returns:
        The same ``df`` object that was passed in.
    """
    bytes_per_mb = 1024**2

    start_mem = df.memory_usage().sum() / bytes_per_mb
    print("Memory consumed (in MB) before downsizing: ", start_mem)

    # Resolve both column groups up front, before any dtype is changed.
    column_groups = [
        (df.select_dtypes("float").columns, "float"),
        (df.select_dtypes("integer").columns, "integer"),
    ]
    for columns, target_kind in column_groups:
        df[columns] = df[columns].apply(pd.to_numeric, downcast=target_kind)

    for name in cat_cols:
        df[name] = df[name].astype("category")

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print("Memory consumed (in MB) after downsizing: ", end_mem)
    saved_pct = 100 * (start_mem - end_mem) / start_mem
    print(f"Memory usage decreased by ({saved_pct:.1f}% )")

    return df


# Shrink both splits; `genre` and `sentiment` are stored as categoricals.
df_train = downcast_datatypes(df_train, cat_cols=["genre", "sentiment"])
df_test = downcast_datatypes(df_test, cat_cols=["genre", "sentiment"])

Memory consumed (in MB) before downsizing:  251.43099975585938
Memory consumed (in MB) after downsizing:  125.25503158569336
Memory usage decreased by (50.2% )
Memory consumed (in MB) before downsizing:  62.85784435272217
Memory consumed (in MB) after downsizing:  31.215909957885742
Memory usage decreased by (50.3% )


In [None]:
def preprocess_df(df: pd.DataFrame, predict: bool = False) -> pd.DataFrame:
    """Turn a combined feature table into the frame consumed by the models.

    With ``predict=False`` (cached train/test tables) the frame is filtered
    (columns with missing values, columns with at most two distinct values and
    a few bookkeeping columns are removed), rare genres are merged, duplicate
    columns are dropped and the stringified embeddings are parsed.

    With ``predict=True`` none of that filtering happens and the embeddings
    are expected to already be Python lists. In this mode ``df`` is not
    re-bound before ``textstat_text_standard`` is assigned, so that column is
    overwritten on the caller's frame.

    In both modes ``textstat_text_standard`` is converted to a number and
    ``sbert_embeddings`` is expanded into ``embedding_1 .. embedding_N``.

    Args:
        df: combined feature table containing at least
            ``textstat_text_standard`` and ``sbert_embeddings``.
        predict: ``True`` when ``df`` was built on the fly for inference.

    Returns:
        A new frame with one column per embedding component instead of
        ``sbert_embeddings``.
    """
    if not predict:
        # Drop every column that has at least one missing value ...
        df = df.dropna(axis=1)
        # ... and keep only columns with more than two distinct values.
        df = df.loc[:, df.nunique() > 2]
        df = df.drop("url", axis=1, errors="ignore")
    # Extract every run of digits from the text_standard string and average
    # them per row, yielding a single float.
    df["textstat_text_standard"] = (
        df["textstat_text_standard"]
        .str.extractall(r"(\d+)")
        .unstack()
        .astype(float)
        .mean(axis=1)
    )

    if not predict:
        df = df.drop("textdescriptives_passed_quality_check", axis=1, errors="ignore")
        # Genres with fewer than 50 rows are merged into a single bucket.
        temp_genre_value_counts = df["genre"].value_counts()
        df["genre"] = df["genre"].apply(
            lambda x: x if temp_genre_value_counts[x] >= 50 else "Другое"
        )
        # The raw-text copies carried by the embedding/sentiment tables are not features.
        df = df.drop(
            ["sbert_embedding_output_text", "sentiment_output_text"],
            axis=1,
            errors="ignore",
        )
        # Remove columns whose values duplicate an earlier column.
        # NOTE(review): the transpose round-trip on a mixed-dtype frame may
        # change column dtypes — confirm the dtypes the model receives.
        df = df.T.drop_duplicates().T

        # Embeddings arrive as strings here; parse each one and keep its first element.
        # NOTE(review): eval() executes whatever the cached file contains;
        # consider a literal-only parser if the cache may be untrusted.
        df["sbert_embeddings"] = df["sbert_embeddings"].parallel_apply(
            lambda x: eval(x)[0]
        )

    if predict:
        # Embeddings are already nested lists; keep the first element of each.
        df["sbert_embeddings"] = df["sbert_embeddings"].parallel_apply(lambda x: x[0])

    # One column per embedding component, aligned on the original index.
    embedding_df = pd.DataFrame(df["sbert_embeddings"].tolist(), index=df.index)

    embedding_df.columns = [f"embedding_{i+1}" for i in range(embedding_df.shape[1])]

    df = df.drop(columns=["sbert_embeddings"]).join(embedding_df)

    return df


# Both splits go through the training-mode preprocessing; the test frame is
# then restricted to (and ordered like) the columns kept on the training frame.
# NOTE(review): the test split is filtered on its own statistics, so selecting
# df_train.columns would raise KeyError if a column were dropped only on the
# test side — confirm this cannot happen with the current data.
df_train = preprocess_df(df_train)
df_test = preprocess_df(df_test)[df_train.columns]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=17300), Label(value='0 / 17300')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4325), Label(value='0 / 4325'))), …

In [None]:
def print_metrics(model, X_val, y_val, prefix):
    """Print RMSE, Kendall's tau and Spearman correlation for ``model``.

    Args:
        model: fitted estimator exposing ``predict``.
        X_val: features of the split to evaluate.
        y_val: true targets of that split.
        prefix: split label; printed upper-cased in front of every line.
    """
    predictions = model.predict(X_val)
    label = prefix.upper()

    rmse = root_mean_squared_error(y_val, predictions)
    print(f"{label}: Root mean Squared Error: {rmse}")

    tau, _ = kendalltau(y_val, predictions)
    print(f"{label}: Kendall's Tau: {tau}")

    rho, _ = spearmanr(y_val, predictions)
    print(f"{label}: Spearmanr: {rho}")


class KendallTauMetric:
    """Kendall's tau as a custom evaluation metric (higher is better).

    Passed as ``eval_metric`` to ``CatBoostRegressor`` in this notebook.
    """

    def get_final_error(self, error, weight):
        """Return ``error`` divided by ``weight`` (plus a tiny epsilon)."""
        denominator = weight + 1e-38
        return error / denominator

    def is_max_optimal(self):
        """Larger metric values are better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Return ``(tau, 1.0)`` for a single prediction vector.

        ``approxes`` must contain exactly one sequence, the same length as
        ``target``. ``weight`` is not used.
        """
        assert len(approxes) == 1
        predictions = approxes[0]
        assert len(target) == len(predictions)

        tau, _pvalue = kendalltau(target, predictions)
        # The weight is fixed at 1.0, so get_final_error() returns tau itself.
        return tau, 1.0

In [None]:
# Target: `rating`. Both targets and `genre` are removed from the inputs.
X = df_train.drop(columns=["views", "genre", "rating"])
y = df_train["rating"]

X_test = df_test.drop(columns=["views", "genre", "rating"])
y_test = df_test["rating"]

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# `top_model` is reused further down for feature importances and the sanity check.
top_model = CatBoostRegressor(verbose=0, eval_metric=KendallTauMetric())
top_model.fit(
    X_train,
    y_train,
    cat_features=["sentiment"],
    text_features=["output_text"],
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
)

print_metrics(top_model, X_val, y_val, prefix="val")
print_metrics(top_model, X_test, y_test, prefix="test")

  _check_train_params(params)


VAL: Root mean Squared Error: 31.259051561009013
VAL: Kendall's Tau: 0.20418599881519792
VAL: Spearmanr: 0.27505569185643514
TEST: Root mean Squared Error: 32.82181614756902
TEST: Kendall's Tau: 0.2064323571740577
TEST: Spearmanr: 0.27833595332250965


In [None]:
# Target: `views`. Same inputs and split as the rating experiment.
X = df_train.drop(columns=["views", "genre", "rating"])
y = df_train["views"]

X_test = df_test.drop(columns=["views", "genre", "rating"])
y_test = df_test["views"]

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

model = CatBoostRegressor(verbose=0, eval_metric=KendallTauMetric())
model.fit(
    X_train,
    y_train,
    cat_features=["sentiment"],
    text_features=["output_text"],
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
)
print_metrics(model, X_val, y_val, prefix="val")
print_metrics(model, X_test, y_test, prefix="test")

  _check_train_params(params)


VAL: Root mean Squared Error: 389.95554565117465
VAL: Kendall's Tau: 0.0935416419111623
VAL: Spearmanr: 0.13935603376280564
TEST: Root mean Squared Error: 394.55398695048194
TEST: Kendall's Tau: 0.10004089731549336
TEST: Spearmanr: 0.14882060984358098


In [None]:
def rating_views_target(rating, views):
    """Combine ``rating`` and ``views`` into a single regression target.

    Returns ``0`` when ``views`` is zero; otherwise
    ``views + exp(1 + rating / (rating + views)) * rating``.
    """
    if views != 0:
        rating_share = rating / (rating + views)
        return views + np.exp(1 + rating_share) * rating
    return 0


# Target: the combined rating/views score from `rating_views_target`, computed
# row by row for both splits.
X = df_train.drop(columns=["views", "genre", "rating"])
y = df_train.apply(
    lambda record: rating_views_target(record["rating"], record["views"]), axis=1
)

X_test = df_test.drop(columns=["views", "genre", "rating"])
y_test = df_test.apply(
    lambda record: rating_views_target(record["rating"], record["views"]), axis=1
)

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

model = CatBoostRegressor(verbose=0, eval_metric=KendallTauMetric())
model.fit(
    X_train,
    y_train,
    cat_features=["sentiment"],
    text_features=["output_text"],
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
)
print_metrics(model, X_val, y_val, prefix="val")
print_metrics(model, X_test, y_test, prefix="test")

  _check_train_params(params)


VAL: Root mean Squared Error: 426.7664170778184
VAL: Kendall's Tau: 0.1097880220287051
VAL: Spearmanr: 0.16429213197740228
TEST: Root mean Squared Error: 433.2314645951856
TEST: Kendall's Tau: 0.11153520708407551
TEST: Spearmanr: 0.16677947306110338


In [None]:
# Feature importances of the rating model. `X_train` is whatever the most
# recent training cell left behind; every training cell builds it from the same
# columns, so the names still line up with `top_model`.
importances = top_model.get_feature_importance()
feature_importance_df = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": importances}
)
feature_importance_df = feature_importance_df.sort_values(
    "Importance", ascending=False
)
feature_importance_df.head(10)

Unnamed: 0,Feature,Importance
0,output_text,29.473032
79,textstat_difficult_words,2.356093
83,textacy_n_chars,1.58814
175,embedding_71,1.399563
76,textstat_letter_count,1.3394
245,embedding_141,1.219181
45,textdescriptives_top_ngram_chr_fraction_2,1.204721
737,embedding_633,1.100763
74,textstat_sentence_count,1.086004
104,sentiment,1.000711


In [None]:
# Hand-picked texts used as inputs for the sanity check of the ranking model.
# NOTE(review): judging by the variable names these are a Pushkin poem, an
# Esenin poem and a machine-generated nonsense verse — confirm the sources.
pushkin = """«Мой дядя самых честных правил,
Когда не в шутку занемог,
Он уважать себя заставил
И лучше выдумать не мог.
Его пример другим наука;
Но, боже мой, какая скука
С больным сидеть и день и ночь,
Не отходя ни шагу прочь!
Какое низкое коварство
Полуживого забавлять,
Ему подушки поправлять,
Печально подносить лекарство,
Вздыхать и думать про себя:
Когда же черт возьмет тебя!»"""

esenin = """Заметался пожар голубой,
Позабылись родимые дали.
В первый раз я запел про любовь,
В первый раз отрекаюсь скандалить.
Был я весь — как запущенный сад,
Был на женщин и зелие падкий.
Разонравилось пить и плясать
И терять свою жизнь без оглядки.
Мне бы только смотреть на тебя,
Видеть глаз злато-карий омут,
И чтоб, прошлое не любя,
Ты уйти не смогла к другому."""

gpt_random = """Скачет воробей на лужу,
Сыр с орехом в тёмном стуже.
Три колеса по лужайке плывут,
А в горле йогурт, а рядом салют.

Липнет клюква к потолку,
Жук танцует в молоку,
В небе радуга-тетрадь,
Как же тут не полетать?"""

# Text of a poem with the highest rating in the test split (first match if
# several rows share the maximum).
top_poem_in_test = df_test[df_test["rating"] == df_test["rating"].max()][
    "output_text"
].values[0]

# Text of a poem with the lowest view count in the test split.
# NOTE(review): this takes the *second* match (`values[1]`) and keys on
# "views" while the top poem keys on "rating"; it raises IndexError when only
# one row has the minimum view count — confirm this is intended.
bottom_poem_in_test = df_test[df_test["views"] == df_test["views"].min()][
    "output_text"
].values[1]

# A list of single words, one per line, used as one more sanity-check input.
random_words = """Озеро
Путешествие
Сияние
Листопад
Велосипед
Мечта
Зонтик
Река
Горизонт
Вдохновение
Фонарь
Ласточка
Ступенька"""

# Inputs for the sanity check; the predicted scores come back in this order.
sanity_check = [
    pushkin,
    esenin,
    gpt_random,
    top_poem_in_test,
    bottom_poem_in_test,
    random_words,
]


def rank_poems(model, poems: list[str]) -> np.ndarray:
    """Score raw poem texts with a fitted model.

    For every poem the function rebuilds the inputs used elsewhere in this
    notebook: handcrafted text features (``make_features``), a sentence
    embedding (``get_sentence_embedding_sbert``) and a sentiment label from
    the ``seara/rubert-tiny2-russian-sentiment`` pipeline (run on CPU). The
    frame is then downcast and passed through ``preprocess_df`` in predict
    mode before prediction.

    Args:
        model: fitted estimator exposing ``predict``. When it also exposes
            ``feature_names_`` that list selects and orders the input
            columns; otherwise the columns of the notebook-level ``X_train``
            are used.
        poems: raw poem texts.

    Returns:
        Whatever ``model.predict`` returns for the rebuilt feature frame:
        one predicted score per poem, in input order (scores, not ranks).
    """
    df = pd.DataFrame({"output_text": poems})

    df = make_features(df)
    df["sbert_embeddings"] = df["output_text"].apply(
        lambda x: get_sentence_embedding_sbert(x).tolist()
    )

    sentiment_model = pipeline(
        model="seara/rubert-tiny2-russian-sentiment",
        padding=True,
        truncation=True,
        device="cpu",
    )

    def get_sentiment(text):
        """Return the label of the first prediction for ``text``."""
        return sentiment_model(text)[0]["label"]

    df["sentiment"] = df["output_text"].apply(get_sentiment)

    df = downcast_datatypes(df, cat_cols=["sentiment"])

    # Take the expected columns from the model that is actually being used;
    # the global X_train is re-bound by every training cell and is only a
    # fallback for models that do not record their feature names.
    feature_names = getattr(model, "feature_names_", None)
    if feature_names is None:
        feature_names = X_train.columns

    df = preprocess_df(df, predict=True)[list(feature_names)]

    return model.predict(df)

In [None]:
# Score the sanity-check texts with the model trained on `rating`.
ranks = rank_poems(top_model, sanity_check)

[38;5;4mℹ No spacy model provided. Inferring spacy model for ru.[0m




[38;5;3m⚠ Could not load lexeme probability table for language ru. This will
result in NaN values for perplexity and entropy.[0m


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))), HBox(c…



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))), HBox(c…

Memory consumed (in MB) before downsizing:  0.009698867797851562
Memory consumed (in MB) after downsizing:  0.004932403564453125
Memory usage decreased by (49.1% )
['output_text', 'textdescriptives_text', 'textdescriptives_entropy', 'textdescriptives_perplexity', 'textdescriptives_per_word_perplexity', 'textdescriptives_dependency_distance_mean', 'textdescriptives_dependency_distance_std', 'textdescriptives_prop_adjacent_dependency_relation_mean', 'textdescriptives_prop_adjacent_dependency_relation_std', 'textdescriptives_flesch_reading_ease', 'textdescriptives_flesch_kincaid_grade', 'textdescriptives_smog', 'textdescriptives_gunning_fog', 'textdescriptives_automated_readability_index', 'textdescriptives_coleman_liau_index', 'textdescriptives_lix', 'textdescriptives_rix', 'textdescriptives_token_length_mean', 'textdescriptives_token_length_median', 'textdescriptives_token_length_std', 'textdescriptives_sentence_length_mean', 'textdescriptives_sentence_length_median', 'textdescriptives_

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))), HBox(c…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
# Predicted scores, one per entry of `sanity_check`.
ranks

array([ 7.80720096, 14.27790063,  7.04157306, 13.82798863,  4.9874075 ,
        3.27152934])