# Baseline using text similarity and LightGBM

A sentence-transformer model is used to compute the similarity between each summary and its prompt text and question. A LightGBM model is then trained to predict the targets from the similarity scores and the text sizes.

In [1]:
from pathlib import Path 
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupKFold
import lightgbm as lgb

In [3]:
# Load the pretrained 'all-MiniLM-L6-v2' sentence-transformer; it is used below
# to embed the prompt texts, prompt questions and student summaries.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [4]:
# Read the prompt table and the student-summary table from the data directory.
datapath = Path("../data/")

train_pro, train_sum = (
    pd.read_csv(datapath / csv_name)
    for csv_name in ("prompts_train.csv", "summaries_train.csv")
)

In [5]:
# Embed every prompt's full text and its question; both dicts are keyed by prompt_id.
prompt_ids = train_pro.prompt_id.values
text_emb = dict(zip(prompt_ids, model.encode(train_pro.prompt_text.values)))
question_emb = dict(zip(prompt_ids, model.encode(train_pro.prompt_question.values)))

In [None]:
# Embed every student summary; the dict is keyed by student_id.
summary_ids = train_sum.student_id.values
sum_emb = dict(zip(summary_ids, model.encode(train_sum.text.values)))

In [None]:
# Cosine similarity between each prompt's text embedding and its question
# embedding, keyed by prompt_id. The vectors are reshaped to (1, dim) because
# cosine_similarity operates on 2-D inputs; [0][0] extracts the single score.
cossim = {
    pid: cosine_similarity(
        text_emb[pid].reshape(1, -1), question_emb[pid].reshape(1, -1)
    )[0][0]
    for pid in text_emb
}

In [None]:
# For every summary (in train_sum row order) compute its cosine similarity to
# the embedding of its prompt's text and to the embedding of its prompt's question.
cs_text, cs_question = [], []
for student_id, prompt_id in train_sum[["student_id", "prompt_id"]].values:
    text_vec = text_emb[prompt_id].reshape(1, -1)
    summary_vec = sum_emb[student_id].reshape(1, -1)
    question_vec = question_emb[prompt_id].reshape(1, -1)

    # cosine_similarity returns a 1x1 matrix here; keep only the scalar.
    text_score = cosine_similarity(text_vec, summary_vec)[0][0]
    question_score = cosine_similarity(question_vec, summary_vec)[0][0]

    cs_text.append(text_score)
    cs_question.append(question_score)

In [None]:
# Attach the similarity scores as feature columns. Both lists were built by
# iterating train_sum row by row, so they line up positionally with the frame.
train_sum["cs_text"] = cs_text
train_sum["cs_question"] = cs_question

In [None]:
# Display the first two rows to eyeball the newly added similarity columns.
train_sum.head(2)

Features related to the length of the prompt text and of the summary

In [None]:
# Size of each prompt text: whitespace-separated word count and character count.
prompt_text = train_pro["prompt_text"]
train_pro["ptext_words"] = prompt_text.map(lambda text: len(text.split()))
train_pro["ptext_chars"] = prompt_text.str.len()

In [None]:
# Size of each student summary: whitespace-separated word count and character count.
summary_text = train_sum["text"]
train_sum["stext_words"] = summary_text.map(lambda text: len(text.split()))
train_sum["stext_chars"] = summary_text.str.len()

In [None]:
# Bring the prompt-size features onto each summary row via its prompt_id.
# A left join keeps every summary row.
prompt_sizes = train_pro[["prompt_id", "ptext_words", "ptext_chars"]]
train_sum = pd.merge(train_sum, prompt_sizes, on="prompt_id", how="left")

In [None]:
# Build the cross-validation folds: each distinct prompt_id gets its own fold
# index (0, 1, 2, ... in order of first appearance), so a prompt's summaries
# all fall into the same fold.
unique_prompts = train_sum["prompt_id"].unique()
foldmapper = dict(zip(unique_prompts, range(len(unique_prompts))))
train_sum["fold"] = train_sum["prompt_id"].map(foldmapper)

In [2]:
# On-disk cache of the feature table. Uncomment the next line to (re)write the
# cache from the current train_sum; the active line replaces train_sum with
# whatever is stored in tmp_train.parquet.
# NOTE(review): presumably this lets the embedding cells be skipped in a fresh
# kernel -- confirm the cached file matches the features computed above.
#train_sum.to_parquet("tmp_train.parquet", index=False)
train_sum = pd.read_parquet("tmp_train.parquet")

In [None]:
# Train one LightGBM regressor per (target, fold) pair.
#
# For each target, every fold is held out once as the validation set while the
# remaining folds are used for training. The trained boosters are collected in
# model_dict[target], ordered by fold index, so models[i] is the booster whose
# validation set was fold i.
train = train_sum
feat = ["cs_text", "cs_question", "stext_words", "stext_chars", "ptext_words", "ptext_chars"]
targets = ["content", "wording"]
model_dict = {}

# Hyper-parameters are the same for every target and fold, so define them once.
params = {
    'boosting_type': 'gbdt',
    'random_state': 42,
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'max_depth': 3,
    'lambda_l1': 0,
    'lambda_l2': 0.01,
}

# Derive the number of folds from the data instead of hard-coding it.
# Assumes fold ids are the contiguous integers 0..n_folds-1.
n_folds = train["fold"].nunique()

for target in targets:
    models = []

    for fold in range(n_folds):
        is_valid = train["fold"] == fold

        Xtrain = train.loc[~is_valid, feat]
        ytrain = train.loc[~is_valid, target]

        Xvalid = train.loc[is_valid, feat]
        yvalid = train.loc[is_valid, target]

        dtrain = lgb.Dataset(Xtrain, label=ytrain)
        # Tie the validation set to the training set so both share the same
        # feature binning.
        dval = lgb.Dataset(Xvalid, label=yvalid, reference=dtrain)

        # Per-iteration metric history for this fold, filled in by the callback.
        evaluation_results = {}

        # Named `booster` rather than `model` so the sentence-transformer bound
        # to `model` earlier in the notebook is not overwritten.
        booster = lgb.train(
            params,
            train_set=dtrain,
            num_boost_round=1000,
            # One name per evaluated dataset; the training set is passed too so
            # that the 'train' / 'valid' labels match what is being evaluated.
            valid_sets=[dtrain, dval],
            valid_names=['train', 'valid'],
            callbacks=[
                lgb.early_stopping(stopping_rounds=30, verbose=True),
                lgb.log_evaluation(50),
                lgb.record_evaluation(evaluation_results),
            ],
        )
        models.append(booster)

    model_dict[target] = models

In [None]:
# Score the trained boosters on their held-out folds.
#
# For each target, every fold's booster predicts the rows of that fold; the
# pooled predictions give one RMSE per target, and the mean of those RMSEs is
# reported as "mcrmse".
# NOTE(review): the same folds were used for early stopping during training, so
# these scores are likely somewhat optimistic -- confirm before relying on them.
rmses = []

for target in targets:
    preds = []
    trues = []

    # `fold_model` (not `model`) so the sentence-transformer bound to `model`
    # earlier in the notebook is not overwritten.
    for fold, fold_model in enumerate(model_dict[target]):
        in_fold = train["fold"] == fold
        Xeval = train.loc[in_fold, feat]
        yeval = train.loc[in_fold, target]

        trues.extend(yeval)
        preds.extend(fold_model.predict(Xeval))

    # RMSE computed with numpy; `mean_squared_error` is not imported in this notebook.
    errors = np.asarray(trues, dtype=float) - np.asarray(preds, dtype=float)
    rmse = np.sqrt(np.mean(errors ** 2))
    print(f"{target}_rmse : {rmse}")
    rmses.append(rmse)

print(f"mcrmse : {sum(rmses) / len(rmses)}")

'4.0.0'