In [147]:
import os

import textstat
import numpy as np
import pandas as pd


    

def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

In [148]:
prompts_train = pd.read_csv("input/commonlit-evaluate-student-summaries/prompts_train.csv")
prompts_test = pd.read_csv("input/commonlit-evaluate-student-summaries/prompts_test.csv")

summaries_train = pd.read_csv("input/commonlit-evaluate-student-summaries/summaries_train.csv")
summaries_test = pd.read_csv("input/commonlit-evaluate-student-summaries/summaries_test.csv")

sample_submission = pd.read_csv("input/commonlit-evaluate-student-summaries/sample_submission.csv")

prompts_train

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


In [149]:
train = summaries_train.merge(prompts_train, how="left", on="prompt_id")
test = summaries_test.merge(prompts_test, how="left", on="prompt_id")

train

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
...,...,...,...,...,...,...,...,...
7160,ff7c7e70df07,ebad26,They used all sorts of chemical concoctions to...,0.205683,0.380538,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."
7161,ffc34d056498,3b9047,The lowest classes are slaves and farmers slav...,-0.308448,0.048171,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
7162,ffd1576d2e1b,3b9047,they sorta made people start workin...,-1.408180,-0.493603,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
7163,ffe4a98093b2,39c16e,An ideal tragety has three elements that make ...,-0.393310,0.627128,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...


In [150]:
train["is_train"] = 1
test["is_train"] = 0
all_data = pd.concat([train, test])

all_data.sample(10)

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,is_train
1483,35825619b6f4,ebad26,One way is they chop it up in sausages which i...,-1.487523,-1.125956,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1
2158,4cf8fb9d7b2d,39c16e,"An ideal tragedy, as described by Aristotle, p...",-0.002466,-0.045439,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,1
1549,37b0391aaaf2,ebad26,"In paragraph 2, it is stated that they used so...",-1.427883,-0.790667,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1
4954,aff2c864c54e,39c16e,"Three elements of an ideal tragedy, as describ...",0.388379,-0.718005,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,1
2120,4baa0481f451,39c16e,The character cannot be a paragon of good or e...,0.44002,1.133263,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,1
27,01027180ef7c,3b9047,After reading the article I learned about ...,-1.114202,0.594411,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,1
773,1bce1188bcdb,ebad26,They vaaries ways that the factory would usese...,-1.264214,-1.505073,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1
7029,fb1d4fdb9f4c,3b9047,"pharaoh is at top, then noblemen and priests, ...",-1.547163,-1.461245,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,1
3517,7d00ceb28ddb,3b9047,The social structure is like a pyramid. The hi...,-0.15746,-0.165811,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,1
1487,35af86bb073b,ebad26,There were many way that they hide such as mix...,-1.033882,-1.086703,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",1


In [151]:
len(all_data)

7169

In [152]:
all_data = pd.read_csv('sample.csv')

In [153]:
(all_data.loc[:, 'content']).std()

1.0435693830622939

In [154]:
import textdistance
import tqdm


# textdistance.levenshtein(all_data.iloc[0].text, all_data.iloc[10].text)
# has_almost_duplicate = np.zeros(len(all_data))
# all_data_text = all_data["text"].values
# for i in tqdm.tqdm(range(len(all_data))):
#     for j in range(i+1, len(all_data)):
#         print(f"{j=}")
#         if (has_almost_duplicate[i] != 1 or has_almost_duplicate[j] != 1):
#             text_i = all_data_text[i]
#             text_j = all_data_text[j]
#             ldist = textdistance.levenshtein(text_i, text_j)
#             if ldist < 5:
#                 has_almost_duplicate[i] = 1
#                 has_almost_duplicate[j] = 1
# #                 break
                
# all_data["has_almost_duplicate"] = has_almost_duplicate
train = all_data[all_data["is_train"] == 1]
test = all_data[all_data["is_train"] == 0]

# drop is_train column
train.drop("is_train", axis=1, inplace=True)
test.drop("is_train", axis=1, inplace=True)

train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop("is_train", axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop("is_train", axis=1, inplace=True)


Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,has_almost_duplicate
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,0.0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",0.0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,0.0
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,0.0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,0.0


In [155]:
train = all_data[all_data["is_train"] == 1]
test = all_data[all_data["is_train"] == 0]

# drop is_train column
train.drop("is_train", axis=1, inplace=True)
test.drop("is_train", axis=1, inplace=True)

train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop("is_train", axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop("is_train", axis=1, inplace=True)


Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,has_almost_duplicate
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,0.0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",0.0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,0.0
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,0.0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,0.0


Let's create some features using text statistics and use classic ML algorithm

In [156]:
import nltk
from nltk.tokenize import word_tokenize


STOPWORDS = pd.read_csv("input/nltk-english-stopwords/nltk_eng_stopwords.csv")["list_of_stopwords"].tolist()
def get_stopwords_rel(text):
    text_words = word_tokenize(text)
    num_stopwords = sum([word in STOPWORDS for word in text_words])
    
    return num_stopwords/len(text_words)

In [157]:
from tqdm.notebook import tqdm
tqdm.pandas()


def get_stat_features(df, text_col="text"):
    df["num_unique_words"] = df[text_col].progress_apply(lambda x: len(set(x.split())))
    df["num_words"] = df[text_col].progress_apply(lambda x: len(x.split()))
    df["num_sentences"] = df[text_col].progress_apply(lambda x: len(x.split('.')))
    df["isupper"] = df[text_col].progress_apply(lambda x: x[0].isupper())
    df["mean_num_words"] = df[text_col].progress_apply(lambda x: np.mean([len(e.split()) for e in x.split('.')]))
    df["mean_num_unique_words"] = df[text_col].progress_apply(lambda x: np.mean([len(set(e.split())) for e in x.split('.')]))
    df["num_slash"] = df[text_col].progress_apply(lambda x: x.count("\n"))
    df["paragraph_count"] = df[text_col].progress_apply(lambda x: x.count("\n\n"))
    df["upper_count"] = df[text_col].progress_apply(lambda x: np.sum([w.isupper() for w in x.split()])/len(x.split()))
    df["syntax_count"] = df[text_col].progress_apply(lambda x: x.count(",") + x.count("-") + x.count(";") + x.count(":"))
    df["is_end_with_dot"] = df[text_col].progress_apply(lambda x: int(x[-1] == "."))
    df["stopwords_rel"] = df[text_col].progress_apply(lambda x: get_stopwords_rel(x))
    df["lcsstr"] = df.progress_apply(lambda row: textdistance.lcsstr(row.text, row.prompt_text).split().__len__(), axis=1)
    df['automated_readability_index'] = df[text_col].progress_apply(lambda x: textstat.automated_readability_index(x))
    df['coleman_liau_index'] = df[text_col].progress_apply(lambda x: textstat.coleman_liau_index(x))
    df['smog_index'] = df[text_col].progress_apply(lambda x: textstat.smog_index(x))
    df['dale_chall_readability_score'] = df[text_col].progress_apply(lambda x: textstat.dale_chall_readability_score(x))
    df['linsear_write_formula'] = df[text_col].progress_apply(lambda x: textstat.linsear_write_formula(x))
    df['gunning_fog'] = df[text_col].progress_apply(lambda x: textstat.gunning_fog(x))
    df['text_standard_float'] = df[text_col].progress_apply(lambda x: textstat.text_standard(x, float_output=True))
    df['spache_readability'] = df[text_col].progress_apply(lambda x: textstat.spache_readability(x))
    df['rix'] = df[text_col].progress_apply(lambda x: textstat.rix(x))
    df['lix'] = df[text_col].progress_apply(lambda x: textstat.lix(x))

    return df
    
train_feat = get_stat_features(train)
test_feat = get_stat_features(test)

  0%|          | 0/7165 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["num_unique_words"] = df[text_col].progress_apply(lambda x: len(set(x.split())))


  0%|          | 0/7165 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["num_words"] = df[text_col].progress_apply(lambda x: len(x.split()))


  0%|          | 0/7165 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["num_sentences"] = df[text_col].progress_apply(lambda x: len(x.split('.')))


  0%|          | 0/7165 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["isupper"] = df[text_col].progress_apply(lambda x: x[0].isupper())


  0%|          | 0/7165 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["mean_num_words"] = df[text_col].progress_apply(lambda x: np.mean([len(e.split()) for e in x.split('.')]))


  0%|          | 0/7165 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["mean_num_unique_words"] = df[text_col].progress_apply(lambda x: np.mean([len(set(e.split())) for e in x.split('.')]))


  0%|          | 0/7165 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["num_slash"] = df[text_col].progress_apply(lambda x: x.count("\n"))


  0%|          | 0/7165 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["paragraph_count"] = df[text_col].progress_apply(lambda x: x.count("\n\n"))


  0%|          | 0/7165 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["upper_count"] = df[text_col].progress_apply(lambda x: np.sum([w.isupper() for w in x.split()])/len(x.split()))


  0%|          | 0/7165 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["syntax_count"] = df[text_col].progress_apply(lambda x: x.count(",") + x.count("-") + x.count(";") + x.count(":"))


  0%|          | 0/7165 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["is_end_with_dot"] = df[text_col].progress_apply(lambda x: int(x[-1] == "."))


  0%|          | 0/7165 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["stopwords_rel"] = df[text_col].progress_apply(lambda x: get_stopwords_rel(x))


  0%|          | 0/7165 [00:00<?, ?it/s]

Let's see corr matrix:

In [None]:
NO_FEATURES = ["student_id", "prompt_id", "prompt_question", "prompt_title", "prompt_text", "text"]
TARGETS = ["content", "wording"]
FEATURES = [col for col in train.columns if col not in NO_FEATURES + TARGETS]

corr = train_feat[TARGETS + FEATURES].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,content,wording,has_almost_duplicate,num_unique_words,num_words,num_sentences,isupper,mean_num_words,mean_num_unique_words,num_slash,paragraph_count,upper_count,syntax_count,is_end_with_dot,stopwords_rel
content,1.0,0.75138,-0.106674,0.806767,0.792626,0.650758,0.133701,0.072942,0.123573,-0.019386,-0.020365,-0.056648,0.530829,0.017627,-0.126915
wording,0.75138,1.0,-0.118572,0.544271,0.536343,0.45328,0.151596,-0.049175,-0.003145,-0.043161,-0.035557,-0.043505,0.263644,0.067158,-0.056193
has_almost_duplicate,-0.106674,-0.118572,1.0,-0.047016,-0.050289,-0.052009,-0.034697,-0.000578,0.003961,0.03664,0.041191,-0.011511,0.027217,-0.014226,-0.048695
num_unique_words,0.806767,0.544271,-0.047016,1.0,0.981951,0.77366,0.066029,0.188446,0.244561,0.023555,0.007132,-0.045243,0.749152,-0.026137,-0.108172
num_words,0.792626,0.536343,-0.050289,0.981951,1.0,0.769209,0.054544,0.206381,0.24606,0.019315,0.00343,-0.044073,0.7441,-0.025955,-0.063116
num_sentences,0.650758,0.45328,-0.052009,0.77366,0.769209,1.0,0.080648,-0.242674,-0.225644,0.024248,0.012686,-0.010049,0.565643,0.028415,-0.254418
isupper,0.133701,0.151596,-0.034697,0.066029,0.054544,0.080648,1.0,-0.130661,-0.107555,-0.023311,-0.009534,0.027713,0.029402,0.055557,-0.136962
mean_num_words,0.072942,-0.049175,-0.000578,0.188446,0.206381,-0.242674,-0.130661,1.0,0.970844,-0.010526,-0.015144,-0.050575,0.170815,-0.181053,0.341735
mean_num_unique_words,0.123573,-0.003145,0.003961,0.244561,0.24606,-0.225644,-0.107555,0.970844,1.0,-0.009764,-0.014557,-0.051754,0.208938,-0.203701,0.305315
num_slash,-0.019386,-0.043161,0.03664,0.023555,0.019315,0.024248,-0.023311,-0.010526,-0.009764,1.0,0.81029,-0.001781,0.028108,-0.021674,-0.034665


Learning catboost and check metric:

In [None]:
from sklearn.model_selection import GroupKFold
from catboost import CatBoostRegressor


gfk = GroupKFold(n_splits=4)
train_oof = np.zeros((len(train_feat), 2))
test_pred = np.zeros((len(test_feat), 2))
X, y = train[FEATURES], train[TARGETS]
for train_index, val_index in gfk.split(train_feat, groups=train_feat["prompt_id"]):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]
    
    model = CatBoostRegressor(random_state=42, max_depth=4, 
                              objective="MultiRMSE")
    model.fit(X_train, y_train, eval_set=(X_val, y_val), metric_period=100)
    
    train_oof[val_index] = model.predict(X_val)
    test_pred +=  model.predict(test[FEATURES])/4

0:	learn: 1.4609067	test: 1.4214483	best: 1.4214483 (0)	total: 1.49ms	remaining: 1.49s
100:	learn: 0.9446117	test: 0.8770675	best: 0.8770675 (100)	total: 164ms	remaining: 1.46s
200:	learn: 0.9147492	test: 0.8551387	best: 0.8551387 (200)	total: 339ms	remaining: 1.35s
300:	learn: 0.8993352	test: 0.8511568	best: 0.8511568 (300)	total: 485ms	remaining: 1.13s
400:	learn: 0.8864320	test: 0.8486393	best: 0.8486393 (400)	total: 633ms	remaining: 946ms
500:	learn: 0.8764329	test: 0.8495621	best: 0.8486393 (400)	total: 767ms	remaining: 764ms
600:	learn: 0.8683174	test: 0.8503087	best: 0.8486393 (400)	total: 890ms	remaining: 591ms
700:	learn: 0.8609089	test: 0.8501671	best: 0.8486393 (400)	total: 1.02s	remaining: 434ms
800:	learn: 0.8547572	test: 0.8506058	best: 0.8486393 (400)	total: 1.14s	remaining: 283ms
900:	learn: 0.8482012	test: 0.8510134	best: 0.8486393 (400)	total: 1.28s	remaining: 141ms
999:	learn: 0.8418819	test: 0.8511304	best: 0.8486393 (400)	total: 1.45s	remaining: 0us

bestTest = 0.8

Checking competition metric:

In [None]:
compute_mcrmse((train_oof, train[TARGETS]))

  "content_rmse": col_rmse[0],
  "wording_rmse": col_rmse[1],


{'content_rmse': 0.5393559394901973,
 'wording_rmse': 0.8175663748779349,
 'mcrmse': 0.6784611571840662}

Not bad!We need submit it!

In [None]:
test

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,has_almost_duplicate,num_unique_words,...,num_sentences,isupper,mean_num_words,mean_num_unique_words,num_slash,paragraph_count,upper_count,syntax_count,is_end_with_dot,stopwords_rel
7165,000000ffffff,abc123,Example text 1,,,Summarize...,Example Title 1,Heading\nText...,1.0,3,...,1,True,3.0,3.0,0,0,0.0,0,0,0.0
7166,111111eeeeee,def789,Example text 2,,,Summarize...,Example Title 2,Heading\nText...,1.0,3,...,1,True,3.0,3.0,0,0,0.0,0,0,0.0
7167,222222cccccc,abc123,Example text 3,,,Summarize...,Example Title 1,Heading\nText...,1.0,3,...,1,True,3.0,3.0,0,0,0.0,0,0,0.0
7168,333333dddddd,def789,Example text 4,,,Summarize...,Example Title 2,Heading\nText...,1.0,3,...,1,True,3.0,3.0,0,0,0.0,0,0,0.0


In [None]:
sample_submission["content"] = test_pred[:, 0]
sample_submission["wording"] = test_pred[:, 1]

sample_submission.to_csv("submission.csv", index=False)

The main idea of this notebook, inspire comunity to not only train transformers, but to find new interesting solutions!