In [1]:
import pandas as pd

from ast import literal_eval

from tqdm import tqdm_notebook, tqdm

In [2]:
gold_standard = pd.read_csv(
    "gold_standard_annot_final.csv",
    sep=";",
    index_col="Unnamed: 0"
)
gold_standard["variants"] = gold_standard["variants"].apply(literal_eval)
gold_standard["Appropriate"] = gold_standard["Appropriate"].apply(literal_eval)
gold_standard["Too good"] = gold_standard["Too good"].apply(literal_eval)
gold_standard["Too bad"] = gold_standard["Too bad"].apply(literal_eval)

In [3]:
gold_standard

Unnamed: 0,Masked_sentence,Right_answer,Wrong_answer,Filename,Delete,variants,Appropriate,Too bad,Too good,Consistent,In duplicate names
153493,The amount of people who has no occupation in...,stable,the same,exam/Exam2017/OBy_100-200/2017_OBy_120_1,0,"[state, dependable, consistent, steady, prospe...","[consistent, harmonious, coherent]","[state, dependable, prosperous, volatile, reli...",[steady],True,False
55584,During the None six years the portion of rura...,rose,raised,exam/Old_Exam2015/2015_KT_15_1,1,"[raised, soared, surged, climbed, grew, fell, ...","[raised, soared, surged, jumped, leapt, leaped...","[fell, slipped, shrank, dipped, declined, prom...","[climbed, grew, ascended]",True,False
83294,Some politicians have come up with an idea to ...,disadvantages,backwards,exam/Exam2017/EGe_1-99/2017_EGe_19_2,0,"[cons, limitations, shortcomings, weaknesses, ...","[cons, limitations, weaknesses, pitfalls, prob...","[benefits, characteristics, alternatives, opti...","[shortcomings, risks, challenges, dangers, haz...",True,False
77723,"As for disadvantages, global warming and air ...",number,amount,exam/Exam2016/2016_MTsy_8_2,0,"[amount, quantity, level, part, value, member,...","[amount, quantity, count, rate, multiplicity]","[level, part, value, member, mark, category, p...","[proportion, multitude]",True,False
74220,It is slightly below 30°C in Yakutsk and 30°C...,trend,tendency,exam/Exam2017/ESa_1-69/2017_ESa_69_1,0,"[tendency, consistency, phenomenon, resurgence...","[tendency, upsurge, pattern, paradigm]","[consistency, phenomenon, resurgence, craze, f...",[shift],True,False
...,...,...,...,...,...,...,...,...,...,...,...
138436,7 minutes). Just in one case women's group of ...,doing,of goind,exam/Exam2020/Task_2_Essays_919_1896/2020_MLa_...,0,"[making, for, getting, pursuing, accomplishing...","[making, getting, pursuing, performing, going,...","[for, accomplishing, happening, seeing, indulg...","[enjoying, practicing]",True,False
73447,The decreasing unemployment in Latin America ...,acute,sharp,exam/Exam2017/NMya_1-108/2017_NMya_77_1,0,"[sharp, chronic, symptomatic, febrile, respira...","[sharp, incurable]","[chronic, symptomatic, febrile, respiratory, e...",[],True,False
160682,Low discipline in schools tends to result in ...,improving,repairing,exam/Old_Exam2014/2014_ZEv_5_2,0,"[repairing, enhancing, reducing, strengthening...","[repairing, enhancing, strengthening, boosting...","[reducing, alleviating, modernizing, lowering]","[maintaining, furthering, fostering, ensuring,...",True,False
93083,The increase of these health problems we are ...,eyesight,seeing,exam/Exam2017/OBy_1-99/2017_OBy_67_2,1,"[sight, vision, sanity, blindness, rheumatism,...",[],"[sanity, blindness, rheumatism, temper, comple...","[sight, vision]",True,False


Для итогового тестирования на золотом стандарте были отобраны модели с лучшей F1-мерой в различных условиях – модель XGBСlassifier, обученная на датасете со всеми признаками (XGBAllFeats), модель Случайного леса, обученная только на частотах (RandomForestFreqsOnly), модель CatBoost, обученная только на векторных представлениях (CatBoostVecsOnly) и модель CatBoost, обученная без учёта признака «Word2Vec-вектор слова-исправления» (CatBoostFeatDrop)

In [4]:
import os, pickle, json

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split

from typing import Any, List

In [5]:
df = pd.read_csv("processed_dataset_final.csv",sep=';',index_col="index")

train_sents, test_sents = train_test_split(
    df["sent_id"].unique(),
    test_size=0.2,
    random_state=42
)

df_train = df.loc[
    df["sent_id"].isin(train_sents)
]
df_test = df.loc[
    df["sent_id"].isin(test_sents)
]

X_train, y_train = df_train.drop(
    ["target", "target_true", "sent_id"],
    axis=1
), df_train["target_true"]
X_test, y_test = df_test.drop(
    ["target", "target_true", "sent_id"],
    axis=1,
), df_test["target_true"]

feats = [
    "bm",
    "wvc",
    "wve",
    "freq_corr", 
    "freq_err_corr",
    "freq_corr_corp",
    "freq_err_corp"
]

In [6]:
def train_and_save_clf(
    clf: Any,
    cols: List,
    clf_name: str
):
    clf.fit(X_train[cols], y_train)

    if not os.path.exists(clf_name):
        os.mkdir(clf_name)
    
    with open(f"{clf_name}/clf.pkl", 'wb') as outp:
        pickle.dump(clf, outp)
    
    with open(f"{clf_name}/cols.json", 'w', encoding='utf8') as outp:
        json.dump(cols, outp, ensure_ascii=False)

In [7]:
# XGBAllFeats
cols = [col for col in X_train.columns]
XGBAllFeats = XGBClassifier(random_state=42)
train_and_save_clf(XGBAllFeats, cols, "XGBAllFeats")

In [8]:
#RandomForestFreqsOnly
cols = ["freq_err_corp","freq_err_corr","freq_corr","freq_corr_corp"]
RandomForestFreqsOnly = RandomForestClassifier(random_state=42)
train_and_save_clf(RandomForestFreqsOnly, cols, "RandomForestFreqsOnly")

In [9]:
#CatBoostVecsOnly
cols = [
    col for col in X_train.columns if col not in
    ["freq_err_corp","freq_err_corr","freq_corr","freq_corr_corp"]
]
CatBoostVecsOnly = CatBoostClassifier(random_state=42)
train_and_save_clf(CatBoostVecsOnly, cols, "CatBoostVecsOnly")

Learning rate set to 0.014415
0:	learn: 0.6883157	total: 442ms	remaining: 7m 21s
1:	learn: 0.6843094	total: 680ms	remaining: 5m 39s
2:	learn: 0.6807621	total: 928ms	remaining: 5m 8s
3:	learn: 0.6775071	total: 1.17s	remaining: 4m 51s
4:	learn: 0.6735267	total: 1.41s	remaining: 4m 40s
5:	learn: 0.6693692	total: 1.63s	remaining: 4m 30s
6:	learn: 0.6666885	total: 1.87s	remaining: 4m 25s
7:	learn: 0.6633389	total: 2.1s	remaining: 4m 20s
8:	learn: 0.6597938	total: 2.33s	remaining: 4m 16s
9:	learn: 0.6562094	total: 2.57s	remaining: 4m 14s
10:	learn: 0.6534068	total: 2.8s	remaining: 4m 12s
11:	learn: 0.6507970	total: 3.04s	remaining: 4m 9s
12:	learn: 0.6487412	total: 3.27s	remaining: 4m 7s
13:	learn: 0.6463601	total: 3.5s	remaining: 4m 6s
14:	learn: 0.6437416	total: 3.73s	remaining: 4m 5s
15:	learn: 0.6413600	total: 3.96s	remaining: 4m 3s
16:	learn: 0.6391060	total: 4.24s	remaining: 4m 4s
17:	learn: 0.6373321	total: 4.46s	remaining: 4m 3s
18:	learn: 0.6345117	total: 4.69s	remaining: 4m 2s
19:	

In [10]:
#CatBoostFeatDrop
cols = [
    col for col in X_train.columns if not col.startswith("wvc")
]
CatBoostFeatDrop = CatBoostClassifier(random_state=42)
train_and_save_clf(CatBoostFeatDrop, cols, "CatBoostFeatDrop")


Learning rate set to 0.014415
0:	learn: 0.6895905	total: 271ms	remaining: 4m 30s
1:	learn: 0.6853986	total: 469ms	remaining: 3m 53s
2:	learn: 0.6811346	total: 688ms	remaining: 3m 48s
3:	learn: 0.6777428	total: 953ms	remaining: 3m 57s
4:	learn: 0.6743187	total: 1.17s	remaining: 3m 52s
5:	learn: 0.6706569	total: 1.38s	remaining: 3m 48s
6:	learn: 0.6676443	total: 1.58s	remaining: 3m 44s
7:	learn: 0.6642849	total: 1.79s	remaining: 3m 42s
8:	learn: 0.6613035	total: 2s	remaining: 3m 39s
9:	learn: 0.6582429	total: 2.2s	remaining: 3m 37s
10:	learn: 0.6553908	total: 2.39s	remaining: 3m 34s
11:	learn: 0.6523948	total: 2.59s	remaining: 3m 33s
12:	learn: 0.6495172	total: 2.79s	remaining: 3m 31s
13:	learn: 0.6472625	total: 2.99s	remaining: 3m 30s
14:	learn: 0.6435950	total: 3.18s	remaining: 3m 29s
15:	learn: 0.6403236	total: 3.39s	remaining: 3m 28s
16:	learn: 0.6377447	total: 3.58s	remaining: 3m 26s
17:	learn: 0.6351513	total: 3.78s	remaining: 3m 26s
18:	learn: 0.6329328	total: 3.97s	remaining: 3m 

Сохраним золотой стандарт:

In [11]:
gold_standard[
    ["Masked_sentence","Right_answer","Wrong_answer"]
].to_csv("gold_standard_input.csv", sep=";")

Вызовем из командной строки:

In [12]:
!python -m distractor_generator --filename gold_standard_input.csv --clf_path XGBAllFeats/clf.pkl --cols_path XGBAllFeats/cols.json --output_filename XGBAllFeats/output.csv

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
!python -m distractor_generator --filename gold_standard_input.csv --clf_path RandomForestFreqsOnly/clf.pkl --cols_path RandomForestFreqsOnly/cols.json --output_filename RandomForestFreqsOnly/output.csv

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
!python -m distractor_generator --filename gold_standard_input.csv --clf_path CatBoostVecsOnly/clf.pkl --cols_path CatBoostVecsOnly/cols.json --output_filename CatBoostVecsOnly/output.csv

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
!python -m distractor_generator --filename gold_standard_input.csv --clf_path CatBoostFeatDrop/clf.pkl --cols_path CatBoostFeatDrop/cols.json --output_filename CatBoostFeatDrop/output.csv

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Теперь будем сравнивать результаты

Бейзлайн - без классификатора

In [16]:
lengths = gold_standard["variants"].apply(len)
lengths.describe()

count    88.000000
mean     19.977273
std       0.213201
min      18.000000
25%      20.000000
50%      20.000000
75%      20.000000
max      20.000000
Name: variants, dtype: float64

In [41]:
Table = []

In [42]:
share_appr = gold_standard["Appropriate"].apply(len).sum()/gold_standard["variants"].apply(len).sum()
share_tg = gold_standard["Too good"].apply(len).sum()/gold_standard["variants"].apply(len).sum()
share_tb = gold_standard["Too bad"].apply(len).sum()/gold_standard["variants"].apply(len).sum()
row = {
    "method": "Baseline (no clf)",
    "Appropriate": share_appr,
    "Too bad": share_tb,
    "Too good": share_tg
}
print(row)
Table.append(row)

{'method': 'Baseline (no clf)', 'Appropriate': 0.2616609783845279, 'Too bad': 0.6416382252559727, 'Too good': 0.09726962457337884}


In [43]:
gold_standard.index

Int64Index([153493,  55584,  83294,  77723,  74220,  61226,  53390, 159237,
            136331,  49472, 145079, 112217,  37867,  51411,  73260,  75261,
            147504, 139612,  70105, 166891, 155105, 163703, 136326, 112011,
             73501, 117358, 144854,  83859,   2824, 162996,  83857,  64171,
             72719,  95508,  53309,  51255, 104446,  97146, 147954,  81630,
             40272, 119391, 134577,  58525, 104023,  47846, 137728,  87222,
            151707, 150385, 101353, 142476,  69141, 164344,  40649, 156950,
             74783,  86582,  48139,  75292,  53275, 152859, 101634,  65050,
            102660,  99534,  71531, 158203,  82145, 163852, 160556, 154863,
             45811, 148945, 152775,  66435, 134088,  77717,  65130, 120146,
            104386,  82379, 153675, 138436,  73447, 160682,  93083,  42611],
           dtype='int64')

In [44]:
XGBAllFeats_out = pd.read_csv("XGBAllFeats/output.csv", index_col="Unnamed: 0", sep=';')
RandomForestFreqsOnly_out = pd.read_csv("RandomForestFreqsOnly/output.csv", index_col="Unnamed: 0", sep=';')
CatBoostVecsOnly_out = pd.read_csv("CatBoostVecsOnly/output.csv", index_col="Unnamed: 0", sep=';')
CatBoostFeatDrop_out = pd.read_csv("CatBoostFeatDrop/output.csv", index_col="Unnamed: 0", sep=';')


In [45]:
list(XGBAllFeats_out.index) == list(gold_standard.index)

True

In [46]:
list(RandomForestFreqsOnly_out.index) == list(gold_standard.index)

True

In [47]:
list(CatBoostVecsOnly_out.index) == list(gold_standard.index)

True

In [48]:
list(CatBoostFeatDrop_out.index) == list(gold_standard.index)

True

In [50]:
XGBAllFeats_out

Unnamed: 0,Masked_sentence,Right_answer,Wrong_answer,variants
153493,The amount of people who has no occupation in...,stable,the same,"['state', 'dependable', 'consistent', 'prosper..."
55584,During the None six years the portion of rura...,rose,raised,[]
83294,Some politicians have come up with an idea to ...,disadvantages,backwards,"['cons', 'limitations', 'shortcomings', 'weakn..."
77723,"As for disadvantages, global warming and air ...",number,amount,"['amount', 'quantity', 'level', 'part', 'value..."
74220,It is slightly below 30°C in Yakutsk and 30°C...,trend,tendency,"['tendency', 'craze', 'upsurge', 'pattern', 'p..."
...,...,...,...,...
138436,7 minutes). Just in one case women's group of ...,doing,of goind,"['making', 'for', 'getting', 'pursuing', 'acco..."
73447,The decreasing unemployment in Latin America ...,acute,sharp,"['sharp', 'symptomatic', 'febrile', 'myeloid',..."
160682,Low discipline in schools tends to result in ...,improving,repairing,"['repairing', 'enhancing', 'boosting', 'mainta..."
93083,The increase of these health problems we are ...,eyesight,seeing,"['sight', 'rheumatism', 'temper', 'complexion'..."


In [51]:
def estimate_output(
    output_df: pd.DataFrame,
    method: str
):
    output_df["variants"] = output_df["variants"].apply(literal_eval)
    s = output_df["variants"].apply(len).sum()
    appr, tg, tb = 0, 0, 0
    for idx in gold_standard.index:
        appr += len(set(output_df.loc[idx]["variants"]) & set(gold_standard.loc[idx]["Appropriate"]))
        tg += len(set(output_df.loc[idx]["variants"]) & set(gold_standard.loc[idx]["Too good"]))
        tb += len(set(output_df.loc[idx]["variants"]) & set(gold_standard.loc[idx]["Too bad"]))
    
    return {
        "method": method,
        "Appropriate": appr/s,
        "Too bad": tb/s,
        "Too good": tg/s
    }


In [52]:
for df, name in zip(
    [XGBAllFeats_out, RandomForestFreqsOnly_out, CatBoostVecsOnly_out, CatBoostFeatDrop_out],
    ["XGBAllFeats_out", "RandomForestFreqsOnly_out", "CatBoostVecsOnly_out", "CatBoostFeatDrop_out"]
):
    Table.append(estimate_output(df,name))

In [28]:
Table = pd.DataFrame(Table)

In [29]:
Table.sort_values(by=["Appropriate"], ascending=False)

Unnamed: 0,method,Appropriate,Too bad,Too good
4,CatBoostFeatDrop_out,0.279014,0.612083,0.108903
2,RandomForestFreqsOnly_out,0.27766,0.634043,0.088298
1,XGBAllFeats_out,0.268016,0.631579,0.100405
0,Baseline (no clf),0.261661,0.641638,0.09727
3,CatBoostVecsOnly_out,0.260652,0.634085,0.105263


In [30]:
Table.to_excel("gold_standard_performance.xlsx")

Ещё - можно попробовать поиграться с N - брать от 3 до 20 - можно заново не проводить классификацию, просто брать дистракторы в порядке старшинства

In [31]:
gold_standard["variants"]

153493    [state, dependable, consistent, steady, prospe...
55584     [raised, soared, surged, climbed, grew, fell, ...
83294     [cons, limitations, shortcomings, weaknesses, ...
77723     [amount, quantity, level, part, value, member,...
74220     [tendency, consistency, phenomenon, resurgence...
                                ...                        
138436    [making, for, getting, pursuing, accomplishing...
73447     [sharp, chronic, symptomatic, febrile, respira...
160682    [repairing, enhancing, reducing, strengthening...
93083     [sight, vision, sanity, blindness, rheumatism,...
42611     [meal, goal, seafood, meat, nutrition, beverag...
Name: variants, Length: 88, dtype: object

In [32]:
def estimate_ouput_with_N(
    output_df: pd.DataFrame,
    N: int,
    method: str
):
    gs1 = gold_standard.copy()
    df1 = output_df.copy()

    gs1["variants"] = gs1["variants"].apply(lambda x: x[:N])
    gs1["Appropriate"] = gs1.apply(
        lambda x: [i for i in x["Appropriate"] if i in x["variants"]],
        axis=1
    )
    gs1["Too good"] = gs1.apply(
        lambda x: [i for i in x["Too good"] if i in x["variants"]],
        axis=1
    )
    gs1["Too bad"] = gs1.apply(
        lambda x: [i for i in x["Too bad"] if i in x["variants"]],
        axis=1
    )

    df1["variants"] = df1.apply(
        lambda x: [i for i in x["variants"] if i in gs1.loc[x.name]["variants"]],
        axis=1
    )

    s = df1["variants"].apply(len).sum()
    c = df1["variants"].apply(len).mean()
    appr, tg, tb = 0, 0, 0
    appr_sent = []
    tg_sent = []
    tb_sent = []

    for idx in gs1.index:
        appr_i = len(set(df1.loc[idx]["variants"]) & set(gs1.loc[idx]["Appropriate"]))
        tg_i = len(set(df1.loc[idx]["variants"]) & set(gs1.loc[idx]["Too good"]))
        tb_i = len(set(df1.loc[idx]["variants"]) & set(gs1.loc[idx]["Too bad"]))

        appr += appr_i
        tg += tg_i
        tb += tb_i

        if df1.loc[idx]["variants"]:
            appr_sent.append(appr_i/len(df1.loc[idx]["variants"]))
            tg_sent.append(tg_i/len(df1.loc[idx]["variants"]))
            tb_sent.append(tb_i/len(df1.loc[idx]["variants"]))
        else:
            appr_sent.append(0)
            tg_sent.append(0)
            tb_sent.append(0)
        
    
    return {
        "method": method,
        "N": N,
        "Appropriate (whole)": appr/s,
        "Too bad (whole)": tb/s,
        "Too good (whole)": tg/s,
        "Appropriate (by sent)": pd.Series(appr_sent).mean(),
        "Too bad (by sent)": pd.Series(tb_sent).mean(),
        "Too good (by sent)": pd.Series(tg_sent).mean(),
        "N distractors": c
    }


Будем перебирать N:

In [33]:
Table1 = []

for N in tqdm_notebook(range(3, 21), total=18):
    for df, name in zip(
        [gold_standard, XGBAllFeats_out, RandomForestFreqsOnly_out, CatBoostVecsOnly_out, CatBoostFeatDrop_out],
        ["Baseline (no clf)", "XGBAllFeats_out", "RandomForestFreqsOnly_out", "CatBoostVecsOnly_out", "CatBoostFeatDrop_out"]
    ):
        Table1.append(
            estimate_ouput_with_N(df, N, name)
        )

Table1 = pd.DataFrame(Table1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for N in tqdm_notebook(range(3, 21), total=18):


  0%|          | 0/18 [00:00<?, ?it/s]

In [34]:
Table1.sort_values(by=["Appropriate (whole)"], ascending=False)

Unnamed: 0,method,N,Appropriate (whole),Too bad (whole),Too good (whole),Appropriate (by sent),Too bad (by sent),Too good (by sent),N distractors
1,XGBAllFeats_out,3,0.552381,0.328571,0.119048,0.541667,0.293561,0.142045,2.386364
3,CatBoostVecsOnly_out,3,0.540541,0.329730,0.129730,0.522727,0.289773,0.119318,2.102273
2,RandomForestFreqsOnly_out,3,0.536313,0.324022,0.139665,0.477273,0.285985,0.123106,2.034091
4,CatBoostFeatDrop_out,3,0.530973,0.331858,0.137168,0.547348,0.316288,0.136364,2.568182
8,CatBoostVecsOnly_out,4,0.516529,0.355372,0.128099,0.504735,0.307765,0.130682,2.750000
...,...,...,...,...,...,...,...,...,...
80,Baseline (no clf),19,0.269300,0.630162,0.100539,0.269139,0.630383,0.100478,18.988636
86,XGBAllFeats_out,20,0.268016,0.631579,0.100405,0.298722,0.595017,0.094897,14.034091
83,CatBoostVecsOnly_out,19,0.266667,0.625439,0.107895,0.315630,0.576335,0.096672,12.954545
85,Baseline (no clf),20,0.261661,0.641069,0.097270,0.261364,0.641477,0.097159,19.977273


In [35]:
Table1.sort_values(by=["Appropriate (by sent)"], ascending=False)

Unnamed: 0,method,N,Appropriate (whole),Too bad (whole),Too good (whole),Appropriate (by sent),Too bad (by sent),Too good (by sent),N distractors
4,CatBoostFeatDrop_out,3,0.530973,0.331858,0.137168,0.547348,0.316288,0.136364,2.568182
1,XGBAllFeats_out,3,0.552381,0.328571,0.119048,0.541667,0.293561,0.142045,2.386364
3,CatBoostVecsOnly_out,3,0.540541,0.329730,0.129730,0.522727,0.289773,0.119318,2.102273
9,CatBoostFeatDrop_out,4,0.496528,0.368056,0.135417,0.515152,0.350379,0.134470,3.272727
6,XGBAllFeats_out,4,0.509158,0.377289,0.113553,0.515152,0.345644,0.116477,3.102273
...,...,...,...,...,...,...,...,...,...
65,Baseline (no clf),16,0.293324,0.601562,0.105114,0.293324,0.601562,0.105114,16.000000
70,Baseline (no clf),17,0.285428,0.610294,0.104278,0.285428,0.610294,0.104278,17.000000
75,Baseline (no clf),18,0.275253,0.622475,0.102273,0.275253,0.622475,0.102273,18.000000
80,Baseline (no clf),19,0.269300,0.630162,0.100539,0.269139,0.630383,0.100478,18.988636


In [36]:
Table1.loc[Table1["N distractors"]>3].sort_values(by=["Appropriate (by sent)"], ascending=False)

Unnamed: 0,method,N,Appropriate (whole),Too bad (whole),Too good (whole),Appropriate (by sent),Too bad (by sent),Too good (by sent),N distractors
9,CatBoostFeatDrop_out,4,0.496528,0.368056,0.135417,0.515152,0.350379,0.134470,3.272727
6,XGBAllFeats_out,4,0.509158,0.377289,0.113553,0.515152,0.345644,0.116477,3.102273
11,XGBAllFeats_out,5,0.473214,0.416667,0.110119,0.482955,0.384091,0.110227,3.818182
13,CatBoostVecsOnly_out,5,0.470588,0.408497,0.120915,0.475568,0.357576,0.121402,3.477273
5,Baseline (no clf),4,0.474432,0.397727,0.127841,0.474432,0.397727,0.127841,4.000000
...,...,...,...,...,...,...,...,...,...
65,Baseline (no clf),16,0.293324,0.601562,0.105114,0.293324,0.601562,0.105114,16.000000
70,Baseline (no clf),17,0.285428,0.610294,0.104278,0.285428,0.610294,0.104278,17.000000
75,Baseline (no clf),18,0.275253,0.622475,0.102273,0.275253,0.622475,0.102273,18.000000
80,Baseline (no clf),19,0.269300,0.630162,0.100539,0.269139,0.630383,0.100478,18.988636


In [37]:
Table1.to_csv("ParamAndClfSelection.csv", sep=';')
Table1.to_excel("ParamAndClfSelection.xlsx", float_format="%.4f")

Получим аутпут от лучшей модели:

In [2]:
!python -m distractor_generator --n 4 --filename gold_standard_input.csv --clf_path XGBAllFeats/clf.pkl --cols_path XGBAllFeats/cols.json --output_filename best_model_output.csv

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
