In [1]:
import os

import pandas as pd

from ast import literal_eval

# `tqdm.tqdm_notebook` is deprecated ("Please use `tqdm.notebook.tqdm` instead");
# import the notebook progress bar directly but keep the name used further down.
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# All paths in this notebook are relative to the parent directory, which holds
# `gold_standard/`. The guard makes the cell safe to re-run: once the working
# directory already contains `gold_standard/`, we do not climb any higher.
if not os.path.isdir("gold_standard"):
    os.chdir("..")

In [2]:
# Load the annotated gold standard; the CSV is ';'-separated and keeps the row
# ids in the unnamed first column.
gold_standard = pd.read_csv(
    "gold_standard/gold_standard_annot_final.csv",
    index_col="Unnamed: 0",
    sep=";",
)
# The list-valued columns are stored as Python literals, so parse them back
# into real lists (column order is unchanged because the columns already exist).
gold_standard = gold_standard.assign(**{
    list_column: gold_standard[list_column].apply(literal_eval)
    for list_column in ("variants", "Appropriate", "Too good", "Too bad")
})

In [3]:
gold_standard

Unnamed: 0,Masked_sentence,Right_answer,Wrong_answer,Filename,Delete,variants,Appropriate,Too bad,Too good,Consistent,In duplicate names
153493,The amount of people who has no occupation in...,stable,the same,exam/Exam2017/OBy_100-200/2017_OBy_120_1,0,"[state, dependable, consistent, steady, prospe...","[consistent, harmonious, coherent]","[state, dependable, prosperous, volatile, reli...",[steady],True,False
83294,Some politicians have come up with an idea to ...,disadvantages,backwards,exam/Exam2017/EGe_1-99/2017_EGe_19_2,0,"[cons, limitations, shortcomings, weaknesses, ...","[cons, limitations, weaknesses, pitfalls, prob...","[benefits, characteristics, alternatives, opti...","[shortcomings, risks, challenges, dangers, haz...",True,False
77723,"As for disadvantages, global warming and air ...",number,amount,exam/Exam2016/2016_MTsy_8_2,0,"[amount, quantity, level, part, value, member,...","[amount, quantity, count, rate, multiplicity]","[level, part, value, member, mark, category, p...","[proportion, multitude]",True,False
74220,It is slightly below 30°C in Yakutsk and 30°C...,trend,tendency,exam/Exam2017/ESa_1-69/2017_ESa_69_1,0,"[tendency, consistency, phenomenon, resurgence...","[tendency, upsurge, pattern, paradigm]","[consistency, phenomenon, resurgence, craze, f...",[shift],True,False
53390,The number of men who are aged between 15 and...,number,part,exam/Exam2014/2014_EPa_22_1,0,"[amount, quantity, level, value, member, count...","[amount, quantity, count, rate, portion, total]","[level, value, member, mark, category, quality...",[proportion],True,False
...,...,...,...,...,...,...,...,...,...,...,...
153675,"So, the national population usually becomes p...",take,move,exam/Exam2020/Task_2_Essays_919_1896/2020_MLa_...,0,"[make, drink, get, move, share, go, taking, gi...","[get, move, hold, bring, send, carry]","[make, drink, share, go, taking, give, relinqu...",[keep],True,False
138436,7 minutes). Just in one case women's group of ...,doing,of goind,exam/Exam2020/Task_2_Essays_919_1896/2020_MLa_...,0,"[making, for, getting, pursuing, accomplishing...","[making, getting, pursuing, performing, going,...","[for, accomplishing, happening, seeing, indulg...","[enjoying, practicing]",True,False
73447,The decreasing unemployment in Latin America ...,acute,sharp,exam/Exam2017/NMya_1-108/2017_NMya_77_1,0,"[sharp, chronic, symptomatic, febrile, respira...","[sharp, incurable]","[chronic, symptomatic, febrile, respiratory, e...",[],True,False
160682,Low discipline in schools tends to result in ...,improving,repairing,exam/Old_Exam2014/2014_ZEv_5_2,0,"[repairing, enhancing, reducing, strengthening...","[repairing, enhancing, strengthening, boosting...","[reducing, alleviating, modernizing, lowering]","[maintaining, furthering, fostering, ensuring,...",True,False


Для итогового тестирования на золотом стандарте были отобраны модели с лучшей F1-мерой в различных условиях: модель XGBClassifier, обученная на датасете со всеми признаками (XGBAllFeats), модель случайного леса, обученная только на частотах (RandomForestFreqsOnly), модель CatBoost, обученная только на векторных представлениях (CatBoostVecsOnly), и модель CatBoost, обученная без учёта признака «Word2Vec-вектор слова-исправления» (CatBoostFeatDrop).

In [4]:
import os, pickle, json

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split

from typing import Any, List

In [5]:
# Processed feature table: ';'-separated, row ids in the "index" column.
df = pd.read_csv("data/processed_dataset_final.csv", sep=';', index_col="index")

# Split on sentence ids rather than on rows, so every row of a given sentence
# ends up on the same side of the split.
train_sents, test_sents = train_test_split(
    df["sent_id"].unique(), test_size=0.2, random_state=42
)

df_train = df[df["sent_id"].isin(train_sents)]
df_test = df[df["sent_id"].isin(test_sents)]

# Features are everything except the two label columns and the sentence id;
# the label used for training is "target_true".
X_train = df_train.drop(columns=["target", "target_true", "sent_id"])
y_train = df_train["target_true"]
X_test = df_test.drop(columns=["target", "target_true", "sent_id"])
y_test = df_test["target_true"]

feats = [
    "bm", "wvc", "wve",
    "freq_corr", "freq_err_corr", "freq_corr_corp", "freq_err_corp",
]

In [6]:
def train_and_save_clf(
    clf: Any,
    cols: List,
    clf_name: str
):
    """Fit ``clf`` on selected training columns and persist it to disk.

    The estimator is trained on the notebook-level ``X_train[cols]`` and
    ``y_train``. Two files are written into the directory ``clf_name``:
    ``clf.pkl`` (the pickled fitted estimator) and ``cols.json`` (the list of
    feature columns it was trained on).

    Args:
        clf: Estimator exposing ``fit(X, y)``; it is fitted in place.
        cols: Names of the ``X_train`` columns to train on.
        clf_name: Output directory; created (with parents) if it is missing.
    """
    cols = list(cols)  # accept any iterable of names; JSON needs a plain list
    clf.fit(X_train[cols], y_train)

    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it also
    # creates missing parent directories and tolerates an existing directory.
    os.makedirs(clf_name, exist_ok=True)

    with open(f"{clf_name}/clf.pkl", 'wb') as outp:
        pickle.dump(clf, outp)

    with open(f"{clf_name}/cols.json", 'w', encoding='utf8') as outp:
        json.dump(cols, outp, ensure_ascii=False)

In [7]:
# XGBAllFeats: XGBoost classifier trained on every column of X_train
cols = list(X_train.columns)
XGBAllFeats = XGBClassifier(random_state=42)
train_and_save_clf(clf=XGBAllFeats, cols=cols, clf_name="XGBAllFeats")

In [8]:
# RandomForestFreqsOnly: random forest trained on the four frequency columns only
cols = [
    "freq_err_corp",
    "freq_err_corr",
    "freq_corr",
    "freq_corr_corp",
]
RandomForestFreqsOnly = RandomForestClassifier(random_state=42)
train_and_save_clf(
    clf=RandomForestFreqsOnly,
    cols=cols,
    clf_name="RandomForestFreqsOnly",
)

In [9]:
# CatBoostVecsOnly: CatBoost trained on every column except the four frequency columns
cols = [
    column
    for column in X_train.columns
    if column not in ("freq_err_corp", "freq_err_corr", "freq_corr", "freq_corr_corp")
]
CatBoostVecsOnly = CatBoostClassifier(random_state=42)
train_and_save_clf(clf=CatBoostVecsOnly, cols=cols, clf_name="CatBoostVecsOnly")

Learning rate set to 0.014415
0:	learn: 0.6883157	total: 430ms	remaining: 7m 9s
1:	learn: 0.6843094	total: 779ms	remaining: 6m 28s
2:	learn: 0.6807621	total: 1.15s	remaining: 6m 23s
3:	learn: 0.6775071	total: 1.54s	remaining: 6m 22s
4:	learn: 0.6735267	total: 1.89s	remaining: 6m 16s
5:	learn: 0.6693692	total: 2.35s	remaining: 6m 29s
6:	learn: 0.6666885	total: 2.77s	remaining: 6m 33s
7:	learn: 0.6633389	total: 3.11s	remaining: 6m 25s
8:	learn: 0.6597938	total: 3.44s	remaining: 6m 18s
9:	learn: 0.6562094	total: 3.81s	remaining: 6m 17s
10:	learn: 0.6534068	total: 4.12s	remaining: 6m 10s
11:	learn: 0.6507970	total: 4.5s	remaining: 6m 10s
12:	learn: 0.6487412	total: 4.83s	remaining: 6m 6s
13:	learn: 0.6463601	total: 5.19s	remaining: 6m 5s
14:	learn: 0.6437416	total: 5.52s	remaining: 6m 2s
15:	learn: 0.6413600	total: 5.94s	remaining: 6m 5s
16:	learn: 0.6391060	total: 6.29s	remaining: 6m 3s
17:	learn: 0.6373321	total: 6.64s	remaining: 6m 2s
18:	learn: 0.6345117	total: 6.98s	remaining: 6m
19:	

In [10]:
# CatBoostFeatDrop: CatBoost trained without the columns whose name starts with "wvc"
cols = [
    column
    for column in X_train.columns
    if not column.startswith("wvc")
]
CatBoostFeatDrop = CatBoostClassifier(random_state=42)
train_and_save_clf(clf=CatBoostFeatDrop, cols=cols, clf_name="CatBoostFeatDrop")


Learning rate set to 0.014415
0:	learn: 0.6895905	total: 238ms	remaining: 3m 57s
1:	learn: 0.6853986	total: 513ms	remaining: 4m 16s
2:	learn: 0.6811346	total: 862ms	remaining: 4m 46s
3:	learn: 0.6777428	total: 1.15s	remaining: 4m 47s
4:	learn: 0.6743187	total: 1.45s	remaining: 4m 49s
5:	learn: 0.6706569	total: 1.83s	remaining: 5m 3s
6:	learn: 0.6676443	total: 2.15s	remaining: 5m 5s
7:	learn: 0.6642849	total: 2.47s	remaining: 5m 6s
8:	learn: 0.6613035	total: 2.76s	remaining: 5m 3s
9:	learn: 0.6582429	total: 3.09s	remaining: 5m 5s
10:	learn: 0.6553908	total: 3.4s	remaining: 5m 6s
11:	learn: 0.6523948	total: 3.71s	remaining: 5m 5s
12:	learn: 0.6495172	total: 4.07s	remaining: 5m 8s
13:	learn: 0.6472625	total: 4.35s	remaining: 5m 6s
14:	learn: 0.6435950	total: 4.66s	remaining: 5m 5s
15:	learn: 0.6403236	total: 4.93s	remaining: 5m 2s
16:	learn: 0.6377447	total: 5.2s	remaining: 5m
17:	learn: 0.6351513	total: 5.5s	remaining: 5m
18:	learn: 0.6329328	total: 5.76s	remaining: 4m 57s
19:	learn: 0.6

Сохраним золотой стандарт:

In [11]:
# Export only the three input columns (no annotation) for the distractor generator
gold_standard.loc[
    :, ["Masked_sentence", "Right_answer", "Wrong_answer"]
].to_csv(
    "gold_standard/gold_standard_input.csv",
    sep=";",
)

Вызовем генератор дистракторов из командной строки для каждой из моделей:

In [13]:
!python -m distractor_generator --filename gold_standard/gold_standard_input.csv --clf_path XGBAllFeats/clf.pkl --cols_path XGBAllFeats/cols.json --output_filename XGBAllFeats/output.csv

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

  0%|          | 0/76 [00:00<?, ?it/s]
 11%|█         | 8/76 [00:00<00:00, 77.18it/s]
 21%|██        | 16/76 [00:00<00:00, 74.50it/s]
 32%|███▏      | 24/76

In [14]:
!python -m distractor_generator --filename gold_standard/gold_standard_input.csv --clf_path RandomForestFreqsOnly/clf.pkl --cols_path RandomForestFreqsOnly/cols.json --output_filename RandomForestFreqsOnly/output.csv

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

  0%|          | 0/76 [00:00<?, ?it/s]
 11%|█         | 8/76 [00:00<00:00, 76.92it/s]
 21%|██        | 16/76 [00:00<00:00, 73.20it/s]
 32%|███▏      | 24/76

In [15]:
!python -m distractor_generator --filename gold_standard/gold_standard_input.csv --clf_path CatBoostVecsOnly/clf.pkl --cols_path CatBoostVecsOnly/cols.json --output_filename CatBoostVecsOnly/output.csv

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

  0%|          | 0/76 [00:00<?, ?it/s]
 11%|█         | 8/76 [00:00<00:00, 80.00it/s]
 21%|██        | 16/76 [00:00<00:00, 77.18it/s]
 32%|███▏      | 24/76

In [16]:
!python -m distractor_generator --filename gold_standard/gold_standard_input.csv --clf_path CatBoostFeatDrop/clf.pkl --cols_path CatBoostFeatDrop/cols.json --output_filename CatBoostFeatDrop/output.csv

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

  0%|          | 0/76 [00:00<?, ?it/s]
 11%|█         | 8/76 [00:00<00:00, 74.77it/s]
 21%|██        | 16/76 [00:00<00:00, 74.77it/s]
 32%|███▏      | 24/76

Теперь будем сравнивать результаты

Бейзлайн — без классификатора.

In [17]:
# Number of candidate variants per gold-standard item, then its summary statistics
lengths = gold_standard["variants"].map(len)
lengths.describe()

count    76.0
mean     20.0
std       0.0
min      20.0
25%      20.0
50%      20.0
75%      20.0
max      20.0
Name: variants, dtype: float64

In [18]:
# Result rows are collected here, one dict per evaluated method
Table = []

In [19]:
# Baseline: shares of each annotation class among all candidate variants,
# i.e. what one gets without any classifier filtering.
share_appr, share_tg, share_tb = (
    gold_standard[label].apply(len).sum() / gold_standard["variants"].apply(len).sum()
    for label in ("Appropriate", "Too good", "Too bad")
)
row = dict([
    ("method", "Baseline (no clf)"),
    ("Appropriate", share_appr),
    ("Too bad", share_tb),
    ("Too good", share_tg),
    # "(raw)" entries are the mean number of items of that class per sentence
    ("Appropriate (raw)", gold_standard["Appropriate"].apply(len).mean()),
    ("Too bad (raw)", gold_standard["Too bad"].apply(len).mean()),
    ("Too good (raw)", gold_standard["Too good"].apply(len).mean()),
])
print(row)
Table.append(row)

{'method': 'Baseline (no clf)', 'Appropriate': 0.2710526315789474, 'Too bad': 0.6328947368421053, 'Too good': 0.09605263157894736, 'Appropriate (raw)': 5.421052631578948, 'Too bad (raw)': 12.657894736842104, 'Too good (raw)': 1.9210526315789473}


In [20]:
gold_standard.index

Int64Index([153493,  83294,  77723,  74220,  53390, 159237, 136331,  49472,
            145079, 112217,  37867,  51411,  73260,  75261, 147504, 139612,
             70105, 166891, 163703, 136326, 112011,  73501, 117358, 144854,
            162996,  83857,  64171,  72719,  95508,  53309,  51255, 104446,
             81630, 119391, 134577,  58525, 104023,  47846, 137728,  87222,
            151707, 150385, 101353, 142476,  69141, 164344,  40649,  74783,
             86582,  48139,  75292,  53275, 101634,  65050, 102660,  99534,
             71531, 158203,  82145, 163852, 160556, 154863,  45811, 148945,
            152775,  66435, 134088,  77717, 120146, 104386,  82379, 153675,
            138436,  73447, 160682,  42611],
           dtype='int64')

In [21]:
# Generator outputs, one frame per classifier; same layout as the input CSV
# (';'-separated, row ids in the unnamed first column).
XGBAllFeats_out = pd.read_csv(
    "XGBAllFeats/output.csv", sep=';', index_col="Unnamed: 0"
)
RandomForestFreqsOnly_out = pd.read_csv(
    "RandomForestFreqsOnly/output.csv", sep=';', index_col="Unnamed: 0"
)
CatBoostVecsOnly_out = pd.read_csv(
    "CatBoostVecsOnly/output.csv", sep=';', index_col="Unnamed: 0"
)
CatBoostFeatDrop_out = pd.read_csv(
    "CatBoostFeatDrop/output.csv", sep=';', index_col="Unnamed: 0"
)


In [22]:
list(XGBAllFeats_out.index) == list(gold_standard.index)

True

In [23]:
list(RandomForestFreqsOnly_out.index) == list(gold_standard.index)

True

In [24]:
list(CatBoostVecsOnly_out.index) == list(gold_standard.index)

True

In [25]:
list(CatBoostFeatDrop_out.index) == list(gold_standard.index)

True

In [26]:
XGBAllFeats_out

Unnamed: 0,Masked_sentence,Right_answer,Wrong_answer,variants
153493,The amount of people who has no occupation in...,stable,the same,"['state', 'dependable', 'consistent', 'steady'..."
83294,Some politicians have come up with an idea to ...,disadvantages,backwards,"['cons', 'limitations', 'shortcomings', 'weakn..."
77723,"As for disadvantages, global warming and air ...",number,amount,"['amount', 'quantity', 'level', 'part', 'value..."
74220,It is slightly below 30°C in Yakutsk and 30°C...,trend,tendency,"['tendency', 'consistency', 'craze', 'fad', 'u..."
53390,The number of men who are aged between 15 and...,number,part,"['amount', 'quantity', 'level', 'value', 'coun..."
...,...,...,...,...
153675,"So, the national population usually becomes p...",take,move,"['make', 'go', 'get', 'taking', 'relinquish', ..."
138436,7 minutes). Just in one case women's group of ...,doing,of goind,"['making', 'for', 'getting', 'pursuing', 'acco..."
73447,The decreasing unemployment in Latin America ...,acute,sharp,"['sharp', 'chronic', 'symptomatic', 'febrile',..."
160682,Low discipline in schools tends to result in ...,improving,repairing,"['repairing', 'enhancing', 'boosting', 'mainta..."


In [27]:
def estimate_output(
    output_df: pd.DataFrame,
    method: str
):
    """Score one generator output against the gold-standard annotation.

    For every gold-standard item, the variants kept in ``output_df`` are
    intersected with the item's "Appropriate", "Too good" and "Too bad" lists;
    the summed intersection sizes are divided by the total number of kept
    variants.

    Side effect: ``output_df["variants"]`` is parsed from its string form into
    lists *in place*. Other code in this notebook reads the parsed lists from
    the same frame, so the write-back is kept deliberately. Values that are
    already lists are left untouched, which makes repeated calls safe.

    Args:
        output_df: Generator output indexed like ``gold_standard`` with a
            "variants" column (stringified lists or lists).
        method: Label stored in the returned row.

    Returns:
        Dict with keys "method", "Appropriate", "Too bad", "Too good".
    """
    # Parse only cells that are still strings; literal_eval on a list raises.
    output_df["variants"] = output_df["variants"].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )
    s = output_df["variants"].apply(len).sum()
    appr, tg, tb = 0, 0, 0
    for idx in gold_standard.index:
        chosen = set(output_df.loc[idx, "variants"])
        appr += len(chosen & set(gold_standard.loc[idx, "Appropriate"]))
        tg += len(chosen & set(gold_standard.loc[idx, "Too good"]))
        tb += len(chosen & set(gold_standard.loc[idx, "Too bad"]))

    return {
        "method": method,
        "Appropriate": appr/s,
        "Too bad": tb/s,
        "Too good": tg/s
    }


In [28]:
# The loop variable is `out_df`, not `df`, so the notebook-level `df` (the
# dataset loaded earlier) is not silently overwritten by the last model output.
for out_df, name in zip(
    [XGBAllFeats_out, RandomForestFreqsOnly_out, CatBoostVecsOnly_out, CatBoostFeatDrop_out],
    ["XGBAllFeats_out", "RandomForestFreqsOnly_out", "CatBoostVecsOnly_out", "CatBoostFeatDrop_out"]
):
    Table.append(estimate_output(out_df, name))

In [29]:
# Turn the collected result rows into a DataFrame (rebinds `Table` from a list to a frame)
Table = pd.DataFrame(Table)

In [30]:
Table.sort_values(by=["Appropriate"], ascending=False)

Unnamed: 0,method,Appropriate,Too bad,Too good,Appropriate (raw),Too bad (raw),Too good (raw)
1,XGBAllFeats_out,0.292776,0.601711,0.103612,,,
2,RandomForestFreqsOnly_out,0.291705,0.60711,0.097539,,,
4,CatBoostFeatDrop_out,0.289116,0.60119,0.106293,,,
3,CatBoostVecsOnly_out,0.275404,0.614435,0.108262,,,
0,Baseline (no clf),0.271053,0.632895,0.096053,5.421053,12.657895,1.921053


In [31]:
Table.to_excel("data/gold_standard_performance.xlsx")

Кроме того, можно варьировать N — число рассматриваемых кандидатов — от 3 до 20. Заново проводить классификацию при этом не нужно: достаточно брать первые N дистракторов в порядке их ранжирования.

In [32]:
gold_standard["variants"]

153493    [state, dependable, consistent, steady, prospe...
83294     [cons, limitations, shortcomings, weaknesses, ...
77723     [amount, quantity, level, part, value, member,...
74220     [tendency, consistency, phenomenon, resurgence...
53390     [amount, quantity, level, value, member, count...
                                ...                        
153675    [make, drink, get, move, share, go, taking, gi...
138436    [making, for, getting, pursuing, accomplishing...
73447     [sharp, chronic, symptomatic, febrile, respira...
160682    [repairing, enhancing, reducing, strengthening...
42611     [meal, goal, seafood, meat, nutrition, beverag...
Name: variants, Length: 76, dtype: object

In [33]:
def estimate_ouput_with_N(
    output_df: pd.DataFrame,
    N: int,
    method: str
):
    """Score a generator output when only the first ``N`` candidates are considered.

    The gold-standard "variants" lists are cut to their first ``N`` entries and
    the three annotation lists are restricted to those entries. The variants in
    ``output_df`` are restricted the same way, then compared with the
    annotation. Neither ``gold_standard`` nor ``output_df`` is modified; the
    function works on copies.

    Args:
        output_df: Frame indexed like ``gold_standard`` with a "variants"
            column holding lists, or their string form as read from CSV.
        N: Number of top candidates to keep per item.
        method: Label stored in the returned row.

    Returns:
        Dict with the method label, ``N``, pooled shares ("whole"), mean
        per-sentence shares ("by sent"), mean per-sentence counts ("raw mean
        by sent") and the mean number of remaining variants ("N distractors").
    """
    labels = ("Appropriate", "Too good", "Too bad")

    gs1 = gold_standard.copy()
    df1 = output_df.copy()

    # Parse stringified lists here so the result does not depend on another
    # cell having converted this frame beforehand; lists pass through as is.
    df1["variants"] = df1["variants"].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

    gs1["variants"] = gs1["variants"].apply(lambda x: x[:N])
    for label in labels:
        gs1[label] = [
            [word for word in words if word in kept]
            for words, kept in zip(gs1[label], gs1["variants"])
        ]

    # Keep only those output variants that are among the item's first N candidates.
    top_n = dict(zip(gs1.index, gs1["variants"]))
    df1["variants"] = [
        [word for word in variants if word in top_n[row_id]]
        for row_id, variants in zip(df1.index, df1["variants"])
    ]

    kept_counts = df1["variants"].apply(len)
    s = kept_counts.sum()
    c = kept_counts.mean()

    totals = dict.fromkeys(labels, 0)
    shares = {label: [] for label in labels}
    raws = {label: [] for label in labels}

    for idx in gs1.index:
        chosen = df1.loc[idx, "variants"]
        chosen_set = set(chosen)
        for label in labels:
            hits = len(chosen_set & set(gs1.loc[idx, label]))
            totals[label] += hits
            # An item with no remaining variants contributes a share of 0.
            shares[label].append(hits / len(chosen) if chosen else 0)
            raws[label].append(hits)

    return {
        "method": method,
        "N": N,
        "Appropriate (whole)": totals["Appropriate"]/s,
        "Too bad (whole)": totals["Too bad"]/s,
        "Too good (whole)": totals["Too good"]/s,
        "Appropriate (by sent)": pd.Series(shares["Appropriate"]).mean(),
        "Too bad (by sent)": pd.Series(shares["Too bad"]).mean(),
        "Too good (by sent)": pd.Series(shares["Too good"]).mean(),
        "Appropriate (raw mean by sent)":  pd.Series(raws["Appropriate"]).mean(),
        "Too bad (raw mean by sent)": pd.Series(raws["Too bad"]).mean(),
        # NOTE(review): the trailing ':' in this key looks like a typo, but it is
        # the column name in previously exported tables, so it is left unchanged.
        "Too good (raw mean by sent):": pd.Series(raws["Too good"]).mean(),
        "N distractors": c
    }


Будем перебирать N:

In [34]:
Table1 = []

# Sweep N from 3 to 20 for the baseline and every classifier output.
# The inner loop variable is `out_df`, not `df`, so the notebook-level `df`
# is not overwritten as a side effect of the sweep.
for N in tqdm_notebook(range(3, 21), total=18):
    for out_df, name in zip(
        [gold_standard, XGBAllFeats_out, RandomForestFreqsOnly_out, CatBoostVecsOnly_out, CatBoostFeatDrop_out],
        ["Baseline (no clf)", "XGBAllFeats_out", "RandomForestFreqsOnly_out", "CatBoostVecsOnly_out", "CatBoostFeatDrop_out"]
    ):
        Table1.append(
            estimate_ouput_with_N(out_df, N, name)
        )

Table1 = pd.DataFrame(Table1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for N in tqdm_notebook(range(3, 21), total=18):


  0%|          | 0/18 [00:00<?, ?it/s]

In [35]:
Table1.sort_values(by=["Appropriate (whole)"], ascending=False)

Unnamed: 0,method,N,Appropriate (whole),Too bad (whole),Too good (whole),Appropriate (by sent),Too bad (by sent),Too good (by sent),Appropriate (raw mean by sent),Too bad (raw mean by sent),Too good (raw mean by sent):,N distractors
1,XGBAllFeats_out,3,0.581152,0.293194,0.125654,0.598684,0.276316,0.125000,1.460526,0.736842,0.315789,2.513158
3,CatBoostVecsOnly_out,3,0.578035,0.294798,0.127168,0.574561,0.271930,0.127193,1.315789,0.671053,0.289474,2.276316
4,CatBoostFeatDrop_out,3,0.546798,0.315271,0.137931,0.550439,0.304825,0.144737,1.460526,0.842105,0.368421,2.671053
8,CatBoostVecsOnly_out,4,0.542986,0.325792,0.131222,0.536184,0.314693,0.122807,1.578947,0.947368,0.381579,2.907895
6,XGBAllFeats_out,4,0.536585,0.341463,0.121951,0.561404,0.326754,0.111842,1.736842,1.105263,0.394737,3.236842
...,...,...,...,...,...,...,...,...,...,...,...,...
75,Baseline (no clf),18,0.287281,0.611111,0.101608,0.287281,0.611111,0.101608,5.171053,11.000000,1.828947,18.000000
83,CatBoostVecsOnly_out,19,0.284148,0.605184,0.110668,0.329655,0.574817,0.095529,3.750000,7.986842,1.460526,13.197368
80,Baseline (no clf),19,0.279778,0.621191,0.099030,0.279778,0.621191,0.099030,5.315789,11.802632,1.881579,19.000000
88,CatBoostVecsOnly_out,20,0.275928,0.615604,0.108468,0.321604,0.584302,0.094094,3.815789,8.513158,1.500000,13.828947


In [36]:
Table1.sort_values(by=["Appropriate (by sent)"], ascending=False)

Unnamed: 0,method,N,Appropriate (whole),Too bad (whole),Too good (whole),Appropriate (by sent),Too bad (by sent),Too good (by sent),Appropriate (raw mean by sent),Too bad (raw mean by sent),Too good (raw mean by sent):,N distractors
1,XGBAllFeats_out,3,0.581152,0.293194,0.125654,0.598684,0.276316,0.125000,1.460526,0.736842,0.315789,2.513158
3,CatBoostVecsOnly_out,3,0.578035,0.294798,0.127168,0.574561,0.271930,0.127193,1.315789,0.671053,0.289474,2.276316
6,XGBAllFeats_out,4,0.536585,0.341463,0.121951,0.561404,0.326754,0.111842,1.736842,1.105263,0.394737,3.236842
4,CatBoostFeatDrop_out,3,0.546798,0.315271,0.137931,0.550439,0.304825,0.144737,1.460526,0.842105,0.368421,2.671053
8,CatBoostVecsOnly_out,4,0.542986,0.325792,0.131222,0.536184,0.314693,0.122807,1.578947,0.947368,0.381579,2.907895
...,...,...,...,...,...,...,...,...,...,...,...,...
70,Baseline (no clf),17,0.298762,0.597523,0.103715,0.298762,0.597523,0.103715,5.078947,10.157895,1.763158,17.000000
87,RandomForestFreqsOnly_out,20,0.292772,0.609332,0.097896,0.296627,0.610794,0.092579,4.210526,8.763158,1.407895,14.381579
75,Baseline (no clf),18,0.287281,0.611111,0.101608,0.287281,0.611111,0.101608,5.171053,11.000000,1.828947,18.000000
80,Baseline (no clf),19,0.279778,0.621191,0.099030,0.279778,0.621191,0.099030,5.315789,11.802632,1.881579,19.000000


In [37]:
Table1.loc[Table1["N distractors"]>3].sort_values(by=["Appropriate (by sent)"], ascending=False)

Unnamed: 0,method,N,Appropriate (whole),Too bad (whole),Too good (whole),Appropriate (by sent),Too bad (by sent),Too good (by sent),Appropriate (raw mean by sent),Too bad (raw mean by sent),Too good (raw mean by sent):,N distractors
6,XGBAllFeats_out,4,0.536585,0.341463,0.121951,0.561404,0.326754,0.111842,1.736842,1.105263,0.394737,3.236842
11,XGBAllFeats_out,5,0.496622,0.385135,0.118243,0.535526,0.357675,0.106798,1.934211,1.500000,0.460526,3.894737
9,CatBoostFeatDrop_out,4,0.515267,0.347328,0.137405,0.526316,0.333333,0.140351,1.776316,1.197368,0.473684,3.447368
16,XGBAllFeats_out,6,0.468023,0.409884,0.122093,0.510746,0.378509,0.110746,2.118421,1.855263,0.552632,4.526316
21,XGBAllFeats_out,7,0.459184,0.420918,0.119898,0.502068,0.390257,0.107675,2.368421,2.171053,0.618421,5.157895
...,...,...,...,...,...,...,...,...,...,...,...,...
70,Baseline (no clf),17,0.298762,0.597523,0.103715,0.298762,0.597523,0.103715,5.078947,10.157895,1.763158,17.000000
87,RandomForestFreqsOnly_out,20,0.292772,0.609332,0.097896,0.296627,0.610794,0.092579,4.210526,8.763158,1.407895,14.381579
75,Baseline (no clf),18,0.287281,0.611111,0.101608,0.287281,0.611111,0.101608,5.171053,11.000000,1.828947,18.000000
80,Baseline (no clf),19,0.279778,0.621191,0.099030,0.279778,0.621191,0.099030,5.315789,11.802632,1.881579,19.000000


In [38]:
Table1.to_csv("data/ParamAndClfSelection.csv", sep=';')
Table1.to_excel("data/ParamAndClfSelection.xlsx", float_format="%.4f")

Получим выходные данные лучшей модели (XGBAllFeats при N = 4):

In [39]:
!python -m distractor_generator --n 4 --filename gold_standard/gold_standard_input.csv --clf_path XGBAllFeats/clf.pkl --cols_path XGBAllFeats/cols.json --output_filename gold_standard/best_model_output.csv

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

  0%|          | 0/76 [00:00<?, ?it/s]
 50%|█████     | 38/76 [00:00<00:00, 380.00it/s]
100%|██████████| 76/76 [00:00<00:00, 373.41it/s]
100%|██████████| 76

In [40]:
import numpy as np

# Lexical dataset: drop rows flagged with Delete == 1.0, remove the two
# bookkeeping columns, discard rows without a "target_true" label and store
# both label columns as int64 (cast through double first, as the labels are
# displayed as integers after this step).
df = (
    pd.read_csv(
        "data/dataset_lexics_final3.csv",
        index_col="Unnamed: 0",
        sep=';',
    )
    .loc[lambda frame: frame["Delete"] != 1.0]
    .drop(["Delete", "Revisited1"], axis=1)
    .dropna(subset=["target_true"])
    .astype({"target_true": np.double, "target": np.double})
    .astype({"target_true": np.int64, "target": np.int64})
)

In [41]:
df

Unnamed: 0_level_0,sent_id,target,variant,correction,masked_sent,variant_count,correction_count,error_type,target_true,File,Folder,Filename
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1167,88.0,1,understandings,perceptions,"To start with, happiness is a feeling of comf...",1,1,lex_item_choice,1,2014_EZa_13_2,exam/Exam2014,exam/Exam2014/2014_EZa_13_2
1173,93.0,1,strictly,absolutely,I know that she was healthy and that her birt...,2,5,lex_item_choice,1,2014_EZa_13_2,exam/Exam2014,exam/Exam2014/2014_EZa_13_2
1174,93.0,0,completely,absolutely,I know that she was healthy and that her birt...,2,5,lex_item_choice,0,2014_EZa_13_2,exam/Exam2014,exam/Exam2014/2014_EZa_13_2
1187,93.0,0,definitely,absolutely,I know that she was healthy and that her birt...,1,5,lex_item_choice,1,2014_EZa_13_2,exam/Exam2014,exam/Exam2014/2014_EZa_13_2
2332,171.0,0,number,level,The chart below represents the information abo...,13,39,lex_item_choice,1,2017_OBy_85_1,exam/Exam2017/OBy_1-99,exam/Exam2017/OBy_1-99/2017_OBy_85_1
...,...,...,...,...,...,...,...,...,...,...,...,...
695022,37399.0,0,bring,lead,That people should reduce the amount of air t...,2,10,lex_item_choice,1,2016_JSl_45_2,exam/Exam2016,exam/Exam2016/2016_JSl_45_2
695024,37399.0,0,leave,lead,That people should reduce the amount of air t...,2,10,lex_item_choice,0,2016_JSl_45_2,exam/Exam2016,exam/Exam2016/2016_JSl_45_2
695025,37399.0,0,provide,lead,That people should reduce the amount of air t...,1,10,lex_item_choice,0,2016_JSl_45_2,exam/Exam2016,exam/Exam2016/2016_JSl_45_2
695026,37399.0,1,result,lead,That people should reduce the amount of air t...,1,10,lex_item_choice,1,2016_JSl_45_2,exam/Exam2016,exam/Exam2016/2016_JSl_45_2


In [42]:
df["sent_id"].nunique()

782

In [43]:
len(df)/df["sent_id"].nunique()

3.629156010230179

In [44]:
len(df[df["target_true"] == 1.0])/df["sent_id"].nunique()

1.7135549872122762