In [12]:
from transformers import pipeline 
from datasets import load_dataset
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [13]:
# Fill-mask pipeline using the "distilbert-base-uncased" checkpoint; it is
# called in apply_promts to predict the token for each prompt's [MASK] slot.
unmasker = pipeline("fill-mask", model="distilbert-base-uncased")

In [14]:
# Load the train / validation / test CSV files as named splits of one dataset.
data = load_dataset(
    "csv",
    data_files={
        "train": ["../data/corpus_train.csv"],
        "validation": ["../data/corpus_valid.csv"],
        "test": ["../data/corpus_test.csv"],
    },
)

Using custom data configuration default-2c05e3db047a88f3
Reusing dataset csv (/home/georg/.cache/huggingface/datasets/csv/default-2c05e3db047a88f3/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
# Prompt templates used to probe the masked language model. "[INP]" is replaced
# by the input sentence in apply_promts; "[MASK]" is the slot the model fills.
# NOTE(review): "knowlege" and "a expert" look like typos. They are part of the
# text sent to the model, so correcting them would change the predictions and
# invalidate previously stored results — confirm before editing.
prompt_templates = ["[INP]. This should be [MASK] knowlege",
                    "[INP]. This sentence contains [MASK] proof and results.",
                    "[INP]. This sentence contains a statement from a expert, authority, witness, group, organisation or similar? [MASK].",
                    "[INP]. This is something I have [MASK] from personal experience.",
                    "[INP]. That is a cool [MASK]."]

def apply_promts(row):
    """Probe a single dataset row with every prompt template.

    For each entry of ``prompt_templates`` the "[INP]" placeholder is replaced
    by ``row["sentence"]`` and the resulting text is passed to ``unmasker``.
    Only the first element of the returned predictions is kept.

    Args:
        row: mapping that provides the input text under the key "sentence".

    Returns:
        dict with the single key "answers": a list holding one
        ``(score, token)`` tuple per template, both values converted to ``str``.
    """
    def _first_prediction(template):
        # Build the concrete prompt for this row and keep the first prediction.
        filled_prompt = template.replace("[INP]", row["sentence"])
        best = unmasker(filled_prompt)[0]
        return (str(best["score"]), str(best["token_str"]))

    return {"answers": [_first_prediction(template) for template in prompt_templates]}


# Apply apply_promts to every row of the training split; the resulting
# "answers" column is read by the cells below.
probed_data = data["train"].map(apply_promts)

  0%|          | 0/9505 [00:00<?, ?ex/s]

In [16]:

# Count, for each prompt template, how many times every token was predicted:
# answer_counts[template][token] -> number of training rows with that token.
answer_counts = {}

for row_answers in probed_data["answers"]:
    for template, prediction in zip(prompt_templates, row_answers):
        token = prediction[1]  # each prediction is a (score, token) pair
        per_template = answer_counts.setdefault(template, {})
        per_template[token] = per_template.get(token, 0) + 1


In [17]:
def get_possible_answers(answer_counts, cutoff=10):
    """Reduce per-prompt token counts to the frequent tokens plus an "_other" bucket.

    Args:
        answer_counts: mapping ``prompt -> {token: count}``.
        cutoff: tokens whose count is strictly greater than this value are
            kept under their own key; all remaining counts are summed under
            the key "_other".

    Returns:
        A new mapping ``prompt -> {token_or_"_other": count}`` with one entry
        for every prompt in ``answer_counts``. The input is not modified.
    """
    result = {}

    for prompt, token_counts in answer_counts.items():
        bucket = result[prompt] = {}
        for token, occurrences in token_counts.items():
            if occurrences > cutoff:
                bucket[token] = occurrences
                continue
            # Rare token: fold its count into the shared "_other" entry.
            bucket["_other"] = bucket.get("_other", 0) + occurrences

    return result

# Per prompt, keep tokens counted more than the default cutoff (10) and pool
# the remaining counts under "_other".
filtered_answer = get_possible_answers(answer_counts)

In [18]:
# Display the retained tokens and their counts for each prompt.
filtered_answer

{'[INP]. This should be [MASK] knowlege': {'called': 7010,
  'david': 160,
  'considered': 860,
  'a': 436,
  'termed': 221,
  'my': 116,
  '_other': 325,
  'your': 91,
  'our': 116,
  'miss': 41,
  'his': 11,
  'mr': 48,
  'read': 25,
  'mrs': 11,
  'cameron': 11,
  'steve': 12,
  'putin': 11},
 '[INP]. This sentence contains [MASK] proof and results.': {'simple': 849,
  'ample': 578,
  'no': 3205,
  'insufficient': 1928,
  'sufficient': 1181,
  'both': 406,
  'contradictory': 282,
  'detailed': 523,
  'some': 23,
  'false': 111,
  'numerous': 152,
  'more': 18,
  'important': 15,
  'remarkable': 33,
  '_other': 47,
  'explicit': 39,
  'excellent': 37,
  'additional': 17,
  'incorrect': 19,
  'surprising': 25,
  'many': 17},
 '[INP]. This sentence contains a statement from a expert, authority, witness, group, organisation or similar? [MASK].': {'etc': 695,
  ')': 3711,
  'witness': 2078,
  'member': 285,
  'person': 1309,
  'participant': 626,
  'expert': 242,
  'statement': 15,
  'pr

In [19]:
def get_feature_vector(answers, filtered_answer, templates=None):
    """Turn raw model answers into categorical feature rows.

    For every row, the predicted token of each prompt is kept if it is a key
    of ``filtered_answer[prompt]``; otherwise it is replaced by "_other".

    Args:
        answers: iterable of rows; each row holds one ``(score, token)`` pair
            per prompt template, in the same order as ``templates``.
        filtered_answer: mapping ``prompt -> {token: count}`` of accepted
            tokens, as returned by ``get_possible_answers``.
        templates: prompt templates aligned with the pairs in each row.
            Defaults to the module-level ``prompt_templates`` so existing
            two-argument calls behave as before.

    Returns:
        list of lists of str: one inner list per row with one token (or
        "_other") per prompt template.
    """
    if templates is None:
        # Backward-compatible default: look the global up at call time.
        templates = prompt_templates

    rows = []
    for row_answers in answers:
        rows.append([
            prediction[1] if prediction[1] in filtered_answer[prompt] else "_other"
            for prompt, prediction in zip(templates, row_answers)
        ])

    return rows

# Categorical training features: one token (or "_other") per prompt and row.
features = get_feature_vector(probed_data["answers"], filtered_answer)


In [20]:
# One-hot encode the categorical answer tokens.
# handle_unknown="ignore": get_feature_vector emits "_other" for any token not
# in filtered_answer, but "_other" only occurs in the training features of a
# prompt that had rare tokens in training. With the default
# handle_unknown="error", transforming unseen data containing such a category
# would raise; with "ignore" it is encoded as all zeros for that prompt.
encoder = OneHotEncoder(handle_unknown="ignore")
encoded_features = encoder.fit_transform(features)

In [21]:
# Random forest on the one-hot encoded prompt answers.
# A fixed random_state makes training, and therefore the evaluation report
# further down, reproducible across "Restart & Run All".
rfc = RandomForestClassifier(random_state=42)
rfc.fit(encoded_features, probed_data["label"])

RandomForestClassifier()

In [22]:
# Probe the validation split with the same prompts as the training split.
validation = data["validation"].map(apply_promts)

  0%|          | 0/2716 [00:00<?, ?ex/s]

In [24]:
# Build validation features with the token vocabulary derived from the training
# split (filtered_answer) and encode them with the already fitted encoder.
validation_features = get_feature_vector(validation["answers"], filtered_answer)
validation_encoded_features = encoder.transform(validation_features)

# Predict labels for the validation rows with the trained forest.
pred = rfc.predict(validation_encoded_features)

# Per-class precision / recall / F1 against the validation labels.
print(classification_report(validation["label"], pred))

               precision    recall  f1-score   support

     anecdote       0.40      0.22      0.28       458
   assumption       0.73      0.91      0.81      1826
common-ground       0.00      0.00      0.00        66
        other       0.25      0.03      0.06        30
   statistics       0.45      0.15      0.23        66
    testimony       0.62      0.39      0.48       219
        title       0.73      0.31      0.44        51

     accuracy                           0.69      2716
    macro avg       0.45      0.29      0.33      2716
 weighted avg       0.64      0.69      0.65      2716



## Storing probed answers to disk to experiment with classification

In [38]:
import pickle
from pathlib import Path


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, creating the parent directory if it is missing."""
    target = Path(path)
    # open(..., "wb") fails with FileNotFoundError when the directory is absent.
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("wb") as f:
        pickle.dump(obj, f)


# Filtered validation features together with their labels.
validation_pkl = (validation_features, validation["label"])
_dump_pickle(validation_pkl, "../data/probed/validation_filtered.pkl")

# Unfiltered (score, token) answers for the validation split.
validation_raw_answers = validation["answers"]
_dump_pickle(validation_raw_answers, "../data/probed/validation_raw_answers.pkl")

# Filtered training features together with their labels.
train_pkl = (features, probed_data["label"])
_dump_pickle(train_pkl, "../data/probed/train_filtered.pkl")

# Unfiltered (score, token) answers for the training split.
train_raw_answers = probed_data["answers"]
_dump_pickle(train_raw_answers, "../data/probed/train_raw_answers.pkl")
