In [1]:
from transformers import pipeline 
from datasets import load_dataset
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [2]:
# Fill-mask pipeline backed by distilbert-base-uncased; it is called once per
# filled-in prompt to obtain candidate tokens for the [MASK] position.
unmasker = pipeline("fill-mask", model="distilbert-base-uncased")

2022-06-17 16:49:46.151982: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load the train and test splits from local CSV files (paths are relative to
# the notebook's working directory).
data = load_dataset("csv", data_files={'train': ["../data/cmv_train.csv"], 'test': ["../data/cmv_test.csv"]})

Using custom data configuration default-6e70f294c7d5c8fc


Downloading and preparing dataset csv/default to /home/georg/.cache/huggingface/datasets/csv/default-6e70f294c7d5c8fc/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/georg/.cache/huggingface/datasets/csv/default-6e70f294c7d5c8fc/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Probing prompts. "[INP]" is substituted with the input sentence and "[MASK]"
# is the slot the fill-mask model has to predict.
# NOTE(review): the first template spells "knowlege" and the third says
# "a expert"; both strings are fed to the model verbatim, so correcting them
# would change the predictions and requires re-running every cell below.
prompt_templates = ["[INP]. This should be [MASK] knowlege",
                    "[INP]. This sentence contains [MASK] proof and results.",
                    "[INP]. This sentence contains a statement from a expert, authority, witness, group, organisation or similar? [MASK].",
                    "[INP]. This is something I have [MASK] from personal experience.",
                    "[INP]. That is a cool [MASK]."]

def apply_promts(row):
    """Probe one dataset row with every prompt template.

    For each entry of the module-level ``prompt_templates`` the "[INP]"
    placeholder is replaced by ``row["sentence"]``, the filled prompt is passed
    to the module-level ``unmasker`` and only its first returned candidate is
    kept.

    Args:
        row: mapping with a "sentence" key holding the input text.

    Returns:
        ``{"answers": [...]}`` with one ``(score, token)`` pair per template,
        in template order; both elements are converted to ``str``.
    """

    def top_prediction(template):
        # Substitute the input sentence for the [INP] placeholder.
        filled_prompt = template.replace("[INP]", row["sentence"])
        best = unmasker(filled_prompt)[0]
        return (str(best["score"]), str(best["token_str"]))

    return {"answers": [top_prediction(template) for template in prompt_templates]}


# Run the prompt probing over every row of the training split; the result
# gains an "answers" column produced by apply_promts.
probed_data = data["train"].map(apply_promts)

  0%|          | 0/2030 [00:00<?, ?ex/s]

In [5]:

# Tally, per prompt template, how often each token was the top prediction
# across the probed training rows: {template: {token: occurrences}}.
answer_counts = {}

for row_answers in probed_data["answers"]:
    # Answers are stored in template order, so zip pairs each prediction
    # with the template that produced it.
    for template, prediction in zip(prompt_templates, row_answers):
        per_template = answer_counts.setdefault(template, {})
        token = prediction[1]  # index 1 is the predicted token, index 0 its score
        per_template[token] = per_template.get(token, 0) + 1


In [6]:
def get_possible_answers(answer_counts, cutoff=10):
    """Collapse rarely predicted tokens into a shared "_other" bucket.

    Args:
        answer_counts: ``{prompt: {token: count}}`` mapping.
        cutoff: a token is kept under its own name only when its count is
            strictly greater than this value.

    Returns:
        A new ``{prompt: {token: count}}`` mapping with one entry per input
        prompt. Tokens at or below the cutoff have their counts summed under
        the "_other" key, which is only present when at least one such token
        exists for that prompt.
    """
    filtered_answers = {}

    for prompt, token_counts in answer_counts.items():
        kept = filtered_answers[prompt] = {}

        for token, count in token_counts.items():
            if count > cutoff:
                kept[token] = count
                continue

            # Rare token: fold its count into the catch-all bucket.
            kept["_other"] = kept.get("_other", 0) + count

    return filtered_answers

# Per-prompt answer vocabulary built from the training counts, using the
# default cutoff of get_possible_answers.
filtered_answer = get_possible_answers(answer_counts)

In [7]:
# Display the retained tokens and their counts for each prompt.
filtered_answer

{'[INP]. This should be [MASK] knowlege': {'called': 1238,
  'your': 47,
  'my': 41,
  'considered': 185,
  'our': 23,
  'a': 398,
  '_other': 63,
  'david': 15,
  'termed': 20},
 '[INP]. This sentence contains [MASK] proof and results.': {'sufficient': 576,
  'some': 28,
  'simple': 364,
  'explicit': 17,
  'no': 552,
  'excellent': 23,
  '_other': 69,
  'insufficient': 75,
  'ample': 148,
  'both': 112,
  'detailed': 54,
  'contradictory': 12},
 '[INP]. This sentence contains a statement from a expert, authority, witness, group, organisation or similar? [MASK].': {')': 937,
  '.': 923,
  'etc': 80,
  ']': 15,
  'person': 54,
  '"': 12,
  '_other': 9},
 '[INP]. This is something I have [MASK] from personal experience.': {'learned': 2015,
  'learnt': 15},
 '[INP]. That is a cool [MASK].': {'idea': 382,
  'thing': 886,
  'question': 409,
  '_other': 156,
  'rule': 61,
  'climate': 13,
  'book': 11,
  'proposition': 11,
  'statement': 40,
  'word': 25,
  'option': 19,
  'assumption': 17}

In [8]:
def get_feature_vector(answers, filtered_answer):
    """Turn probed answers into categorical feature rows.

    Args:
        answers: iterable of per-row answer lists; each answer is indexable
            and holds the predicted token at index 1. Answers are paired with
            the module-level ``prompt_templates`` by position.
        filtered_answer: ``{prompt: {token: count}}`` mapping of the tokens
            retained for each prompt.

    Returns:
        A list with one inner list per row. Each inner list contains, per
        prompt, the predicted token when it is a key of
        ``filtered_answer[prompt]`` and the string "_other" otherwise.
    """

    def categorize(prompt, prediction):
        token = prediction[1]
        return token if token in filtered_answer[prompt] else "_other"

    return [
        [
            categorize(prompt, prediction)
            for prompt, prediction in zip(prompt_templates, row_answers)
        ]
        for row_answers in answers
    ]

# Categorical training features: one list of per-prompt tokens for each
# probed training row.
features = get_feature_vector(probed_data["answers"], filtered_answer)


In [9]:
# One-hot encode the per-prompt tokens.
# handle_unknown="ignore": get_feature_vector emits "_other" for any token
# outside the retained vocabulary, but a prompt whose training answers were
# never rare has no "_other" category to fit on. With the default
# handle_unknown="error", transforming such a row later would raise
# ValueError; with "ignore" the unseen category is encoded as all zeros.
encoder = OneHotEncoder(handle_unknown="ignore")
encoded_features = encoder.fit_transform(features)

In [10]:
# Fit a support vector classifier (default hyper-parameters) on the one-hot
# features against the training labels.
# NOTE(review): the variable is called `rfc` although it holds an SVC; later
# cells reference this name.
rfc = SVC()
rfc.fit(encoded_features, probed_data["label"])

In [11]:
# Run the same prompt probing over the test split.
validation = data["test"].map(apply_promts)

  0%|          | 0/1354 [00:00<?, ?ex/s]

In [12]:
# Bucket and encode the test-split answers with the vocabulary and encoder
# that were fitted on the training split, then predict.
validation_features = get_feature_vector(validation["answers"], filtered_answer)
validation_encoded_features = encoder.transform(validation_features)

pred = rfc.predict(validation_encoded_features)

# zero_division=0: classes the model never predicts have an undefined
# precision; report it as 0 explicitly instead of relying on the default,
# which reports the same value but emits an UndefinedMetricWarning per metric.
print(classification_report(validation["label"], pred, zero_division=0))

              precision    recall  f1-score   support

    anecdote       0.33      0.01      0.03       148
  assumption       0.85      1.00      0.92      1145
       other       0.00      0.00      0.00        15
  statistics       0.67      0.06      0.11        33
   testimony       0.00      0.00      0.00        13

    accuracy                           0.85      1354
   macro avg       0.37      0.21      0.21      1354
weighted avg       0.77      0.85      0.78      1354



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
