In [None]:
# Colab-only set-up: attach the user's Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%%capture
!pip install transformers # On Google Colab, the leading "!" is required to run pip as a shell command
!pip install bert-score
!pip install -U sentence-transformers

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Fetch the NLTK data packages used by the tokenizer / lemmatizer imports above.
for _nltk_resource in ('punkt', 'wordnet'):
    nltk.download(_nltk_resource)

In [None]:
import os, json
import numpy as np
from argparse import ArgumentParser
from tqdm import tqdm
from collections import defaultdict
import pandas as pd
from pprint import pprint
import re, string
from google.colab import files

from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer, util
import torch
import copy

## Calculating evaluation scores

In [None]:
def read_json_file(name):
    """Load a JSON file and return its parsed contents.

    Args:
        name: Path of the JSON file to read.

    Returns:
        The deserialized value produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read explicitly as UTF-8 instead of relying on the platform's default
    # text encoding, which differs between machines.
    with open(name, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_json(json_data, file_name):
    """Write ``json_data`` to ``file_name`` as JSON text.

    The data is serialized before the file is opened, and the destination
    path is printed so the notebook output shows where the file went.

    Args:
        json_data: Any value accepted by ``json.dumps``.
        file_name: Path of the file to create or overwrite.
    """
    serialized = json.dumps(json_data)
    print(file_name)
    with open(file_name, "w") as out_file:
        out_file.write(serialized)

In [None]:
# def perform_lemmatization(text):

#     # Tokenize the text into words
#     words = word_tokenize(text)
#     lemmatizer = WordNetLemmatizer()
#     stemmer = PorterStemmer()
#     lemmatized_and_stemmed_words = [(lemmatizer.lemmatize(word), stemmer.stem(word)) for word in words]

#     lemmatized_words, stemmed_words = zip(*lemmatized_and_stemmed_words)
#     lemmatized_text = ' '.join(lemmatized_words)
#     stemmed_text = ' '.join(stemmed_words)

#     # print("Original text:", text)
#     # print("Lemmatized text:", lemmatized_text)
#     # print("Stemmed text:", stemmed_text)
#     return stemmed_text

# Cache of loaded SentenceTransformer models, keyed by model name.
_SBERT_MODELS = {}


def _get_sbert_model(model_name="all-MiniLM-L6-v2"):
    """Return the SentenceTransformer for ``model_name``, loading it at most once."""
    if model_name not in _SBERT_MODELS:
        _SBERT_MODELS[model_name] = SentenceTransformer(model_name)
    return _SBERT_MODELS[model_name]


def clac_sbert_score(text1, text2):
    """Return the Sentence-BERT cosine similarity between two texts.

    Both texts are embedded with the ``all-MiniLM-L6-v2`` model and compared
    with ``util.cos_sim``. The model is loaded once and reused across calls,
    because this function is called for every prediction/ground-truth pair.

    The (misspelled) function name is kept because callers use it.

    Args:
        text1: First text. Assumed to be a single string, so the similarity
            result holds exactly one value (``.item()`` requires that).
        text2: Second text, same assumption.

    Returns:
        The cosine similarity as a Python number.
    """
    model = _get_sbert_model()

    embeddings1 = model.encode(text1, convert_to_tensor=True)
    embeddings2 = model.encode(text2, convert_to_tensor=True)

    cosine_score = util.cos_sim(embeddings1, embeddings2)
    return cosine_score.item()

# Cache of (tokenizer, model) pairs, keyed by pretrained model name.
_BERT_COMPONENTS = {}


def _get_bert_components(model_name="bert-base-uncased"):
    """Return the ``(tokenizer, model)`` pair for ``model_name``, loading it at most once."""
    if model_name not in _BERT_COMPONENTS:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertModel.from_pretrained(model_name)
        _BERT_COMPONENTS[model_name] = (tokenizer, model)
    return _BERT_COMPONENTS[model_name]


def calc_cosine_similarity(text1, text2):
    """Return the cosine similarity of mean-pooled BERT embeddings of two texts.

    Each text is tokenized and passed through ``bert-base-uncased``; the last
    hidden state is averaged over the token dimension and the two resulting
    vectors are compared. The tokenizer and model are loaded once and reused,
    because this function is called for every prediction/ground-truth pair.

    Args:
        text1: First text. Assumed to be a single string; the norm-based
            denominator is only a vector norm when there is one row.
        text2: Second text, same assumption.

    Returns:
        The similarity as a NumPy scalar (element ``[0][0]`` of the product).
    """
    tokenizer, model = _get_bert_components()

    # text1 = perform_lemmatization(text1)
    # text2 = perform_lemmatization(text2)

    inputs1 = tokenizer(text1, return_tensors="pt", padding=True, truncation=True)
    inputs2 = tokenizer(text2, return_tensors="pt", padding=True, truncation=True)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)

    embeddings1 = outputs1.last_hidden_state.mean(dim=1).detach().numpy()
    embeddings2 = outputs2.last_hidden_state.mean(dim=1).detach().numpy()

    # Cosine similarity: dot product divided by the product of the norms.
    similarity = np.dot(embeddings1, embeddings2.T) / (
        np.linalg.norm(embeddings1) * np.linalg.norm(embeddings2)
    )
    return similarity[0][0]


In [None]:
def is_null_or_empty(string):
    """Tell whether ``string`` carries no value.

    A value counts as empty when it is ``None``, blank after stripping
    whitespace, or the word ``null`` in any letter case.
    """
    if string is None:
        return True
    stripped = string.strip()
    return stripped.lower() == 'null' or stripped == ''

def normalize_string(s):
    """Normalize text before similarity comparison.

    Steps, in order: lowercase, drop ASCII punctuation, replace the articles
    ``a`` / ``an`` / ``the`` with a space, collapse whitespace runs.
    """
    punctuation = set(string.punctuation)
    lowered = s.lower()
    without_punctuation = ''.join(filter(lambda ch: ch not in punctuation, lowered))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation, flags=re.UNICODE)
    return ' '.join(without_articles.split())


def calculate_score(predictions, ground_truth_labels, similarity_threshold, verbose=True):
    """Count matches between predicted and ground-truth arguments for one sample.

    Matching is greedy and one-to-one: each prediction is compared against the
    ground truths not yet matched, in order, and claims the first one whose
    BERT cosine similarity or Sentence-BERT score exceeds the threshold. One
    prediction therefore yields at most one true positive, and each ground
    truth is matched at most once.

    Args:
        predictions: List of predicted argument strings.
        ground_truth_labels: List of ground-truth argument strings.
        similarity_threshold: A pair counts as a match when either score is
            strictly greater than this value.
        verbose: When true (the default), print the normalized pair and both
            scores for every comparison.

    Returns:
        ``(true_pos, pred_count, gtruth_count)``. The sample is skipped with
        ``(0, 0, 0)`` when the only ground truth is null/empty.
        ``pred_count`` is ``len(predictions)`` and includes null/empty
        predictions. ``gtruth_count`` excludes blank (or ``None``) ground
        truths, each exactly once.
    """
    # Nothing to extract when the only ground truth is null: skip this sample.
    if len(ground_truth_labels) == 1 and is_null_or_empty(ground_truth_labels[0]):
        return 0, 0, 0

    pred_count = len(predictions)

    # Count blank ground truths out once each, independent of how many
    # predictions there are.
    gtruth_count = sum(
        1 for g in ground_truth_labels if g is not None and g.strip() != ''
    )

    # Only non-null labels can be matched. Work on a separate list so the
    # caller's list is untouched and matched labels can be removed.
    unmatched = [g for g in ground_truth_labels if not is_null_or_empty(g)]

    true_pos = 0
    for p in predictions:
        if is_null_or_empty(p):
            continue  # null predictions never match, but stay in pred_count
        norm_pred = normalize_string(p)
        for idx, g in enumerate(unmatched):
            norm_gt = normalize_string(g)
            sim_value = calc_cosine_similarity(norm_pred, norm_gt)
            sbert_score = clac_sbert_score(norm_pred, norm_gt)
            if verbose:
                print(norm_pred, norm_gt, sim_value, sbert_score)
            if sim_value > similarity_threshold or sbert_score > similarity_threshold:
                true_pos += 1
                # Safe to delete here: the inner loop stops immediately after.
                del unmatched[idx]
                break

    return true_pos, pred_count, gtruth_count

def calc_precision_recall_f1(true_pos, pred_count, gtruth_count):
    """Turn raw counts into precision, recall and F1.

    The three counts are printed first. Every denominator is offset by 1e-10,
    so zero counts give 0.0 instead of raising ``ZeroDivisionError``.

    Returns:
        ``(precision, recall, f1)``.
    """
    eps = 1e-10

    def _safe_ratio(numerator, denominator):
        return numerator / (denominator + eps)

    print(true_pos, pred_count, gtruth_count)
    precision = _safe_ratio(true_pos, pred_count)
    recall = _safe_ratio(true_pos, gtruth_count)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)
    return (precision, recall, f1)

In [None]:
def select_event_specific_data(event, data):
    """Return the records in ``data`` whose ``'label'`` equals ``event``, in input order."""
    return [record for record in data if event == record['label']]

def calc_results(all_data, event_types, arg_types, events_roles, pred_name, gtruth_name, similarity_threshold):
    """Compute precision/recall/F1 per event type, argument type and argument role.

    Counts from ``calculate_score`` are summed over every sample of an event
    and then converted with ``calc_precision_recall_f1``.

    Args:
        all_data: List of sample dicts. Each is read as ``dt['label']``,
            ``dt['doc_id']``, ``dt[pred_name][arg_type][arg]`` and
            ``dt[gtruth_name][arg_type][arg]``.
        event_types: Event labels to evaluate.
        arg_types: Argument-type names to evaluate for each event.
        events_roles: Nested mapping; the keys of
            ``events_roles[event][arg_type]`` are the argument roles.
        pred_name: Key of the predictions inside each sample.
        gtruth_name: Key of the ground-truth arguments inside each sample.
        similarity_threshold: Threshold passed on to ``calculate_score``.

    Returns:
        ``results[event][arg_type][arg] = [[precision, recall, f1],
        [true_pos, pred_count, gtruth_count]]``. Roles whose summed
        ground-truth count is 0 are omitted.
    """
    results = {}
    for event in event_types:
        results[event] = {}
        # The samples of an event are the same for every role: filter once.
        event_specific_data = select_event_specific_data(event, all_data)
        for arg_type in arg_types:
            results[event][arg_type] = {}
            arguments = list(events_roles[event][arg_type].keys())
            for arg in arguments:
                # Running totals: true positives, predictions, ground truths.
                tp, pc, gc = 0, 0, 0
                for dt in event_specific_data:
                    pred_list = dt[pred_name][arg_type][arg]  # predictions
                    gtruth_list = dt[gtruth_name][arg_type][arg]  # ground-truth arguments
                    try:
                        true_pos, pred_count, gtruth_count = calculate_score(
                            pred_list, gtruth_list, similarity_threshold)
                    except Exception as err:
                        # Best effort: a malformed sample adds nothing to the
                        # totals. KeyboardInterrupt/SystemExit still propagate,
                        # so a long run can be stopped.
                        true_pos, pred_count, gtruth_count = 0, 0, 0
                        print(event, arg_type, arg)
                        print(pred_list, gtruth_list)
                        print("Error occured at", dt['doc_id'])
                        print("Cause:", repr(err))

                    tp += true_pos
                    pc += pred_count
                    gc += gtruth_count
                if gc == 0:
                    continue  # no ground truth for this role: nothing to report

                precision, recall, f1 = calc_precision_recall_f1(tp, pc, gc)
                print(event, arg_type, arg, precision, recall, f1)
                # First item: the scores. Second item: the raw counts.
                results[event][arg_type][arg] = [[precision, recall, f1], [tp, pc, gc]]

    return results


In [None]:
%%time

# Experiment configuration and a first look at one predictions file.

# Role definitions; indexed downstream as events_roles[event][arg_type].
events_roles = read_json_file('Final-Data/role_definitions.json')

# Evaluation grid. Prompt and experiment type are combined into the
# per-sample prediction key 'processed-{prompt}_{experiment}-predictions'.
prompt_types = ['batch'] #['batch', 'isolated']
exp_types = ['description_guided', 'question_guided']
event_types = ['taking-moud', 'tapering', 'relapse']
arg_types = ['main-arguments', 'event-specific-arguments', 'subject-effect-arguments']
# Each threshold is evaluated separately (the name is singular but this is a list).
similarity_threshold = [0.749, 0.99]

# Run number and model name select the predictions file to load.
# NOTE(review): these paths are relative, but the recorded output shows an
# absolute Drive path, so that run used a different folder_path — confirm.
number = '1'
name = 'gemma-1.1-7b-it'
folder_path = 'Argument-Extraction-Predictions'
file_name = f'{number}-processed-{name}_test_100_predictions.json'
print(os.path.join(folder_path, file_name))
all_data = read_json_file(os.path.join(folder_path, file_name))

# Sanity check: keys of the first sample and the number of samples.
print(all_data[0].keys())
print(len(all_data))

/content/drive/Shareddrives/Reddit Misinformation/Data/New-Avg-Run-Experiments/Argument-Extraction-Predictions/1-processed-gemma-1.1-7b-it_test_100_predictions.json
dict_keys(['doc_id', 'id', 'text1', 'text2', 'label', 'thread-length', 'ground-truth-arguments', 'raw-batch_description_guided-predictions', 'raw-batch_question_guided-predictions', 'processed-batch_description_guided-predictions', 'processed-batch_question_guided-predictions'])
100
CPU times: user 21.1 ms, sys: 1.41 ms, total: 22.5 ms
Wall time: 34.6 ms


In [None]:
%%time
# Score every run's predictions for each prompt type, experiment type and
# similarity threshold, then save one combined results file per run.
numbers = ['0', '1', '2']

result_folder_path = 'Argument-Extraction-Results'
# Create the output folder before scoring, so a missing directory cannot
# discard the results at the final save.
os.makedirs(result_folder_path, exist_ok=True)

for number in numbers:
    file_name = f'{number}-processed-{name}_test_100_predictions.json'
    all_data = read_json_file(os.path.join(folder_path, file_name))

    # Keyed by '{prompt_type}_{exp_type}_{threshold}'.
    results_dict = {}
    for p_type in prompt_types:
        for e_type in exp_types:
            g_name = 'ground-truth-arguments'
            p_name = f'processed-{p_type}_{e_type}-predictions'
            for s_t in similarity_threshold:
                print(p_type, e_type, s_t)
                results = calc_results(all_data, event_types, arg_types, events_roles,
                                       p_name, g_name, s_t)
                # NOTE(review): this file name has no run number, so each run
                # overwrites the previous run's file (and there is no separator
                # between the model name and the prompt type). Left unchanged in
                # case something reads these files — confirm and rename.
                save_json(results, f'results-{name}{p_type}_{e_type}_{s_t}.json')
                results_dict[f'{p_type}_{e_type}_{s_t}'] = results

    save_json(results_dict, os.path.join(result_folder_path, f'{number}-new-results-{name}.json'))

batch description_guided 0.749


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
15 years 15 years 1.0000001 1.0
6 months 6 months 1.0 1.0
since october since october 1.0 0.9999998807907104
12 years 3 days 0.781991 0.5112540125846863
1 year one year 0.8527212 0.9410743713378906
11 days 11 days 0.9999999 1.0
3 years and 3 months 3 years and 3 months 1.0 1.000000238418579
1 week week 0.7706762 0.7918719053268433
about year about year 0.99999994 1.0
several months last several months 0.81528795 0.747855544090271
4 days 4 days 1.0 1.0
4 years 4 years 1.0 1.0
8 12 12
taking-moud event-specific-arguments treatment-duration 0.6666666666611111 0.6666666666611111 0.6666666666111111
snorting sublingual 0.56614715 0.22496460378170013
swallowing spit ublingually 0.664814 0.2580336630344391
sublingually absorbed through veins under tongue 0.57189965 0.14504988491535187
sublingual oralsublingual 0.7256082 0.5094863176345825
spitting orally 0.70546764 0.39229917526245117
oralsublingual between gumslipsplugged 0.6027

Printing to understand the nesting in the results dict

In [None]:
# Walk down one branch of results_dict, printing the keys at each level and
# finally the stored [scores, counts] entry for a single argument role.
_level = results_dict
print(_level.keys())
for _key in ('batch_description_guided_0.749', 'tapering', 'main-arguments'):
    _level = _level[_key]
    print(_level.keys())
print(_level['effects'])

dict_keys(['batch_description_guided_0.749', 'batch_description_guided_0.99', 'batch_question_guided_0.749', 'batch_question_guided_0.99', 'isolated_description_guided_0.749', 'isolated_description_guided_0.99', 'isolated_question_guided_0.749', 'isolated_question_guided_0.99'])
dict_keys(['taking-moud', 'tapering', 'relapse'])
dict_keys(['main-arguments', 'event-specific-arguments', 'subject-effect-arguments'])
dict_keys(['subject/patient', 'effects', 'tapering-event'])
[[0.4583333333314236, 0.54999999999725, 0.49999999994814043], [11, 24, 20]]
