In [1]:
!pip install pandas transformers guidance numpy evaluate rouge-score nltk py-rouge accelerate

Collecting guidance
  Downloading guidance-0.1.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (223 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.6/223.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py-rouge
  Downloading py_rouge-1.1-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
Collecting diskcache (from guidance

In [2]:
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import json
import nltk
import numpy as np
from guidance import gen
from evaluate import load
import csv
import nltk.translate.bleu_score as bleu
from rouge import Rouge
from transformers import BertTokenizer, BertModel
import torch
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import warnings
from typing import Any, Dict, Tuple
import re
import os

warnings.filterwarnings("ignore", category=UserWarning, message="A new version of the following files was downloaded from")

In [3]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
import numpy as np

def majority_vote(csv_files, output_file):
    # Initialize an empty DataFrame to store the results
    result_df = pd.DataFrame(columns=['id', 'label'])

    # Iterate over each CSV file
    for csv_file in csv_files:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(csv_file)

        # Rename the 'label' column to differentiate between files
        df.rename(columns={'label': f'label_{csv_files.index(csv_file)}'}, inplace=True)

        # Merge the DataFrame with the result DataFrame
        result_df = pd.merge(result_df, df, on='id', how='outer')
    # Compute the majority vote
    result_df['label'] = result_df.iloc[:, 1:].mode(axis=1)[0]

    # Replace labels other than 'hyp1' and 'hyp2' with NaN
    result_df.loc[~result_df['label'].isin(['hyp1', 'hyp2']), 'label'] = np.nan

    df_temp = result_df.drop(columns=['id'])

    for index, row in result_df.iterrows():
      if pd.isna(row['label']):
        values = df_temp.iloc[index].dropna().values
        result_df.at[index, 'label'] = values[0]


    # Write the result to a new CSV file
    result_df[['id', 'label']].to_csv(output_file, index=False)



## Paraphrasing Task Files

In [9]:
# Example usage:
csv_files = ['/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_llama3_en_v1_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gpt4_en_v2_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gpt_en_narjes_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gemma_en_vnarjes.csv']
output_file = '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/majority_vote_result_en_narjes.csv'
majority_vote(csv_files, output_file)

### Voting narjes prompt

**English**

In [16]:
# Example usage:
csv_files = ['/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_llama3_prompt_narjes_en_v1_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gpt4_en_v2_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gpt_en_narjes_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gemma_en_vnarjes.csv']
output_file = '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/majority_vote_result_en_narjes.csv'
majority_vote(csv_files, output_file)

  result_df['label'] = result_df.iloc[:, 1:].mode(axis=1)[0]


**Swedish**

In [17]:
# Example usage:
csv_files = ['/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_llama3_prompt_narjes_se_v2_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gpt4_se_v1_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gpt_se_narjes_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gemma_se_vnarjes.csv']
output_file = '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/majority_vote_result_se_narjes.csv'
majority_vote(csv_files, output_file)

## Translation Task Files

In [None]:
csv_files = ['/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/Llama3/results_llama3_de_en_new_prompt_final.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/gpt4/results_gpt4_de_en_prompt2.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/gpt/results_gpt_de_en_prompt2.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/gemma_newPrompt/results_gemma_de_en_final.csv']
output_file = '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/majority_vote_result_de_en_narjes.csv'
majority_vote(csv_files, output_file)

In [None]:
csv_files = ['/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/Llama3/results_llama3_en_de_new_prompt_final.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/gpt4/results_gpt4_en_de_prompt2.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/gpt/results_gpt_en_de_prompt2.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/gemma_newPrompt/results_gemma_en_de_final.csv']
output_file = '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/majority_vote_result_en_de_narjes.csv'
majority_vote(csv_files, output_file)

In [None]:
csv_files = ['/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/Llama3/results_llama3_en_fr_new_prompt_final.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/gpt4/results_gpt4_en_fr_prompt2.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/gpt/results_gpt_en_fr_prompt2.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/gemma_newPrompt/results_gemma_en_fr_final.csv']
output_file = '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/majority_vote_result_en_fr_narjes.csv'
majority_vote(csv_files, output_file)

In [None]:
csv_files = ['/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/Llama3/results_llama3_fr_en_new_prompt_final.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/gpt4/results_gpt4_fr_en_prompt2.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/gpt/results_gpt_fr_en_prompt2.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/gemma_newPrompt/results_gemma_fr_en_final.csv']
output_file = '/content/drive/MyDrive/Master Project-Hallucination/Group1_results/Detection_task/majority_vote_result_fr_en_narjes.csv'
majority_vote(csv_files, output_file)

  result_df['label'] = result_df.iloc[:, 1:].mode(axis=1)[0]


## Translation Task Files with own best prompts

In [19]:
from sklearn.metrics import f1_score
import pandas as pd

def calculate_f1_score(model_name, csv_file_path):
    # Load dataset
    data = pd.read_csv(csv_file_path)

    # Extract true labels and predicted labels
    true_labels = data['label']
    predicted_labels = data['prediction']

    # Calculate F1 score
    f1 = f1_score(true_labels, predicted_labels, average='weighted')

    print(model_name)
    print("F1 Score:", f1)

# Example usage
llama3= calculate_f1_score("LLama3", "/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/detection_task_llama3/trial/results_llama3_prompt_narjes_se_v1_trial.csv")

gpt4 = calculate_f1_score("GPT4", "/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/detection_task_gpt/trial/results_gpt4_se_v1_trial.csv")

gpt3 = calculate_f1_score("GPT3.5", "/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/detection_task_gpt/trial/results_gpt_se_narjes_trial.csv")

gemma = calculate_f1_score("Gemma", "/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/detection_task_gemma/trial/results_gemma_se_vnarjes_trial.csv")

LLama3
F1 Score: 0.5574074074074074
GPT4
F1 Score: 0.8488721804511279
GPT3.5
F1 Score: 0.532
Gemma
F1 Score: 0.5808777429467085


In [22]:
import pandas as pd
import numpy as np

def weighted_majority_vote(csv_files, f1_scores, output_file):
    # Initialize an empty DataFrame to store the results
    result_df =  pd.DataFrame(columns=['id', 'label'])

    # Iterate over each CSV file and corresponding F1 score
    for csv_file, f1_score in zip(csv_files, f1_scores):
        # Read the CSV file into a DataFrame
        df = pd.read_csv(csv_file)

        # Rename the 'label' column to differentiate between files
        df.rename(columns={'label': f'label_{csv_files.index(csv_file)}'}, inplace=True)

        # Merge the DataFrame with the result DataFrame
        result_df = pd.merge(result_df, df, on='id', how='outer')

    print(len(result_df))

    # Calculate weights based on F1 scores
    weights = [f1 / sum(f1_scores) for f1 in f1_scores]
    print(weights)

    # Initialize an empty DataFrame to store the weighted results
    weighted_result_df = pd.DataFrame()

    # Weight each model's predictions and aggregate
    for i, csv_file in enumerate(csv_files):
        label_col = f'label_{i}'
        weighted_result_df[label_col] = result_df[label_col].apply(lambda x: weights[i] if x == 'hyp1' else 0)


    # Compute the weighted majority vote
    result_df['hyp1_weight'] = weighted_result_df.sum(axis=1)
    print("len csv files ", len(csv_files))
    result_df['hyp2_weight'] = len(csv_files) - result_df['hyp1_weight']
    result_df['weighted_majority_vote'] = np.where(result_df['hyp1_weight'] > result_df['hyp2_weight'], 'hyp1', 'hyp2')

    # Write the result to a new CSV file
    result_df[['id', 'label']].to_csv(output_file, index=False)
    return weighted_result_df, result_df




**English**

In [12]:
csv_files = ['/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_llama3_en_v1_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gpt4_en_v1_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gpt_en_narjes_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gemma_en_vnarjes.csv']


f1_scores = [0.8102941176470588, 1.0, 0.8132352941176471, 0.6685222672064777]
output_file = '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/weighted_majority_vote_result_en_narjes.csv'
df, result_df = weighted_majority_vote(csv_files, f1_scores, output_file)

119
[0.24613651201863518, 0.30376193860739004, 0.2470299294851275, 0.20307161988884728]
len csv files  4


**Swedish**

In [24]:
csv_files = ['/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_llama3_prompt_narjes_se_v2_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gpt4_se_v1_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gpt_se_narjes_detection.csv',
             '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/final_gemma_se_vnarjes.csv']


f1_scores = [0.5574074074074074, 0.8488721804511279, 0.532, 0.5808777429467085]
output_file = '/content/drive/MyDrive/Master Project-Hallucination/Group2_results/detection_task/weighted_majority_vote_result_se_narjes.csv'
df, result_df = weighted_majority_vote(csv_files, f1_scores, output_file)

119
[0.2212674058071765, 0.3369667190178184, 0.211181728705268, 0.23058414646973718]
len csv files  4


In [25]:
len(df)

119

In [26]:
df.head()

Unnamed: 0,label_0,label_1,label_2,label_3
0,0.0,0.336967,0.0,0.0
1,0.221267,0.336967,0.211182,0.0
2,0.0,0.336967,0.211182,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
