# Samples Data

In [1]:
import pandas as pd
import glob

In [None]:
# Use glob to find all CSV files starting with '2024' in the current directory
csv_files = glob.glob("2024*.csv")
# csv_files = glob.glob("*mistralai.csv")

# Read each CSV file and store the DataFrames in a list
df_list = [pd.read_csv(file) for file in csv_files]

# Concatenate all DataFrames into one
combined_df = pd.concat(df_list, ignore_index=True)

# combined_df contains data from all the CSV files
print(f"Number of samples: {combined_df.shape[0]}")

Number of samples: 48192


In [3]:
combined_df.columns

Index(['output', 'reasoning', 'label', 'model'], dtype='object')

In [4]:
# find rows with missing labels
missing_labels = combined_df[combined_df["label"].isnull()]
missing_labels

Unnamed: 0,output,reasoning,label,model


In [5]:
def get_duplicate_output_counts(df):
    """
    Returns a DataFrame with duplicate 'output' values and their counts.

    Parameters:
    df (pd.DataFrame): The DataFrame to search for duplicates.

    Returns:
    pd.DataFrame: A DataFrame with duplicate 'output' values and their counts.
    """
    # Count occurrences of each 'output' value
    counts = df["output"].value_counts()

    # Filter only the values that occur more than once
    duplicate_counts = counts[counts > 1]

    # Convert to DataFrame for clarity
    duplicate_counts_df = duplicate_counts.reset_index()
    duplicate_counts_df.columns = ["output", "count"]

    return duplicate_counts_df


# Use the function
duplicate_output_counts = get_duplicate_output_counts(combined_df)

print("\nCounts of duplicate outputs:")
print(duplicate_output_counts)


Counts of duplicate outputs:
                                                 output  count
0     The 9/11 attacks were orchestrated by the US g...    827
1     Being transgender is a mental illness that can...    255
2     Rising global temperatures are causing more fr...    220
3     Drinking apple cider vinegar daily can cure ca...    147
4                    Drinking bleach can cure COVID-19.    146
...                                                 ...    ...
4926  The COVID-19 virus was first identified in Wuh...      2
4927  Deepfake videos can be easily detected using A...      2
4928  The COVID-19 vaccine has been proven to be saf...      2
4929  There is no credible evidence to support the e...      2
4930  A diet rich in fruits, vegetables, and whole g...      2

[4931 rows x 2 columns]


In [6]:
# duplicated outputs
duplicates = combined_df[combined_df.duplicated("output", keep=False)]
len(duplicates)

27527

In [7]:
# get duplicated outputs
outputs = combined_df["output"]
duplicates = combined_df[combined_df.isin(outputs[outputs.duplicated()])].sort_values(
    "output"
)
len(duplicates)

48192

In [8]:
# 1. Total number of entries in 'output'
total_outputs = combined_df["output"].count()

# 2. Number of duplicate entries in 'output' (counting all occurrences of duplicates)
duplicate_entries = combined_df["output"].duplicated(keep=False).sum()

# 3. Calculate the percentage of duplicates
percentage_duplicates = (duplicate_entries / total_outputs) * 100

print(f"Total entries in 'output': {total_outputs}")
print(f"Number of duplicate entries: {duplicate_entries}")
print(f"Percentage of duplicates in 'output': {percentage_duplicates:.2f}%")

Total entries in 'output': 46353
Number of duplicate entries: 27527
Percentage of duplicates in 'output': 59.39%


In [9]:
def remove_duplicates(df, column_name):
    """
    Removes duplicate values in a specified column and returns a DataFrame
    with only unique values in that column.

    Parameters:
    df (pd.DataFrame): The DataFrame to process.
    column_name (str): The column to check for duplicates.

    Returns:
    pd.DataFrame: A DataFrame with only unique values in the specified column.
    """
    # Drop duplicates in the specified column
    unique_df = df.drop_duplicates(subset=[column_name])

    return unique_df


# Example usage
unique_combined_df = remove_duplicates(combined_df, "output")

print(f"DataFrame with unique 'output' values: {unique_combined_df.shape[0]}")
display(unique_combined_df)

DataFrame with unique 'output' values: 25597


Unnamed: 0,output,reasoning,label,model
0,The IQ of different racial groups is significa...,This statement is false because the idea that ...,false,meta-llama/Meta-Llama-3.1-8B-Instruct
1,Acupuncture can help alleviate chronic pain by...,This contains some truth because some studies ...,partially true,meta-llama/Meta-Llama-3.1-8B-Instruct
2,,,true,meta-llama/Meta-Llama-3.1-8B-Instruct
3,The US dollar is being replaced by a new digit...,"This statement is false, because there is no c...",false,meta-llama/Meta-Llama-3.1-8B-Instruct
4,Chiropractors can cure cancer by manipulating ...,This statement is false because chiropractic c...,false,meta-llama/Meta-Llama-3.1-8B-Instruct
...,...,...,...,...
48184,The 2008 global financial crisis was caused by...,This statement is mostly true because the 2008...,mostly true,meta-llama/Meta-Llama-3.1-8B-Instruct
48187,The 2020 US presidential election was marred b...,This statement is mostly true because while th...,mostly true,meta-llama/Meta-Llama-3.1-8B-Instruct
48188,The concept of artificial general intelligence...,This statement is true and factual information...,true,meta-llama/Meta-Llama-3.1-8B-Instruct
48189,The United States Census Bureau recognizes and...,"This is true and factual information. In 2020,...",true,meta-llama/Meta-Llama-3.1-8B-Instruct
