# Samples Data

In [1]:
import pandas as pd
import glob

In [4]:
# Use glob to find all CSV files starting with '2024' in the current directory
csv_files = glob.glob("data/202*.csv")
# csv_files = glob.glob("*mistralai.csv")

# Read each CSV file and store the DataFrames in a list
df_list = [pd.read_csv(file) for file in csv_files]

# Concatenate all DataFrames into one
combined_df = pd.concat(df_list, ignore_index=True)

# combined_df contains data from all the CSV files
print(f"Number of samples: {combined_df.shape[0]}")

Number of samples: 69256


In [5]:
combined_df.columns

Index(['output', 'reasoning', 'label', 'model'], dtype='object')

In [6]:
# find rows with missing labels
missing_labels = combined_df[combined_df["label"].isnull()]
missing_labels

Unnamed: 0,output,reasoning,label,model


In [7]:
def get_duplicate_output_counts(df):
    """
    Returns a DataFrame with duplicate 'output' values and their counts.

    Parameters:
    df (pd.DataFrame): The DataFrame to search for duplicates.

    Returns:
    pd.DataFrame: A DataFrame with duplicate 'output' values and their counts.
    """
    # Count occurrences of each 'output' value
    counts = df["output"].value_counts()

    # Filter only the values that occur more than once
    duplicate_counts = counts[counts > 1]

    # Convert to DataFrame for clarity
    duplicate_counts_df = duplicate_counts.reset_index()
    duplicate_counts_df.columns = ["output", "count"]

    return duplicate_counts_df


# Use the function
duplicate_output_counts = get_duplicate_output_counts(combined_df)

print("\nCounts of duplicate outputs:")
print(duplicate_output_counts)


Counts of duplicate outputs:
                                                 output  count
0     The 9/11 attacks were orchestrated by the US g...    943
1     Being transgender is a mental illness that can...    330
2     Rising global temperatures are causing more fr...    270
3                    Drinking bleach can cure COVID-19.    171
4     Drinking apple cider vinegar daily can cure ca...    167
...                                                 ...    ...
6018  Investing in cryptocurrency can be a lucrative...      2
6019  Mail-in ballots are a safe and secure way to c...      2
6020  Deepfake videos can be detected by analyzing i...      2
6021  The development of AI has led to significant a...      2
6022  The Earth's temperature has always fluctuated ...      2

[6023 rows x 2 columns]


In [10]:
# duplicated outputs
duplicates = combined_df[combined_df.duplicated("output", keep=False)]
len(duplicates)

39496

In [11]:
# get duplicated outputs
outputs = combined_df["output"]
duplicates = combined_df[combined_df.isin(outputs[outputs.duplicated()])].sort_values(
    "output"
)
len(duplicates)

69256

In [12]:
# 1. Total number of entries in 'output'
total_outputs = combined_df["output"].count()

# 2. Number of duplicate entries in 'output' (counting all occurrences of duplicates)
duplicate_entries = combined_df["output"].duplicated(keep=False).sum()

# 3. Calculate the percentage of duplicates
percentage_duplicates = (duplicate_entries / total_outputs) * 100

print(f"Total entries in 'output': {total_outputs}")
print(f"Number of duplicate entries: {duplicate_entries}")
print(f"Percentage of duplicates in 'output': {percentage_duplicates:.2f}%")

Total entries in 'output': 66425
Number of duplicate entries: 39496
Percentage of duplicates in 'output': 59.46%


In [13]:
def remove_duplicates(df, column_name):
    """
    Removes duplicate values in a specified column and returns a DataFrame
    with only unique values in that column.

    Parameters:
    df (pd.DataFrame): The DataFrame to process.
    column_name (str): The column to check for duplicates.

    Returns:
    pd.DataFrame: A DataFrame with only unique values in the specified column.
    """
    # Drop duplicates in the specified column
    unique_df = df.drop_duplicates(subset=[column_name])

    return unique_df


# Example usage
unique_combined_df = remove_duplicates(combined_df, "output")

print(f"DataFrame with unique 'output' values: {unique_combined_df.shape[0]}")
display(unique_combined_df)

DataFrame with unique 'output' values: 35784


Unnamed: 0,output,reasoning,label,model
0,,,mostly true,mistralai/Mixtral-8x7B-Instruct-v0.1
1,A popular social media platform suffered a dat...,"This statement is true, as numerous tech compa...",true,mistralai/Mixtral-8x7B-Instruct-v0.1
2,Human-caused climate change has worsened the s...,This statement is mostly true because human-in...,mostly true,mistralai/Mixtral-8x7B-Instruct-v0.1
3,The idea of a single secret society controllin...,While the concept of a single secret society c...,mostly true,mistralai/Mixtral-8x7B-Instruct-v0.1
4,Vaccines cause autism in children.,This statement is false. There is no scientifi...,false,mistralai/Mixtral-8x7B-Instruct-v0.1
...,...,...,...,...
68637,The US Census Bureau classifies people of Hisp...,This statement is mostly true because the US C...,mostly true,meta-llama/Meta-Llama-3.1-8B-Instruct
68640,Consuming blueberries regularly can help lower...,This statement is mostly true because blueberr...,mostly true,meta-llama/Meta-Llama-3.1-8B-Instruct
68642,The 1921 Tulsa Race Massacre resulted in the d...,This statement is mostly true because it is wi...,mostly true,meta-llama/Meta-Llama-3.1-8B-Instruct
68644,The new iPhone model has a built-in backdoor t...,This statement is false because there is no cr...,false,meta-llama/Meta-Llama-3.1-8B-Instruct
