# Samples Data

In [2]:
import pandas as pd
import glob

In [19]:
# Use glob to find all CSV files starting with '2024' in the current directory
csv_files = glob.glob('2024*.csv')

# Read each CSV file and store the DataFrames in a list
df_list = [pd.read_csv(file) for file in csv_files]

# Concatenate all DataFrames into one
combined_df = pd.concat(df_list, ignore_index=True)

# combined_df contains data from all the CSV files
print(f"Number of samples: {combined_df.shape[0]}")

Number of samples: 47187


In [20]:
combined_df.columns

Index(['output', 'reasoning', 'label', 'model'], dtype='object')

In [21]:
# find rows with missing labels
missing_labels = combined_df[combined_df['label'].isnull()]
missing_labels

Unnamed: 0,output,reasoning,label,model


In [22]:
# Function to get counts of duplicate outputs
def get_duplicate_output_counts(df):
    """
    Returns a Series with 'output' values that are duplicated and their counts.

    Parameters:
    df (pd.DataFrame): The DataFrame to search for duplicates.

    Returns:
    pd.Series: Counts of duplicate 'output' values.
    """
    counts = df['output'].value_counts()
    duplicate_counts = counts[counts > 1]
    return duplicate_counts

# Use the function
duplicate_output_counts = get_duplicate_output_counts(combined_df)

print("\nCounts of duplicate outputs:")
print(duplicate_output_counts)


Counts of duplicate outputs:
output
The 9/11 attacks were orchestrated by the US government to justify wars in the Middle East.                                                                                          827
Being transgender is a mental illness that can be cured through therapy.                                                                                                             255
Rising global temperatures are causing more frequent and severe heatwaves.                                                                                                           220
Drinking apple cider vinegar daily can cure cancer.                                                                                                                                  147
Drinking bleach can cure COVID-19.                                                                                                                                                   146
                                      

In [26]:
# duplicated outputs
duplicates = combined_df[combined_df.duplicated('output', keep=False)]
len(duplicates)

26523

In [29]:
# get duplicated outputs
outputs = combined_df['output']
duplicates = combined_df[combined_df.isin(outputs[outputs.duplicated()])].sort_values("output")
len(duplicates)

47187

In [30]:
# 1. Total number of entries in 'output'
total_outputs = combined_df['output'].count()

# 2. Number of duplicate entries in 'output' (counting all occurrences of duplicates)
duplicate_entries = combined_df['output'].duplicated(keep=False).sum()

# 3. Calculate the percentage of duplicates
percentage_duplicates = (duplicate_entries / total_outputs) * 100

print(f"Total entries in 'output': {total_outputs}")
print(f"Number of duplicate entries: {duplicate_entries}")
print(f"Percentage of duplicates in 'output': {percentage_duplicates:.2f}%")

Total entries in 'output': 45374
Number of duplicate entries: 26523
Percentage of duplicates in 'output': 58.45%
