# Samples Data

In [1]:
import pandas as pd
import glob

In [2]:
# Use glob to find all CSV files starting with '2024' in the current directory
csv_files = glob.glob("data/generated_data/202*.csv")
# csv_files = glob.glob("*mistralai.csv")

# Read each CSV file and store the DataFrames in a list
df_list = [pd.read_csv(file) for file in csv_files]

# Concatenate all DataFrames into one
combined_df = pd.concat(df_list, ignore_index=True)

# combined_df contains data from all the CSV files
print(f"Number of samples: {combined_df.shape[0]}")

Number of samples: 77256


In [3]:
combined_df.columns

Index(['output', 'reasoning', 'label', 'model'], dtype='object')

In [4]:
# find rows with missing labels
missing_labels = combined_df[combined_df["label"].isnull()]
missing_labels

Unnamed: 0,output,reasoning,label,model


In [5]:
def get_duplicate_output_counts(df):
    """
    Returns a DataFrame with duplicate 'output' values and their counts.

    Parameters:
    df (pd.DataFrame): The DataFrame to search for duplicates.

    Returns:
    pd.DataFrame: A DataFrame with duplicate 'output' values and their counts.
    """
    # Count occurrences of each 'output' value
    counts = df["output"].value_counts()

    # Filter only the values that occur more than once
    duplicate_counts = counts[counts > 1]

    # Convert to DataFrame for clarity
    duplicate_counts_df = duplicate_counts.reset_index()
    duplicate_counts_df.columns = ["output", "count"]

    return duplicate_counts_df


# Use the function
duplicate_output_counts = get_duplicate_output_counts(combined_df)

print("\nCounts of duplicate outputs:")
print(duplicate_output_counts)


Counts of duplicate outputs:
                                                 output  count
0     The 9/11 attacks were orchestrated by the US g...    944
1     Being transgender is a mental illness that can...    349
2     Rising global temperatures are causing more fr...    270
3                    Drinking bleach can cure COVID-19.    171
4     Drinking apple cider vinegar daily can cure ca...    167
...                                                 ...    ...
6433  Eating a Mediterranean-style diet rich in frui...      2
6434  AI systems can replace human emotions and deve...      2
6435  The Federal Reserve is an independent agency t...      2
6436  The United States has a 100% voter turnout rat...      2
6437  NASA's Curiosity rover has been exploring Mars...      2

[6438 rows x 2 columns]


In [6]:
# duplicated outputs
duplicates = combined_df[combined_df.duplicated("output", keep=False)]
len(duplicates)

41800

In [7]:
# get duplicated outputs
outputs = combined_df["output"]
duplicates = combined_df[combined_df.isin(outputs[outputs.duplicated()])].sort_values(
    "output"
)
len(duplicates)

77256

In [8]:
# 1. Total number of entries in 'output'
total_outputs = combined_df["output"].count()

# 2. Number of duplicate entries in 'output' (counting all occurrences of duplicates)
duplicate_entries = combined_df["output"].duplicated(keep=False).sum()

# 3. Calculate the percentage of duplicates
percentage_duplicates = (duplicate_entries / total_outputs) * 100

print(f"Total entries in 'output': {total_outputs}")
print(f"Number of duplicate entries: {duplicate_entries}")
print(f"Percentage of duplicates in 'output': {percentage_duplicates:.2f}%")

Total entries in 'output': 73630
Number of duplicate entries: 41800
Percentage of duplicates in 'output': 56.77%


In [9]:
def remove_duplicates(df, column_name):
    """
    Removes duplicate values in a specified column and returns a DataFrame
    with only unique values in that column.

    Parameters:
    df (pd.DataFrame): The DataFrame to process.
    column_name (str): The column to check for duplicates.

    Returns:
    pd.DataFrame: A DataFrame with only unique values in the specified column.
    """
    # Drop duplicates in the specified column
    unique_df = df.drop_duplicates(subset=[column_name])
    # Remove NaN values in the specified column
    unique_df = unique_df[unique_df[column_name].notna()]

    return unique_df


# Example usage
unique_combined_df = remove_duplicates(combined_df, "output")

print(f"DataFrame with unique 'output' values: {unique_combined_df.shape[0]}")
display(unique_combined_df)

DataFrame with unique 'output' values: 41894


Unnamed: 0,output,reasoning,label,model
1,A popular social media platform suffered a dat...,"This statement is true, as numerous tech compa...",true,mistralai/Mixtral-8x7B-Instruct-v0.1
2,Human-caused climate change has worsened the s...,This statement is mostly true because human-in...,mostly true,mistralai/Mixtral-8x7B-Instruct-v0.1
3,The idea of a single secret society controllin...,While the concept of a single secret society c...,mostly true,mistralai/Mixtral-8x7B-Instruct-v0.1
4,Vaccines cause autism in children.,This statement is false. There is no scientifi...,false,mistralai/Mixtral-8x7B-Instruct-v0.1
5,The Earth is flat and governments are hiding t...,This statement is false because the Earth is a...,false,mistralai/Mixtral-8x7B-Instruct-v0.1
...,...,...,...,...
76637,The US Census Bureau classifies people of Hisp...,This statement is mostly true because the US C...,mostly true,meta-llama/Meta-Llama-3.1-8B-Instruct
76640,Consuming blueberries regularly can help lower...,This statement is mostly true because blueberr...,mostly true,meta-llama/Meta-Llama-3.1-8B-Instruct
76642,The 1921 Tulsa Race Massacre resulted in the d...,This statement is mostly true because it is wi...,mostly true,meta-llama/Meta-Llama-3.1-8B-Instruct
76644,The new iPhone model has a built-in backdoor t...,This statement is false because there is no cr...,false,meta-llama/Meta-Llama-3.1-8B-Instruct


In [10]:
# select shortest output
shortest_output = unique_combined_df.loc[
    unique_combined_df["output"].str.len().idxmin()
]
shortest_output["output"], shortest_output["label"]

('Homeopathy can cure cancer.', 'false')

In [11]:
# longest output
longest_output = unique_combined_df.loc[unique_combined_df["output"].str.len().idxmax()]
longest_output["output"], longest_output["label"]

('The COVID-19 virus is believed to have originated from an animal source, possibly bats, and was first detected in Wuhan, China in December 2019. Lockdowns and social distancing measures have been implemented globally to slow the spread of the virus, and various treatments such as remdesivir and dexamethasone have been approved for use in severe cases.  Modified Outputs:  - LABEL: mostly true OUTPUT: The COVID-19 virus is believed to have originated from an animal source, possibly bats, but some scientists suggest it may have been created in a lab. Lockdowns and social distancing measures have been effective in slowing the spread of the virus in most countries, and remdesivir has been shown to reduce hospitalization rates in severe cases.',
 'mostly true')

In [12]:
# write combined unique data to CSV
file_path = "data/final_data/generated_combined_unique_outputs.csv"
unique_combined_df.to_csv(file_path, index=False)

In [13]:
# read combined unique data
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,output,reasoning,label,model
0,A popular social media platform suffered a dat...,"This statement is true, as numerous tech compa...",true,mistralai/Mixtral-8x7B-Instruct-v0.1
1,Human-caused climate change has worsened the s...,This statement is mostly true because human-in...,mostly true,mistralai/Mixtral-8x7B-Instruct-v0.1
2,The idea of a single secret society controllin...,While the concept of a single secret society c...,mostly true,mistralai/Mixtral-8x7B-Instruct-v0.1
3,Vaccines cause autism in children.,This statement is false. There is no scientifi...,false,mistralai/Mixtral-8x7B-Instruct-v0.1
4,The Earth is flat and governments are hiding t...,This statement is false because the Earth is a...,false,mistralai/Mixtral-8x7B-Instruct-v0.1


In [14]:
# number of rows generated by Llama
llama_df = df[df["model"].str.contains("llama", case=False)]
llama_df.shape

(41636, 4)

In [15]:
# number of rows generated by Mistral
mistral_df = df[df["model"].str.contains("mistral", case=False)]
mistral_df.shape

(258, 4)

### train/validation/test split

In [27]:
from sklearn.model_selection import StratifiedShuffleSplit


# split data into train/validation/test sets
stratify_col = df["label"]

# first split: train (80%) and test (20%)
split1 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_val_idx, test_idx in split1.split(df, stratify_col):
    train_val_df = df.iloc[train_val_idx]
    test_df = df.iloc[test_idx]

# create copies of the data
train_val_df = df.iloc[train_val_idx].copy()
test_df = df.iloc[test_idx].copy()

# second split: train (80% of 80%) and validation (20% of 80%) => 64% train, 16% validation
split2 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_idx, val_idx in split2.split(train_val_df, stratify_col.iloc[train_val_idx]):
    train_df = train_val_df.iloc[train_idx]
    val_df = train_val_df.iloc[val_idx]

# create copies of the data
train_df = train_val_df.iloc[train_idx].copy()
val_df = train_val_df.iloc[val_idx].copy()

# rename "output" column to "text"
train_df.rename(columns={"output": "text"}, inplace=True)
val_df.rename(columns={"output": "text"}, inplace=True)
test_df.rename(columns={"output": "text"}, inplace=True)

# create a label mapping
label_to_id = {label: idx for idx, label in enumerate(sorted(df["label"].unique()))}
print("Label to ID mapping:", label_to_id)

# apply to each split
train_df["label"] = train_df["label"].map(label_to_id)
val_df["label"] = val_df["label"].map(label_to_id)
test_df["label"] = test_df["label"].map(label_to_id)


# print splits
print(f"Train size: {train_df.shape[0]}")
print(f"Validation size: {val_df.shape[0]}")
print(f"Test size: {test_df.shape[0]}")

# save the splits to CSV files
train_df.to_csv("data/train_data/train.csv", index=False)
val_df.to_csv("data/val_data/val.csv", index=False)
test_df.to_csv("data/test_data/test.csv", index=False)

Label to ID mapping: {'false': 0, 'mostly true': 1, 'partially true': 2, 'true': 3}
Train size: 26812
Validation size: 6703
Test size: 8379


In [28]:
train_df = pd.read_csv("data/train_data/train.csv")
train_df.head()

Unnamed: 0,text,reasoning,label,model
0,Solar panels can generate electricity from sun...,This is true and factual information. Solar pa...,3,meta-llama/Meta-Llama-3.1-8B-Instruct
1,Vaccinating children against COVID-19 can redu...,This statement is mostly true because numerous...,1,meta-llama/Meta-Llama-3.1-8B-Instruct
2,The US government prints money to secretly fun...,This statement is false because there is no cr...,0,meta-llama/Meta-Llama-3.1-8B-Instruct
3,Scientists have discovered a new medication th...,This is false because there is no scientific e...,0,meta-llama/Meta-Llama-3.1-8B-Instruct
4,Scientific studies have consistently shown tha...,This statement is true and factual information...,3,meta-llama/Meta-Llama-3.1-8B-Instruct
