In [None]:
import pandas as pd
# Location of the dataset CSV (a relative path, resolved against the
# working directory the notebook is run from).
file_path = 'data/dataset.csv'

# Load the whole dataset; later cells read from this frame.
df = pd.read_csv(filepath_or_buffer=file_path)

# Frequency of each distinct 'bias_type' value. value_counts() skips NaN by
# default and orders the result from most to least frequent.
bias_type_counts = df.loc[:, 'bias_type'].value_counts()

# Leave the Series as the cell's last expression so the notebook renders it.
bias_type_counts


In [None]:
# Number of rows for every (bias_type, type_category) pair that occurs.
pair_sizes = df.groupby(by=['bias_type', 'type_category']).size()

# Pivot 'type_category' into columns; pairs that never occur become 0
# instead of NaN.
type_counts = pair_sizes.unstack(fill_value=0)

# Last expression of the cell: rendered as a table by the notebook.
type_counts


In [None]:
# Frequency of each distinct 'target_gender' value. value_counts() leaves
# NaN out by default, so missing entries are not counted as a gender.
gender_counts = df['target_gender'].value_counts()

# One entry per distinct value, so the length is the number of genders.
num_genders = len(gender_counts)

# Report the number of distinct values, then the per-value frequencies.
print("Number of genders represented:", num_genders)
print("\nCounts of each gender:")
print(gender_counts)

In [None]:
# Select every row whose 'context_norwegian' value is missing.
missing_norwegian = df['context_norwegian'].isna()
nan_rows = df.loc[missing_norwegian]

# Report each affected row label together with its 'context' value.
# Only the 'context' column is needed, so iterate that Series directly.
for index, original_context in nan_rows['context'].items():
    print(f"Index: {index}, Original Context: {original_context}")


In [None]:
import matplotlib.pyplot as plt

# Data
# NOTE: these percentages are hard-coded placeholders; they are not computed
# from the dataset loaded in this notebook. Replace them with measured values.
categories = ['Stereotype', 'Anti-Stereotype']
values = [70, 30]  # Replace with your percentages

# Create a bar chart on an explicitly created figure/axes instead of pyplot's
# implicit "current" figure, so the bars cannot be drawn onto a figure that
# some other code left open.
fig, ax = plt.subplots()
ax.bar(categories, values, color=['red', 'blue'])
ax.set_ylabel('Percentage')
ax.set_title('Model Choices: Stereotype vs Anti-Stereotype')
plt.show()


In [None]:
# Function to count words in a column, handling NaN values
def word_count(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Values for which ``pd.notna`` is false (e.g. ``None`` or ``NaN``) count
    as zero words. Any other value is converted with ``str()`` and split on
    runs of whitespace, so an empty string also yields 0.
    """
    if not pd.notna(text):
        return 0
    tokens = str(text).split()
    return len(tokens)

# Column groups shared by the steps below.
group_keys = ["bias_type", "type_category"]
merge_keys = ["bias_type", "type_category", "word_count"]
count_columns = ["stereotype_count", "anti_stereotype_count", "unrelated_count"]

# Add one word-count column per text column (this writes new columns onto df).
df["stereotype_word_count"] = df["stereotype"].apply(word_count)
df["anti_stereotype_word_count"] = df["anti_stereotype"].apply(word_count)
df["unrelated_word_count"] = df["unrelated"].apply(word_count)

# For each text column: count rows per (bias_type, type_category, word count)
# and expose the word count under the common name "word_count" so the three
# tables can be merged on identical keys.
stereotype_counts = (
    df.groupby(group_keys + ["stereotype_word_count"])
    .size()
    .reset_index(name="stereotype_count")
    .rename(columns={"stereotype_word_count": "word_count"})
)
anti_stereotype_counts = (
    df.groupby(group_keys + ["anti_stereotype_word_count"])
    .size()
    .reset_index(name="anti_stereotype_count")
    .rename(columns={"anti_stereotype_word_count": "word_count"})
)
unrelated_counts = (
    df.groupby(group_keys + ["unrelated_word_count"])
    .size()
    .reset_index(name="unrelated_count")
    .rename(columns={"unrelated_word_count": "word_count"})
)

# Outer merges keep every key combination seen in any of the three tables;
# combinations absent from a table come back as NaN, are filled with 0, and
# the count columns are then cast back to integers.
final_summary = stereotype_counts.merge(anti_stereotype_counts, on=merge_keys, how="outer")
final_summary = final_summary.merge(unrelated_counts, on=merge_keys, how="outer")
final_summary = final_summary.fillna(0).astype(dict.fromkeys(count_columns, int))

# Display the result
print(final_summary)