In [None]:
# Welcome to NOTEBOOK 3: Reasoning of a Note dataset

# This notebook will show the distribution of reasoning for a proposed Note in the dataset.

# It will compare Notes citing a European fact-checker against Notes citing other sources.

import pandas as pd

# Location of the cleaned Notes dataset (clean_4).
# The original cell put a `#` placeholder inside the call, which commented out
# the closing parenthesis and made the cell a SyntaxError. Keeping the
# placeholder in a named constant leaves the cell valid Python.
# TODO: replace the placeholder below with the input file path to Notes dataset clean_4.
NOTES_DATASET_PATH = "path/to/notes_dataset_clean_4.csv"

# Load the Notes dataset; later cells read the frame through the name `df`.
df = pd.read_csv(NOTES_DATASET_PATH)

In [None]:
def _as_percentages(counts):
    """Divide every entry of `counts` by the Series total and multiply by 100."""
    return counts.div(counts.sum()).mul(100)


# Row masks built from the two source-flag columns.
# A note goes to the European group when either flag equals 1, and to the
# "other" group only when both flags equal 0.
has_european_flag = df['noteFromEFCSN'].eq(1) | df['noteFromEDMO'].eq(1)
has_no_european_flag = df['noteFromEFCSN'].eq(0) & df['noteFromEDMO'].eq(0)

european_fact_checker_notes = df.loc[has_european_flag]
other_source_notes = df.loc[has_no_european_flag]

# Reason columns are picked by position: columns 3 through 14 of `df`.
reason_columns = df.columns[3:15]

# Column-wise totals of the reason columns within each group.
european_distribution = european_fact_checker_notes[reason_columns].sum()
other_distribution = other_source_notes[reason_columns].sum()

# Express each reason as a share (in percent) of its group's summed totals.
european_distribution_percentage = _as_percentages(european_distribution)
other_distribution_percentage = _as_percentages(other_distribution)

# Print both distributions, heading first and values second.
for heading, percentages in (
    ("Distribution of reasons for European fact-checker notes:", european_distribution_percentage),
    ("\nDistribution of reasons for other source notes:", other_distribution_percentage),
):
    print(heading)
    print(percentages)

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# Place the two percentage Series side by side, one column per note group,
# so the chart below can read both from a single frame.
series_by_group = {
    'European Fact-Checker Notes': european_distribution_percentage,
    'Other Source Notes': other_distribution_percentage,
}
comparison_df = pd.DataFrame(series_by_group)

# Min-max scaled copy of the comparison frame (optional).
# NOTE(review): this scaled frame is built here but the chart below plots the
# unscaled percentages from `comparison_df`.
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(comparison_df)
comparison_df_normalized = pd.DataFrame(
    normalized_data,
    index=comparison_df.index,
    columns=comparison_df.columns,
)

# Grouped bar chart: one slot per reason, two bars per slot.
x = np.arange(comparison_df.shape[0])  # one tick position per reason
width = 0.35  # width of a single bar

fig, ax = plt.subplots(figsize=(12, 6))

# Draw the first group shifted half a bar to the left and the second group
# half a bar to the right; each bar set is labelled with its column name.
bars1, bars2 = (
    ax.bar(x + shift, comparison_df[group], width, label=group)
    for shift, group in zip((-width / 2, width / 2), comparison_df.columns)
)

# Axis labels, title, tick labels and legend.
ax.set(xlabel='Reasons', ylabel='Percentage', title='Comparison of Reasons for Notes')
ax.set_xticks(x)
ax.set_xticklabels(comparison_df.index, rotation=45, ha='right')
ax.legend()

# Write each bar's value (one decimal) just above the bar.
for bar in (*bars1, *bars2):
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        f'{bar_top:.1f}',
        xy=(bar_center, bar_top),
        xytext=(0, 3),  # 3 points vertical offset
        textcoords="offset points",
        ha='center',
        va='bottom',
    )

plt.tight_layout()
plt.show()