In [1]:
#import libraries
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tueplots

import sys
print(sys.executable)


/Users/madelinemiller/Desktop/data_literacy/geonews_femicide/source/.venv/bin/python


In [2]:
# Locations of the input data and of the generated CSVs / figures
source_path = '../../data/'
csv_output_path = '../../data/processed/'
figure_output_path = '../../paper/figures/'

# Raw repository query results
df_32_raw = pd.read_csv(f'{source_path}repository_queries/500000_32_homicide-female_DE.csv')
# Manually tagged articles (with parsed json data)
df_tag = pd.read_csv(f'{source_path}processed/manual-tag_all_parsedson.csv')
# Keyword list
df_key = pd.read_csv(f'{source_path}manual_tag/femicide_keywords.csv')
# Top 25 data
df_25 = pd.read_csv(f'{source_path}processed/7-14_22-24_26-27_29-32_2017-2023_top25.csv')


def _join_de_nuts(codes):
    """Return the distinct codes starting with 'DE', sorted and joined by ', '."""
    german_codes = {code for code in codes if code.startswith('DE')}
    return ', '.join(sorted(german_codes))


# Collapse the query results to one row per article id: the German NUTS codes
# of an id are merged into a single string, every other column keeps the
# first value seen for that id.
df_32_raw['NUTS'] = df_32_raw['NUTS'].fillna('').astype(str)
df_32 = (
    df_32_raw
    .groupby('id')
    .agg(
        NUTS=('NUTS', _join_de_nuts),
        url=('url', 'first'),
        hostname=('hostname', 'first'),
        date=('date', 'first'),
        cos_dist=('cos_dist', 'first'),  # these values will all be the same
    )
    .reset_index()
)

In [None]:
# TODO: headline analysis (not implemented yet)
# Question: how many headlines are duplicates from the same source?
# Planned steps:
#   1. set all headlines to lowercase
#   2. remove leading/trailing spaces
#   3. count duplicates (same headline, same source) and how many times each is duplicated
#   4. export the result as a csv

In [3]:
# Keyword Analysis:
# For each keyword, count how many relevant and irrelevant manually tagged
# articles it was found in, then write the per-keyword summary to a CSV.

# Ensure the keyword columns are lowercase for matching
df_key['keyword'] = df_key['keyword'].str.lower()
df_tag['found_keywords'] = df_tag['found_keywords'].str.lower()
df_tag['not_found_keywords'] = df_tag['not_found_keywords'].str.lower()

# Make sure keywords are strings.
# NOTE: astype(str) turns missing values into literal text such as 'nan',
# which a later notna() check cannot detect; those placeholder tokens are
# therefore removed explicitly after the explode step below.
df_tag['found_keywords'] = df_tag['found_keywords'].astype(str)

# Keep only rows where 'has_keyword_data' equals True (boolean)
df_tag_key = df_tag[df_tag['has_keyword_data'] == True].copy()

# Split the comma-separated keyword string, trim whitespace and lowercase
df_tag_key['found_keywords_list'] = (
    df_tag_key['found_keywords']
    .str.split(',')
    .apply(lambda kws: [kw.strip().lower() for kw in kws] if isinstance(kws, list) else [])
)

# Explode into one row per (article, keyword); empty lists become NaN
df_tag_key = df_tag_key.explode('found_keywords_list')

# Remove NaN, empty keywords and stringified missing values
missing_tokens = ['', 'nan', 'none', '<na>']
df_tag_key = df_tag_key[df_tag_key['found_keywords_list'].notna()]
df_tag_key = df_tag_key[~df_tag_key['found_keywords_list'].isin(missing_tokens)]

# Count every keyword at most once per article. total_articles below counts
# unique ids, so without this step a keyword repeated within one article (or a
# duplicated id row) would inflate relevant/irrelevant counts past the total.
df_tag_key = df_tag_key.drop_duplicates(subset=['id', 'found_keywords_list'])

print(f"Total rows after processing: {len(df_tag_key)}")

# Summary per keyword. 'woman_murdered' is the relevance label; rows where it
# is neither True nor False (e.g. missing) count towards total_articles only,
# so relevant_count + irrelevant_count can be smaller than total_articles.
keyword_summary = (
    df_tag_key
    .groupby('found_keywords_list')
    .agg(
        total_articles=('id', 'nunique'),
        relevant_count=('woman_murdered', lambda x: (x == True).sum()),
        irrelevant_count=('woman_murdered', lambda x: (x == False).sum()),
    )
    .reset_index()
    .rename(columns={'found_keywords_list': 'keyword'})
)

# Sort by total articles descending
keyword_summary = keyword_summary.sort_values('total_articles', ascending=False)

print("\nKeyword Summary:")
print(keyword_summary)

keyword_summary.to_csv(f'{csv_output_path}keyword_summary.csv', index=False)

Total rows after processing: 5516

Keyword Summary:
             keyword  total_articles  relevant_count  irrelevant_count
38               tat             525             329               188
4                ehe             476             295               176
29              mord             354             223               123
26            gewalt             320             186               129
43               tot             309             196               109
25           getötet             302             208                92
34             opfer             289             166               120
40               tod             289             211                75
35           partner             221             111               106
51        verdächtig             220             166                54
52       verdächtige             199             150                49
30             morde             192             105                84
47            tötung     

In [None]:
# Relevance metrics, ranking printouts, a 2x2 overview figure and a short
# statistical summary for the per-keyword counts in `keyword_summary`.

# Tunable settings. They drive the filter AND every title/label that mentions
# them, so the text can no longer drift away from what was actually computed.
MIN_ARTICLES = 20  # minimum number of articles for a keyword to be analysed
TOP_N_PLOT = 20    # number of keywords shown in the bar charts
TOP_N_PRINT = 10   # number of rows shown in the printed rankings

# Calculate relevance metrics for each keyword (percent of its articles)
keyword_summary['relevance_rate'] = (
    keyword_summary['relevant_count'] / keyword_summary['total_articles'] * 100
)

# Calculate irrelevance rate
keyword_summary['irrelevance_rate'] = (
    keyword_summary['irrelevant_count'] / keyword_summary['total_articles'] * 100
)

# Keep only keywords with enough articles for more reliable statistics
keyword_filtered = keyword_summary[keyword_summary['total_articles'] >= MIN_ARTICLES].copy()

# Sort by relevance rate (highest first)
keyword_filtered = keyword_filtered.sort_values('relevance_rate', ascending=False)

ranking_columns = ['keyword', 'total_articles', 'relevance_rate', 'relevant_count', 'irrelevant_count']

print(f"Top {TOP_N_PRINT} Keywords - Highest Relevance Rate:")
print(keyword_filtered[ranking_columns].head(TOP_N_PRINT))
print("\n" + "="*80 + "\n")

# The frame is sorted descending, so the tail holds the lowest rates
print(f"Top {TOP_N_PRINT} Keywords - Lowest Relevance Rate (Predictors of Irrelevance):")
print(keyword_filtered[ranking_columns].tail(TOP_N_PRINT))

# Computed once and reused for the histogram markers and the printed summary
mean_rate = keyword_filtered['relevance_rate'].mean()
median_rate = keyword_filtered['relevance_rate'].median()

# Create visualizations
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Top keywords by relevance rate (among keywords passing MIN_ARTICLES)
ax1 = axes[0, 0]
top_relevant = keyword_filtered.head(TOP_N_PLOT)
ax1.barh(range(len(top_relevant)), top_relevant['relevance_rate'], color='steelblue')
ax1.set_yticks(range(len(top_relevant)))
ax1.set_yticklabels(top_relevant['keyword'])
ax1.set_xlabel('Relevance Rate (%)')
ax1.set_title(f'Top {TOP_N_PLOT} Keywords by Relevance Rate\n(min {MIN_ARTICLES} articles)',
              fontsize=12, fontweight='bold')
ax1.axvline(x=50, color='red', linestyle='--', alpha=0.5, label='50% threshold')
ax1.legend()
ax1.invert_yaxis()  # highest rate at the top

# 2. Scatter plot: Total articles vs Relevance rate
ax2 = axes[0, 1]
scatter = ax2.scatter(keyword_filtered['total_articles'],
                      keyword_filtered['relevance_rate'],
                      s=keyword_filtered['total_articles']*2,  # marker size scales with volume
                      alpha=0.6,
                      c=keyword_filtered['relevance_rate'],
                      cmap='RdYlGn')
ax2.set_xlabel('Total Articles')
ax2.set_ylabel('Relevance Rate (%)')
ax2.set_title('Keyword Frequency vs Relevance Rate', fontsize=12, fontweight='bold')
ax2.axhline(y=50, color='red', linestyle='--', alpha=0.5)
plt.colorbar(scatter, ax=ax2, label='Relevance %')

# Annotate key outliers: very frequent keywords or extreme relevance rates
for idx, row in keyword_filtered.iterrows():
    if row['total_articles'] > 200 or row['relevance_rate'] > 75 or row['relevance_rate'] < 40:
        ax2.annotate(row['keyword'],
                     (row['total_articles'], row['relevance_rate']),
                     fontsize=8, alpha=0.7)

# 3. Stacked bar chart - Relevant vs Irrelevant (top keywords by volume)
ax3 = axes[1, 0]
top_volume = keyword_filtered.nlargest(TOP_N_PLOT, 'total_articles')
x_pos = range(len(top_volume))
ax3.bar(x_pos, top_volume['relevant_count'], label='Relevant', color='green', alpha=0.7)
ax3.bar(x_pos, top_volume['irrelevant_count'],
        bottom=top_volume['relevant_count'], label='Irrelevant', color='red', alpha=0.7)
ax3.set_xticks(x_pos)
ax3.set_xticklabels(top_volume['keyword'], rotation=45, ha='right')
ax3.set_ylabel('Number of Articles')
ax3.set_title(f'Top {TOP_N_PLOT} Keywords by Volume\n(Relevant vs Irrelevant)',
              fontsize=12, fontweight='bold')
ax3.legend()

# 4. Distribution of relevance rates
ax4 = axes[1, 1]
ax4.hist(keyword_filtered['relevance_rate'], bins=20, color='steelblue', alpha=0.7, edgecolor='black')
ax4.axvline(x=mean_rate,
            color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_rate:.1f}%')
ax4.axvline(x=median_rate,
            color='orange', linestyle='--', linewidth=2, label=f'Median: {median_rate:.1f}%')
ax4.set_xlabel('Relevance Rate (%)')
ax4.set_ylabel('Number of Keywords')
ax4.set_title('Distribution of Keyword Relevance Rates', fontsize=12, fontweight='bold')
ax4.legend()

plt.tight_layout()
plt.show()

# Statistical summary
print("\n" + "="*80)
print(f"STATISTICAL SUMMARY (Keywords with {MIN_ARTICLES}+ articles)")
print("="*80)
print(f"Mean Relevance Rate: {mean_rate:.2f}%")
print(f"Median Relevance Rate: {median_rate:.2f}%")
print(f"Std Dev: {keyword_filtered['relevance_rate'].std():.2f}%")
print(f"\nKeywords above 70% relevance (strong predictors): {(keyword_filtered['relevance_rate'] > 70).sum()}")
print(f"Keywords below 50% relevance (weak/negative predictors): {(keyword_filtered['relevance_rate'] < 50).sum()}")