## DATA EXPLORATION

In [1]:
#imports
import pandas as pd

In [2]:
alerts_cleaned_df = pd.read_csv('../data/raw/mapmf_alerts_cleaned.csv')
dict_df = pd.read_csv('../data/raw/mapmf_data_dictionary.csv')

In [3]:
alerts_cleaned_df.head()

Unnamed: 0,id,title,date,year,month,day_of_week,country,region_level_1,region_level_2,primary_incident_type,...,type_of_incident,primary_source,source_of_incident,context_of_incident,region_names,content,content_length,published_at_date,_geo_lat,_geo_lng
0,34388,Journalist Emilia Șercan targeted with smear c...,2026-01-12,2026,1,Monday,Romania,Romania,Bucharest,Intimidation/threatening,...,Intimidation/threatening | Verbal attack | Dis...,Media outlet,Media outlet | Legislation: government/public ...,Online/digital,EU Member States | Romania | Bucharest,"On 12 January 2026, investigative journalist E...",1740,2026-01-15,44.436141,26.102684
1,34382,OCCRP website targeted with DDoS attack,2026-01-12,2026,1,Monday,Netherlands,Netherlands,North Holland,Hacking/DDoS,...,Hacking/DDoS | Attack to property | Blocked di...,Unknown source of incident,Unknown source of incident,Online/digital,EU Member States | Netherlands | North Holland,"On 13 January 2026, the Organised Crime and Co...",1019,2026-01-14,52.37308,4.892453
2,34358,Slovak political commentator Peter Schutz viol...,2026-01-10,2026,1,Saturday,Slovakia,Slovakia,Eastern Slovakia,Injury (physical assault resulting in injury),...,Injury (physical assault resulting in injury) ...,Unknown source of incident,Unknown source of incident,Public place/street,EU Member States | Slovakia | Eastern Slovakia...,"On 10 January 2025, Slovak political commentat...",2401,2026-01-12,48.717227,21.249677
3,34392,Ici Nord Sylvain Charley cyber harassed,2026-01-08,2026,1,Thursday,France,France,Hauts-de-France,"Harassment, insult, bullying",...,"Harassment, insult, bullying | Verbal attack",Private individual(s),Private individual(s),Online/digital,EU Member States | France | Hauts-de-France | ...,"On 8 January 2026, Sylvain Charley, a journali...",1045,2026-01-16,50.636565,3.063528
4,34356,Police opens investigation into alleged survei...,2026-01-08,2026,1,Thursday,Ukraine,Ukraine,Київська міська громада,Surveillance and interception of journalistic ...,...,Surveillance and interception of journalistic ...,Private individual(s),Private individual(s),Online/digital,EU candidate countries | Ukraine | Київська мі...,"On 8 January 2026, Ukrainian authorities repor...",1865,2026-01-12,50.450034,30.524136


In [4]:
alerts_cleaned_df.shape

(11305, 21)

In [5]:
alerts_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11305 entries, 0 to 11304
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     11305 non-null  int64  
 1   title                  11305 non-null  object 
 2   date                   11305 non-null  object 
 3   year                   11305 non-null  int64  
 4   month                  11305 non-null  int64  
 5   day_of_week            11305 non-null  object 
 6   country                11305 non-null  object 
 7   region_level_1         11305 non-null  object 
 8   region_level_2         11174 non-null  object 
 9   primary_incident_type  11305 non-null  object 
 10  incident_type_count    11305 non-null  int64  
 11  type_of_incident       11305 non-null  object 
 12  primary_source         11305 non-null  object 
 13  source_of_incident     11305 non-null  object 
 14  context_of_incident    11305 non-null  object 
 15  re

In [7]:
# Basic counts
print(f"Total incidents: {len(alerts_cleaned_df)}")
print(f"Date range: {alerts_cleaned_df['year'].min()} - {alerts_cleaned_df['year'].max()}")
print(f"Countries covered: {alerts_cleaned_df['country'].nunique()}")

# Country distribution (top 15)
print("\n--- Top 15 Countries by Incidents ---")
print(alerts_cleaned_df['country'].value_counts().head(15))

# Incident types
print("\n--- Primary Incident Types ---")
print(alerts_cleaned_df['primary_incident_type'].value_counts())

# Perpetrator types (who's attacking journalists?)
print("\n--- Primary Source (Perpetrators) ---")
print(alerts_cleaned_df['primary_source'].value_counts())

# Context of incidents
print("\n--- Context of Incidents ---")
print(alerts_cleaned_df['context_of_incident'].value_counts())

# Incidents over time
print("\n--- Incidents by Year ---")
print(alerts_cleaned_df['year'].value_counts().sort_index())

# Missing data check
print("\n--- Missing Values ---")
print(alerts_cleaned_df.isnull().sum())

Total incidents: 11305
Date range: 2014 - 2026
Countries covered: 45

--- Top 15 Countries by Incidents ---
country
Turkey                    1507
Ukraine                    995
Germany                    989
Italy                      869
France                     709
Serbia                     632
Russia                     582
Hungary                    456
Spain                      442
Poland                     366
Belarus                    291
Georgia                    283
Croatia                    251
Greece                     240
Bosnia and Herzegovina     224
Name: count, dtype: int64

--- Primary Incident Types ---
primary_incident_type
Intimidation/threatening                                                                           1655
Harassment, insult, bullying                                                                       1019
Arrest/detention/imprisonment                                                                       930
Discredit                  

In [8]:
# 1. Perpetrator type by country (state vs non-state)
# First, create a simplified perpetrator category
def categorize_perpetrator(source):
    state_actors = ['Legislation: government/public official(s)', 'Police/state security', 
                    'Judiciary: court/judicial', 'Military/armed forces', 
                    'Public entity or authority (like tax or health department)',
                    'Media regulatory authority']
    if source in state_actors:
        return 'State actor'
    elif source in ['Private individual(s)', 'Criminal organisation', 'Private security']:
        return 'Non-state actor'
    elif source in ['Employer/publisher/colleague(s)', 'Corporation/company', 'Media outlet']:
        return 'Media/corporate'
    elif source == 'Unknown source of incident':
        return 'Unknown'
    else:
        return 'Other'

alerts_cleaned_df['perpetrator_category'] = alerts_cleaned_df['primary_source'].apply(categorize_perpetrator)

# Cross-tab: country vs perpetrator type (top 10 countries)
top_countries = alerts_cleaned_df['country'].value_counts().head(10).index
perp_by_country = pd.crosstab(
    alerts_cleaned_df[alerts_cleaned_df['country'].isin(top_countries)]['country'], 
    alerts_cleaned_df[alerts_cleaned_df['country'].isin(top_countries)]['perpetrator_category'],
    normalize='index'  # percentage within each country
).round(3) * 100

print("--- Perpetrator Type by Country (%) ---")
print(perp_by_country.sort_values('State actor', ascending=False))

--- Perpetrator Type by Country (%) ---
perpetrator_category  Media/corporate  Non-state actor  Other  State actor  \
country                                                                      
Turkey                            4.4              7.4    4.4         79.7   
Hungary                          22.1              8.3    3.1         58.6   
Russia                           12.2              8.8    4.1         55.8   
Poland                           21.0              8.2    4.9         55.2   
Ukraine                           3.6             12.0    5.5         53.5   
France                           15.4             27.5    5.1         38.8   
Spain                            19.5             21.3    9.7         33.9   
Italy                             9.6             36.7    6.6         29.9   
Serbia                           10.4             35.1    4.6         28.2   
Germany                           3.7             59.0    6.1         23.3   

perpetrator_category  U

In [9]:
# 2. Severity by country
# Create severity scale
def categorize_severity(incident_type):
    severe = ['Death (physical assault resulting in death)', 'Abduction/kidnapping', 
              'Sexual assault', 'Injury (physical assault resulting in injury)']
    moderate = ['Physical assault', 'Without injury (physical assault not resulting in injury)',
                'Arrest/detention/imprisonment', 'Raid', 'Sexual harassment']
    if incident_type in severe:
        return 'Severe'
    elif incident_type in moderate:
        return 'Moderate'
    else:
        return 'Low/Legal/Other'

alerts_cleaned_df['severity'] = alerts_cleaned_df['primary_incident_type'].apply(categorize_severity)

# Severity by country
severity_by_country = pd.crosstab(
    alerts_cleaned_df[alerts_cleaned_df['country'].isin(top_countries)]['country'],
    alerts_cleaned_df[alerts_cleaned_df['country'].isin(top_countries)]['severity'],
    normalize='index'
).round(3) * 100

print("\n--- Severity by Country (%) ---")
print(severity_by_country.sort_values('Severe', ascending=False))


--- Severity by Country (%) ---
severity  Low/Legal/Other  Moderate  Severe
country                                    
France               76.7      12.7    10.6
Ukraine              70.5      19.5    10.1
Germany              63.6      28.0     8.4
Russia               63.6      29.2     7.2
Spain                76.5      17.4     6.1
Italy                81.1      13.1     5.8
Serbia               79.0      16.1     4.9
Turkey               55.0      40.3     4.6
Poland               88.5       9.3     2.2
Hungary              94.1       5.0     0.9


In [10]:
# 3. Turkey and Ukraine deep dive
print("\n--- Turkey: Perpetrator Breakdown ---")
print(alerts_cleaned_df[alerts_cleaned_df['country'] == 'Turkey']['primary_source'].value_counts())

print("\n--- Ukraine: Perpetrator Breakdown ---")
print(alerts_cleaned_df[alerts_cleaned_df['country'] == 'Ukraine']['primary_source'].value_counts())

print("\n--- Turkey: Context of Incidents ---")
print(alerts_cleaned_df[alerts_cleaned_df['country'] == 'Turkey']['context_of_incident'].value_counts().head(10))

print("\n--- Ukraine: Context of Incidents ---")
print(alerts_cleaned_df[alerts_cleaned_df['country'] == 'Ukraine']['context_of_incident'].value_counts().head(10))

# Armed conflict specifically
print("\n--- Incidents in Armed Conflict Zones ---")
armed_conflict = alerts_cleaned_df[alerts_cleaned_df['context_of_incident'].str.contains('Armed conflict', na=False)]
print(f"Total armed conflict incidents: {len(armed_conflict)}")
print(armed_conflict['country'].value_counts())


--- Turkey: Perpetrator Breakdown ---
primary_source
Police/state security                                         491
Judiciary: court/judicial                                     359
Legislation: government/public official(s)                    277
Private individual(s)                                          97
Unknown source of incident                                     62
no data collected                                              54
Employer/publisher/colleague(s)                                39
Media regulatory authority                                     38
Public entity or authority (like tax or health department)     34
Corporation/company                                            18
Media outlet                                                   10
Criminal organisation                                           8
Political party                                                 8
Private security                                                6
Other source of incide