## CPJ data exploration & preprocessing

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Locate the repository root from the working directory. The notebook is
# expected to run from notebooks/target-exploration, two levels below it.
PROJECT_ROOT = Path.cwd().parents[1]
csv_path = PROJECT_ROOT.joinpath("data", "raw", "target", "cjp-historical-killings.csv")

# Load the raw killings table exactly as exported.
cjp_df = pd.read_csv(csv_path)

In [3]:
# (rows, columns) of the raw table as loaded from the CSV.
cjp_df.shape

(2550, 8)

In [4]:
# Preview the first rows to eyeball column contents and value formats.
cjp_df.head()

Unnamed: 0,Name,Status,Date,Country,Journalist or Media Worker,Motive,Type of Death,cpj.org URL
0,Abadullah Hananzai,Killed,"April 30, 2018",Afghanistan,"Radio Azadi,Radio Free Europe/Radio Liberty",Confirmed,Murder,https://cpj.org/data/people/abadullah-hananzai/
1,Abay Hailu,Killed,"February 9, 1998",Ethiopia,Agiere,Confirmed,Dangerous Assignment,https://cpj.org/data/people/abay-hailu/
2,Abbas al-Dailami,Killed,"September 10, 2025",Yemen,"26 September, Yemen",Confirmed,Murder,https://cpj.org/data/people/abbas-al-dailami/
3,Abd al-Karim al-Ezzo,Killed,"December 21, 2012",Syria,Freelance,Confirmed,Crossfire,https://cpj.org/data/people/abd-al-karim-al-ezzo/
4,Abdallah Alwan,Killed,"December 18, 2023",Israel and the Occupied Palestinian Territory,"Holy Quran Radio,Midan,Mugtama,Al-Jazeera",Confirmed,Dangerous Assignment,https://cpj.org/data/people/abdallah-alwan/


In [5]:
# Missing-value overview.
# Jupyter renders only the LAST bare expression of a cell, so every
# intermediate summary is passed to display() explicitly; otherwise it is
# computed and silently thrown away.
na_mask = cjp_df.isna()  # boolean frame: True where a cell is missing
na_counts = na_mask.sum().sort_values(ascending=False)

display(na_counts)  # number of missing values per column
display((na_mask.mean() * 100).sort_values(ascending=False))  # percent missing per column

cjp_df.info()  # prints dtypes and non-null counts to stdout

# Sample of rows with at least one missing value (head() keeps the output short).
display(cjp_df[na_mask.any(axis=1)].head())

# Total number of missing cells across the whole table.
na_counts.sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2550 entries, 0 to 2549
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Name                        2550 non-null   object
 1   Status                      2550 non-null   object
 2   Date                        2550 non-null   object
 3   Country                     2550 non-null   object
 4   Journalist or Media Worker  2547 non-null   object
 5   Motive                      2550 non-null   object
 6   Type of Death               2230 non-null   object
 7   cpj.org URL                 2550 non-null   object
dtypes: object(8)
memory usage: 159.5+ KB


323

In [6]:
# Parse and aggregate: derive the calendar year of each record, then count
# records per (Country, year) to build the target table.
# The explicit format matches dates such as "April 30, 2018"; with the default
# errors='raise', a string that does not match raises instead of being coerced.
cjp_df['year'] = pd.to_datetime(cjp_df['Date'], format='%B %d, %Y').dt.year

target_df = (
    cjp_df
    .groupby(['Country', 'year'])
    .size()
    .reset_index(name='journalist_killings')
)

# groupby() drops rows whose key is NaN by default, so verify that every
# source record was counted before the result is persisted.
assert target_df['journalist_killings'].sum() == len(cjp_df), (
    "records were dropped during aggregation (missing Country or Date?)"
)

PROJECT_ROOT = Path.cwd().parents[1]  # if running from notebooks/target-exploration
out_path = PROJECT_ROOT / "data" / "processed" / "target" / "target_journalist_killings.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)  # no-op when the directory already exists

# Write without the positional index so the CSV holds only the three data columns.
target_df.to_csv(out_path, index=False)

print(target_df.shape)
target_df.head()

(763, 3)


Unnamed: 0,Country,year,journalist_killings
0,Afghanistan,1994,1
1,Afghanistan,1998,1
2,Afghanistan,2001,9
3,Afghanistan,2006,3
4,Afghanistan,2007,3


In [7]:
# Span of years present in the aggregated table, shown as (earliest, latest).
year_col = target_df['year']
year_col.min(), year_col.max()

(1992, 2026)