# Reporters Without Borders – data collection

In [None]:
import re
import json
import pandas as pd
from datetime import date

In [None]:
# Load the reference list of countries.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding (which differs on Windows).
with open("countries.json", encoding="utf-8") as f:
    countries = json.load(f)

# Every "country_code" found in the reference list, as a set for fast
# membership tests when filtering the downloaded rows.
all_country_codes = {item["country_code"] for item in countries}

In [None]:
# One frame per successfully processed year, each reduced to the columns
# country_code, year and value.
dfs = []
# Removes everything except ASCII letters and digits from header names, so
# spacing, punctuation and stray bytes left by the latin-1 decode do not
# interfere with the column matching below.
clean_pattern = re.compile(r'[^a-zA-Z0-9]')

# range() excludes its stop value, so "+ 1" is required for the current year's
# file to be requested at all. If that file is not available, the download
# raises and the year is skipped by the handler at the bottom of the loop.
for year in range(2013, date.today().year + 1):
    try:
        url = f"https://rsf.org/sites/default/files/import_classement/{year}.csv"
        df_year = pd.read_csv(
            url,
            sep=';',
            encoding='latin-1',
            # For HTTP(S) URLs pandas forwards these pairs as request headers.
            storage_options={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
        )

        # Normalize the headers, then pick the first column whose name
        # mentions "iso" and the first "score" column that is neither a rank
        # nor a context sub-score.
        df_year.columns = [clean_pattern.sub('', c).lower() for c in df_year.columns]
        iso_col = next((c for c in df_year.columns if 'iso' in c), None)
        score_col = next((c for c in df_year.columns if 'score' in c and 'rank' not in c and 'context' not in c), None)

        if not (iso_col and score_col):
            print(f"Skipping {year}: Could not identify ISO or Score column. Found: {df_year.columns.tolist()}")
            continue

        df_year = df_year.rename(columns={iso_col: 'country_code', score_col: 'value'})
        # Scores may be written with a decimal comma; switch to a dot so the
        # text can be parsed as float.
        df_year['value'] = (
            df_year['value']
            .astype(str)
            .str.replace(',', '.', regex=False)
            .astype(float)
        )

        df_year['year'] = year
        dfs.append(df_year[['country_code', 'year', 'value']])

    except Exception as e:
        # Deliberately best-effort: a failed download or an unparsable file
        # only drops that year; the remaining years are still collected.
        print(f"Skipping {year} due to error: {e}")


# If every year was skipped there is nothing to combine. pd.concat would then
# raise a generic "No objects to concatenate" ValueError, so fail with a
# message that points at the real cause instead (same exception type).
if not dfs:
    raise ValueError(
        "No RSF data could be loaded for any year; "
        "see the 'Skipping' messages printed during the download."
    )

# Combine all yearly frames into a list of
# {"country_code": ..., "year": ..., "value": ...} records.
press_freedom = (
    pd.concat(dfs, ignore_index=True)
    # Rows without a country code or year cannot be attributed to anything.
    .dropna(subset=["country_code", "year"])
    # Turn remaining NaN (e.g. a missing score) into None so that it is
    # written as JSON null rather than the non-standard NaN token.
    .replace({float('nan'): None})
    # Keep only the codes present in the reference country list.
    .loc[lambda df: df["country_code"].isin(all_country_codes)]
    .sort_values(["country_code", "year"])
    .to_dict("records")
)

# press_freedom

In [None]:
# Persist the collected records as pretty-printed JSON (2-space indent).
with open("data/press_freedom.json", "w") as out_file:
    out_file.write(json.dumps(press_freedom, indent=2))