# 01 — Data Collection & Cleaning

Fetch gun homicide rates, population, Gini coefficients, drug offense rates, gun ownership rates, and gun control strictness scores.
Merge all datasets on ISO alpha-3 country codes, attach a region label to each country, and output a combined CSV.

In [1]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
from pathlib import Path
from data_utils import (
    fetch_population,
    fetch_gini,
    get_gun_homicide_rates,
    get_drug_offense_rates,
    get_gun_ownership_rates,
    get_gun_control_strictness,
    get_country_regions,
)

# Directory that receives the merged CSV written at the end of this notebook
# (relative path, resolved against the notebook's working directory).
OUTPUT_DIR = Path('../data/processed')
# Create the directory and any missing parents; no error if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

## Fetch Individual Datasets

In [2]:
# Gun homicide rate per country (UNODC) — this frame is the base of the merge below
gun_df = get_gun_homicide_rates()
print(f"Gun homicide data: {gun_df.shape[0]} countries")
gun_df.head(5)

Gun homicide data: 161 countries


Unnamed: 0,country_code,country_name,gun_homicide_rate
0,AFG,Afghanistan,4.7
1,ALB,Albania,1.3
2,DZA,Algeria,0.5
3,AGO,Angola,4.8
4,ARG,Argentina,2.2


In [3]:
# Population per country for 2022 (World Bank)
pop_df = fetch_population(year='2022')
print(f"Population data: {pop_df.shape[0]} countries")
pop_df.head(5)

Using embedded population fallback data
Population data: 169 countries


Unnamed: 0,country_code,country_name,population
0,AFG,Afghanistan,41128771
1,ALB,Albania,2842321
2,DZA,Algeria,44903225
3,AGO,Angola,34503774
4,ARG,Argentina,46234830


In [4]:
# Gini coefficient per country, requested for the 2018-2022 window (World Bank)
gini_df = fetch_gini(year_range='2018:2022')
print(f"Gini data: {gini_df.shape[0]} countries")
gini_df.head(5)

Using embedded Gini fallback data
Gini data: 152 countries


Unnamed: 0,country_code,country_name,gini
0,ALB,Albania,30.0
1,DZA,Algeria,27.6
2,AGO,Angola,51.3
3,ARG,Argentina,42.3
4,ARM,Armenia,29.9


In [5]:
# Drug offense rate per country (UNODC)
drug_df = get_drug_offense_rates()
print(f"Drug offense data: {drug_df.shape[0]} countries")
drug_df.head(5)

Drug offense data: 121 countries


Unnamed: 0,country_code,country_name,drug_offense_rate
0,ALB,Albania,45.0
1,DZA,Algeria,85.0
2,ARG,Argentina,102.0
3,ARM,Armenia,38.0
4,AUS,Australia,605.0


In [6]:
# Civilian gun ownership per country, as guns_per_100 (Small Arms Survey 2017)
ownership_df = get_gun_ownership_rates()
print(f"Gun ownership data: {ownership_df.shape[0]} countries")
ownership_df.head(5)

Gun ownership data: 162 countries


Unnamed: 0,country_code,country_name,guns_per_100
0,AFG,Afghanistan,12.5
1,ALB,Albania,8.6
2,DZA,Algeria,7.1
3,AGO,Angola,3.0
4,ARG,Argentina,10.2


In [7]:
# Gun control strictness per country (custom ordinal scale, 1-5)
control_df = get_gun_control_strictness()
print(f"Gun control strictness data: {control_df.shape[0]} countries")
control_df.head(5)

Gun control strictness data: 162 countries


Unnamed: 0,country_code,country_name,gun_control_strictness
0,AFG,Afghanistan,1
1,ALB,Albania,3
2,DZA,Algeria,4
3,AGO,Angola,3
4,ARG,Argentina,3


## Merge All Datasets

In [8]:
# Build the combined table. Gun homicide is the base: every other metric is
# left-joined onto it, so the result keeps exactly the countries that have a
# gun homicide rate and leaves NaN where a metric is missing.
merged = gun_df.copy()

# (source frame, metric column) pairs, merged in this order so the column
# order of the output stays stable.
metric_sources = [
    (pop_df, 'population'),
    (gini_df, 'gini'),
    (drug_df, 'drug_offense_rate'),
    (ownership_df, 'guns_per_100'),
    (control_df, 'gun_control_strictness'),
]
for source_df, metric_col in metric_sources:
    # Select only the key and the metric so country_name is not duplicated.
    merged = merged.merge(
        source_df[['country_code', metric_col]],
        on='country_code', how='left'
    )

# Add region mapping; countries missing from the mapping are labelled 'Other'.
# NOTE(review): the recorded output of this cell lists AFG (Afghanistan) under
# 'Sub-Saharan Africa' — the mapping returned by get_country_regions() looks
# wrong for that row; verify upstream.
regions = get_country_regions()
merged = merged.merge(regions, on='country_code', how='left')
merged['region'] = merged['region'].fillna('Other')

# A left join only adds rows when a right-hand frame repeats a country_code.
# Fail loudly here rather than silently double-counting countries downstream.
assert len(merged) == len(gun_df), (
    f"Merge changed the row count from {len(gun_df)} to {len(merged)}; "
    "check the source frames for duplicate country_code values"
)

print(f"Merged dataset: {len(merged)} countries")
merged.head(10)

Merged dataset: 161 countries


Unnamed: 0,country_code,country_name,gun_homicide_rate,population,gini,drug_offense_rate,guns_per_100,gun_control_strictness,region
0,AFG,Afghanistan,4.7,41128771,,,12.5,1.0,Sub-Saharan Africa
1,ALB,Albania,1.3,2842321,30.0,45.0,8.6,3.0,Eastern Europe
2,DZA,Algeria,0.5,44903225,27.6,85.0,7.1,4.0,Middle East & N. Africa
3,AGO,Angola,4.8,34503774,51.3,,3.0,3.0,Sub-Saharan Africa
4,ARG,Argentina,2.2,46234830,42.3,102.0,10.2,3.0,South America
5,ARM,Armenia,0.4,2780469,29.9,38.0,4.4,3.0,Eastern Europe
6,AUS,Australia,0.15,25978935,34.3,605.0,13.7,4.0,Oceania
7,AUT,Austria,0.15,9041851,30.5,340.0,30.0,3.0,Western Europe
8,AZE,Azerbaijan,0.6,10093121,26.6,42.0,3.6,4.0,Eastern Europe
9,BHS,Bahamas,24.5,409984,,310.0,5.3,3.0,Central America & Caribbean


## Data Coverage Summary

In [9]:
# Metric column -> display label, in the order the rows should appear.
metric_labels = {
    'gun_homicide_rate': 'Gun Homicide Rate',
    'population': 'Population',
    'gini': 'Gini Coefficient',
    'drug_offense_rate': 'Drug Offense Rate',
    'guns_per_100': 'Gun Ownership',
    'gun_control_strictness': 'Gun Control Strictness',
}

# Boolean mask: True where a country has a value for the metric.
has_value = merged[list(metric_labels)].notna()

# Per-metric count of countries with a non-missing value.
coverage = pd.DataFrame({
    'Metric': list(metric_labels.values()),
    'Countries with data': has_value.sum().to_numpy(),
})

# Countries for which none of the six metrics is missing.
all_metrics = has_value.all(axis=1).sum()
print(f"Countries with ALL six metrics: {all_metrics}")
print()
coverage

Countries with ALL six metrics: 118



Unnamed: 0,Metric,Countries with data
0,Gun Homicide Rate,161
1,Population,161
2,Gini Coefficient,147
3,Drug Offense Rate,121
4,Gun Ownership,160
5,Gun Control Strictness,160


In [10]:
# NOTE(review): consider moving this import to the first cell with the others.
import plotly.graph_objects as go

# One colour per metric, in the same order as the rows of `coverage`.
bar_colors = ['#e74c3c', '#3498db', '#2ecc71', '#f39c12', '#9b59b6', '#1abc9c']

# Bar chart of how many countries have data for each metric.
coverage_bars = go.Bar(
    x=coverage['Metric'],
    y=coverage['Countries with data'],
    marker_color=bar_colors,
)
fig = go.Figure(data=[coverage_bars])
fig.update_layout(
    title='Data Coverage by Metric',
    yaxis_title='Number of Countries',
    template='plotly_white',
    height=400,
)
fig.show()

## Save Merged Dataset

In [11]:
# Write the merged table to the processed-data directory, leaving out the
# positional index so the CSV holds only the data columns.
out_path = OUTPUT_DIR.joinpath('merged_country_data.csv')
merged.to_csv(out_path, index=False)

print(f"Saved merged data to {out_path}")
print(f"Shape: {merged.shape}")

# Summary statistics of the numeric columns as a final sanity check.
merged.describe()

Saved merged data to ../data/processed/merged_country_data.csv
Shape: (161, 9)


Unnamed: 0,gun_homicide_rate,population,gini,drug_offense_rate,guns_per_100,gun_control_strictness
count,161.0,161.0,147.0,121.0,160.0,160.0
mean,3.423416,48702900.0,38.012925,131.958678,9.805,3.2125
std,5.742088,162016800.0,8.084056,128.53705,13.089344,0.788179
min,0.0,281200.0,24.6,8.0,0.2,1.0
25%,0.35,4614974.0,32.1,42.0,2.0,3.0
50%,1.4,11228820.0,36.7,85.0,5.4,3.0
75%,3.5,36408820.0,42.9,175.0,12.35,4.0
max,33.3,1417173000.0,63.0,605.0,120.5,5.0
