# 05 — Gun Homicide vs Population

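A scatter plot with a fitted trend line, exploring whether a country's population is correlated with its gun homicide rate (per 100K). Both axes are log-scaled, and the trend line is a linear regression on the log10-transformed values.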

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats
from pathlib import Path

# Load the merged per-country table from the processed-data folder.
DATA_DIR = Path('../data/processed')
merged_csv = DATA_DIR / 'merged_country_data.csv'
df = pd.read_csv(merged_csv)

# Keep only the rows that have a value for every column the plot needs.
# .copy() makes plot_df an independent frame, so adding columns to it
# leaves df untouched.
required_cols = ['population', 'gun_homicide_rate']
plot_df = df.dropna(subset=required_cols).copy()
print(f"Countries with both population and gun homicide data: {len(plot_df)}")

Countries with both population and gun homicide data: 161


## Scatter Plot — Gun Homicide Rate vs Population

In [2]:
# Fit a straight line in log10-log10 space (log population -> log gun homicide rate).
# Rates below 0.01 are raised to 0.01 before the transform so log10 never sees zero.
plot_df['log_homicide'] = np.log10(plot_df['gun_homicide_rate'].clip(lower=0.01))
plot_df['log_pop'] = np.log10(plot_df['population'])

fit = stats.linregress(plot_df['log_pop'], plot_df['log_homicide'])
slope, intercept = fit.slope, fit.intercept
r_value, p_value, std_err = fit.rvalue, fit.pvalue, fit.stderr
r_squared = r_value ** 2  # coefficient of determination

print("Linear regression (log population vs log gun homicide rate):")
print(f"  R² = {r_squared:.4f}")
print(f"  p-value = {p_value:.2e}")
print(f"  slope = {slope:.4f}")

Linear regression (log population vs log gun homicide rate):
  R² = 0.0000
  p-value = 9.60e-01
  slope = -0.0042


In [3]:
# Floor applied to the rate before it is placed on the log axis. It mirrors the
# clip(lower=0.01) used when the regression was fitted, so that the markers,
# the trend line and the annotations all describe the same set of points.
RATE_FLOOR = 0.01

# A log axis cannot draw zero (or negative) values, so plotting the raw rate
# would silently leave out any zero-rate country even though it took part in
# the regression. Plot the floored rate and keep the true rate for the hover.
scatter_df = plot_df.assign(
    rate_plotted=plot_df['gun_homicide_rate'].clip(lower=RATE_FLOOR)
)

# Countries singled out for a text label (matched on country_code)
notable = ['USA', 'BRA', 'IND', 'CHN', 'JPN', 'VEN', 'JAM', 'HND']
annotate_df = scatter_df[scatter_df['country_code'].isin(notable)]

fig = px.scatter(
    scatter_df,
    x='population',
    y='rate_plotted',
    color='region',
    hover_name='country_name',
    hover_data={
        'population': ':,.0f',
        'gun_homicide_rate': ':.2f',  # true, un-floored value
        'rate_plotted': False,        # hide the floored helper column
        'region': True,
    },
    log_x=True,
    log_y=True,
    title=f'Gun Homicide Rate vs Population (R²={r_squared:.3f}, p={p_value:.2e})',
    labels={
        'population': 'Population (log scale)',
        'rate_plotted': 'Gun Homicide Rate per 100K (log scale)',
        'gun_homicide_rate': 'Gun Homicide Rate per 100K',
        'region': 'Region',
    },
)

# Add trend line: evaluate the fit on an even grid in log10(population),
# then map both coordinates back to data units for the log-scaled axes.
x_range = np.linspace(plot_df['log_pop'].min(), plot_df['log_pop'].max(), 100)
y_trend = 10 ** (slope * x_range + intercept)
x_trend = 10 ** x_range
fig.add_trace(go.Scatter(
    x=x_trend, y=y_trend,
    mode='lines',
    name=f'Trend (R²={r_squared:.3f})',
    line=dict(color='red', dash='dash', width=2),
))

# Annotate notable countries. On log axes Plotly expects annotation
# coordinates as log10 of the data value, hence the explicit np.log10.
for row in annotate_df.itertuples(index=False):
    fig.add_annotation(
        x=np.log10(row.population),
        y=np.log10(row.rate_plotted),
        text=row.country_name,
        showarrow=True,
        arrowhead=2,
        arrowsize=1,
        arrowcolor='#666',
        font=dict(size=10),
        ax=20,
        ay=-20,
    )

fig.update_layout(template='plotly_white', height=600)
fig.show()