# Joshua Project Dataset -- Exploration Notebook

**16,382 people groups, 238 countries, 7,134 languages, 38 summary stats**

Author: [Luke Steuber](https://lukesteuber.com) | Bluesky: [@lukesteuber.com](https://bsky.app/profile/lukesteuber.com)

Dataset: [lukeslp/joshua-project-data](https://huggingface.co/datasets/lukeslp/joshua-project-data)

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

# Plot styling. The seaborn-derived styles were renamed with a 'seaborn-v0_8-'
# prefix in matplotlib 3.6, so use whichever name this installation provides
# and fall back to the default style if neither is available.
_style_candidates = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
plt.style.use(next((s for s in _style_candidates if s in plt.style.available), 'default'))
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11


def _load_json(path):
    """Read one JSON file from `path` and return the parsed Python object.

    The file is decoded explicitly as UTF-8. Without an `encoding` argument
    `open()` uses the platform's locale encoding, which is not UTF-8 on every
    system and can garble or reject non-ASCII text.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)


# Load all 4 datasets
peoples = _load_json('joshua_project_full_dump.json')
countries = _load_json('joshua_project_countries.json')
languages = _load_json('joshua_project_languages.json')
totals = _load_json('joshua_project_totals.json')

# One DataFrame per dataset; every later cell works from these frames.
df_peoples = pd.DataFrame(peoples)
df_countries = pd.DataFrame(countries)
df_languages = pd.DataFrame(languages)
df_totals = pd.DataFrame(totals)

# Quick sanity check of what was loaded: row counts plus the first 15 columns
# of the people-groups frame.
print(f"People groups: {len(df_peoples):,}")
print(f"Countries:     {len(df_countries):,}")
print(f"Languages:     {len(df_languages):,}")
print(f"Summary stats: {len(df_totals):,}")
print(f"\nPeople group columns: {list(df_peoples.columns)[:15]}...")

## Religion Distribution Across Countries

What is the primary religion breakdown across all 238 countries?

In [None]:
# Per-country percentage columns, one per religion category
religion_cols = [
    'PercentBuddhism', 'PercentChristianity', 'PercentEthnicReligions',
    'PercentHinduism', 'PercentIslam', 'PercentNonReligious',
    'PercentOtherSmall', 'PercentUnknown',
]
colors_pie = ['#2196F3', '#4CAF50', '#FF9800', '#E91E63', '#9C27B0', '#00BCD4', '#795548', '#607D8B']

# How many countries list each religion as their primary one
primary_religion = df_countries['ReligionPrimary'].value_counts()

# Coerce the percentage columns to numbers (unparseable entries become NaN)
# and take the plain, unweighted mean of each column across countries.
df_countries[religion_cols] = df_countries[religion_cols].apply(pd.to_numeric, errors='coerce')
rel_means = df_countries[religion_cols].mean().to_numpy()
rel_labels = [name.replace('Percent', '') for name in religion_cols]

fig, axes = plt.subplots(1, 2, figsize=(16, 7))
ax_pie, ax_bar = axes

# Left panel: share of countries by primary religion
ax_pie.pie(
    primary_religion.values,
    labels=primary_religion.index,
    autopct='%1.1f%%',
    colors=colors_pie[:len(primary_religion)],
    startangle=90,
)
ax_pie.set_title('Primary Religion by Country Count')

# Right panel: mean percentage per religion, with the value written next to each bar
bars = ax_bar.barh(rel_labels, rel_means, color=colors_pie[:len(rel_labels)])
ax_bar.set_xlabel('Average Percentage Across Countries')
ax_bar.set_title('Average Religion Percentage Per Country')
for rect, mean_pct in zip(bars, rel_means):
    label_x = rect.get_width() + 0.3
    label_y = rect.get_y() + rect.get_height()/2
    ax_bar.text(label_x, label_y, f'{mean_pct:.1f}%', va='center', fontsize=9)

plt.tight_layout()
plt.show()

## People Groups by Region

How are the 16,382 people groups distributed across continents and regions?

In [None]:
# Row counts per continent and for the ten most frequent regions
continent_counts = df_peoples['Continent'].value_counts()
region_counts = df_peoples['RegionName'].value_counts().head(10)

fig, axes = plt.subplots(1, 2, figsize=(16, 7))
ax_continent, ax_region = axes

# Left panel: people groups per continent
continent_rows = range(len(continent_counts))
ax_continent.barh(continent_rows, continent_counts.values, color='#3F51B5')
ax_continent.set_yticks(continent_rows)
ax_continent.set_yticklabels(continent_counts.index)
ax_continent.set_xlabel('Number of People Groups')
ax_continent.set_title('People Groups by Continent')
for row, count in zip(continent_rows, continent_counts.values):
    # Count label just past the end of the bar
    ax_continent.text(count + 50, row, f'{count:,}', va='center', fontsize=9)

# Right panel: top 10 regions; the y-axis is flipped for this panel only
region_rows = range(len(region_counts))
ax_region.barh(region_rows, region_counts.values, color='#00BCD4')
ax_region.set_yticks(region_rows)
ax_region.set_yticklabels(region_counts.index)
ax_region.set_xlabel('Number of People Groups')
ax_region.set_title('Top 10 Regions by People Group Count')
ax_region.invert_yaxis()
for row, count in zip(region_rows, region_counts.values):
    ax_region.text(count + 20, row, f'{count:,}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

## Bible Translation Status by Language

What does the Bible translation status look like across languages?

In [None]:
# Bible status distribution
# Status codes are coerced to numbers; anything unparseable becomes NaN and is
# left out of the counts below.
df_languages['BibleStatus'] = pd.to_numeric(df_languages['BibleStatus'], errors='coerce')
bible_status_labels = {
    0: 'Unspecified',
    1: 'Translation Needed',
    2: 'Translation Started',
    3: 'Portions Available',
    4: 'New Testament',
    5: 'Complete Bible'
}
# Colours are keyed by status code rather than by bar position, so a status
# that is absent from the data cannot shift the colours of the others.
bible_status_colors = {
    0: '#757575',
    1: '#F44336',
    2: '#FF9800',
    3: '#FFC107',
    4: '#8BC34A',
    5: '#4CAF50'
}
bible_counts = df_languages['BibleStatus'].value_counts().sort_index()
status_codes = [k for k in bible_counts.index if not pd.isna(k)]
labels = [bible_status_labels.get(int(k), f'Status {k}') for k in status_codes]
values = [bible_counts[k] for k in status_codes]
# Codes outside the known 0-5 range get a neutral grey.
colors_bible = [bible_status_colors.get(int(k), '#9E9E9E') for k in status_codes]

fig, axes = plt.subplots(1, 2, figsize=(16, 7))

axes[0].bar(range(len(labels)), values, color=colors_bible)
axes[0].set_xticks(range(len(labels)))
axes[0].set_xticklabels(labels, rotation=30, ha='right')
axes[0].set_ylabel('Number of Languages')
axes[0].set_title('Bible Translation Status Across 7,134 Languages')
for i, v in enumerate(values):
    axes[0].text(i, v + 30, f'{v:,}', ha='center', fontsize=9)

# Languages with Jesus Film
# value_counts() orders its result by frequency, so the wedge sizes are looked
# up by key ('Y' then 'N') to keep them in step with the labels and colours.
# Rows whose HasJesusFilm is neither 'Y' nor 'N' are not part of this pie.
jf = df_languages['HasJesusFilm'].value_counts()
film_yes = int(jf.get('Y', 0))
film_no = int(jf.get('N', 0))
if film_yes + film_no > 0:
    axes[1].pie([film_yes, film_no],
                labels=[f'Has Film ({film_yes:,})', f'No Film ({film_no:,})'],
                autopct='%1.1f%%', colors=['#4CAF50', '#F44336'], startangle=90)
else:
    # A pie cannot be drawn from all-zero sizes; say so instead of failing.
    axes[1].text(0.5, 0.5, "No 'Y'/'N' values found in HasJesusFilm",
                 ha='center', va='center', transform=axes[1].transAxes)
axes[1].set_title('Languages with Jesus Film Available')

plt.tight_layout()
plt.show()

## Population of People Groups

Which people groups are the largest, and what is the combined population across all groups?

In [None]:
# Population must be numeric before ranking or summing; bad values become NaN
df_peoples['Population'] = pd.to_numeric(df_peoples['Population'], errors='coerce')

# The 20 rows with the highest population, trimmed to the columns used here
top_columns = ['PeopNameInCountry', 'ROG3', 'Population', 'Continent']
top_pop = df_peoples.nlargest(20, 'Population')[top_columns]
top_values = top_pop['Population'].values
row_positions = range(len(top_pop))

# Tick labels of the form "<people name> (<ROG3 code>)"
labels = [
    f"{name} ({rog3})"
    for name, rog3 in zip(top_pop['PeopNameInCountry'], top_pop['ROG3'])
]


def _as_millions(x, _):
    """Axis tick formatter: render a raw count as whole millions."""
    return f'{x/1e6:.0f}M'


fig, ax = plt.subplots(figsize=(14, 8))
bars = ax.barh(row_positions, top_values, color='#E91E63')
ax.set_yticks(row_positions)
ax.set_yticklabels(labels)
ax.set_xlabel('Population')
ax.set_title('Top 20 Largest People Groups')
ax.invert_yaxis()
ax.xaxis.set_major_formatter(ticker.FuncFormatter(_as_millions))

# Write each value (in millions, one decimal) just beyond the end of its bar
for rect, population in zip(bars, top_values):
    ax.text(rect.get_width() + 1e6, rect.get_y() + rect.get_height()/2,
            f'{population/1e6:.1f}M', va='center', fontsize=9)

plt.tight_layout()
plt.show()

# Sum over every row of the people-groups frame; reused by the unreached cell
total_pop = df_peoples['Population'].sum()
print(f"Total population across all people groups: {total_pop/1e9:.2f} billion")

## Unreached People Groups

The Joshua Project tracks which people groups are considered 'unreached' -- less than 2% evangelical Christian and less than 5% Christian adherent.

In [None]:
# Split the people groups on the LeastReached flag ('Y' vs 'N').
# Rows holding any other value fall into neither subset.
least_reached_flag = df_peoples['LeastReached']
unreached = df_peoples[least_reached_flag == 'Y']
reached = df_peoples[least_reached_flag == 'N']
n_unreached = len(unreached)
n_reached = len(reached)

fig, axes = plt.subplots(1, 2, figsize=(16, 7))
ax_ratio, ax_continent = axes

# Left panel: overall split, with the unreached wedge pulled out slightly
sizes = [n_unreached, n_reached]
ax_ratio.pie(
    sizes,
    labels=[f'Unreached ({n_unreached:,})', f'Reached ({n_reached:,})'],
    autopct='%1.1f%%',
    colors=['#F44336', '#4CAF50'],
    startangle=90,
    explode=(0.05, 0),
)
ax_ratio.set_title('People Groups -- Reached vs Unreached')

# Right panel: unreached groups counted per continent
unreached_by_continent = unreached['Continent'].value_counts()
continent_rows = range(len(unreached_by_continent))
ax_continent.barh(continent_rows, unreached_by_continent.values, color='#F44336')
ax_continent.set_yticks(continent_rows)
ax_continent.set_yticklabels(unreached_by_continent.index)
ax_continent.set_xlabel('Number of Unreached People Groups')
ax_continent.set_title('Unreached People Groups by Continent')
for row, count in zip(continent_rows, unreached_by_continent.values):
    ax_continent.text(count + 20, row, f'{count:,}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

# Relies on `total_pop` and the numeric Population column from the population cell
unreached_pop = unreached['Population'].sum()
print(f"Unreached population: {unreached_pop/1e9:.2f} billion ({100*unreached_pop/total_pop:.1f}% of total)")

## Affinity Blocs and People Clusters

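People groups are organized into affinity blocs -- broad cultural groupings. The chart below counts people groups per bloc, and the cell also reports how many distinct people clusters appear in the data.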

In [None]:
# People groups per affinity bloc (value_counts puts the most frequent bloc first)
bloc_counts = df_peoples['AffinityBloc'].value_counts()
n_blocs = len(bloc_counts)
bloc_rows = range(n_blocs)

fig, ax = plt.subplots(figsize=(14, 7))
# Sample the Set2 colormap at evenly spaced points, one colour per bloc
colors_bloc = plt.cm.Set2(np.linspace(0, 1, n_blocs))
ax.barh(bloc_rows, bloc_counts.values, color=colors_bloc)
ax.set_yticks(bloc_rows)
ax.set_yticklabels(bloc_counts.index)
ax.set_xlabel('Number of People Groups')
ax.set_title('People Groups by Affinity Bloc')
ax.invert_yaxis()

# Count label just past the end of every bar
for row, count in zip(bloc_rows, bloc_counts.values):
    ax.text(count + 20, row, f'{count:,}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

n_clusters = df_peoples['PeopleCluster'].nunique()
print(f"\nAffinity blocs: {n_blocs}")
print(f"People clusters: {n_clusters}")

## Joshua Project Scale

The JP Scale rates progress on a 1-5 scale for each people group.

In [None]:
# Coerce the scale to numbers (bad values become NaN) and count groups per level,
# ordered by level rather than by frequency
df_peoples['JPScale'] = pd.to_numeric(df_peoples['JPScale'], errors='coerce')
jp_scale = df_peoples['JPScale'].value_counts().sort_index()

# Display name for each scale level
scale_labels = {
    1: '1 -- Unreached',
    2: '2 -- Minimally Reached',
    3: '3 -- Superficially Reached',
    4: '4 -- Partially Reached',
    5: '5 -- Significantly Reached'
}
scale_colors = ['#B71C1C', '#E53935', '#FF9800', '#8BC34A', '#2E7D32']

# Levels without a display name fall back to their plain string form
tick_names = [scale_labels.get(int(level), str(level)) for level in jp_scale.index]
group_counts = jp_scale.values

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(tick_names, group_counts, color=scale_colors[:len(jp_scale)])
ax.set_ylabel('Number of People Groups')
ax.set_title('Joshua Project Scale Distribution')
plt.xticks(rotation=15, ha='right')

# Centre each count above its bar
for rect, count in zip(bars, group_counts):
    label_x = rect.get_x() + rect.get_width()/2
    label_y = rect.get_height() + 50
    ax.text(label_x, label_y, f'{count:,}', ha='center', fontsize=10)

plt.tight_layout()
plt.show()

## Summary Statistics

In [None]:
# Final roll-up of the headline figures. Relies on `unreached` from the
# unreached-analysis cell and on the numeric Population column.
banner = "=" * 60
total_population_b = df_peoples['Population'].sum()/1e9
unreached_population_b = unreached['Population'].sum()/1e9

summary_lines = [
    banner,
    "JOSHUA PROJECT DATASET -- SUMMARY",
    banner,
    f"People groups:            {len(df_peoples):>10,}",
    f"Countries:                {len(df_countries):>10,}",
    f"Languages:                {len(df_languages):>10,}",
    f"Continents:               {df_peoples['Continent'].nunique():>10}",
    f"Affinity blocs:           {df_peoples['AffinityBloc'].nunique():>10}",
    f"People clusters:          {df_peoples['PeopleCluster'].nunique():>10}",
    f"Total population:         {total_population_b:>10.2f}B",
    f"Unreached groups:         {len(unreached):>10,}",
    f"Unreached population:     {unreached_population_b:>10.2f}B",
    "",
    "Global summary totals:",
]
for line in summary_lines:
    print(line)

# One "id: Value" line per row of the totals frame
for _, total_row in df_totals.iterrows():
    print(f"  {total_row['id']}: {total_row['Value']}")
print(banner)