# Part 1: Estimating the Total Number of English Words Ever Written by Humans
$$w(t_a) = \int_{t_0}^{t_a} W(t)\,dt$$
where:

$t_0$: The start date (in our case 1500)

$t_a$: The end date of interest

$$W(t)=H(t)*E(t)*G(t)$$

where:

$H(t)$: The number of humans

$E(t)$: The fraction of humans who are literate English speakers

$G(t)$: The number of words generated by the average literate English speaker

## Number of Humans
Thankfully, the human population has been tracked very closely in the past, and very smart people are projecting its growth into the future. We will take advantage of the available data and use it to interpolate a function over our date range of interest.

In [49]:
import pandas as pd
import numpy as np
from scipy.interpolate import CubicSpline
from utils import plot_against_data

# Load the population table and keep only the 'Year' and 'Population' columns.
population_df = pd.read_csv('../public/population.csv')[['Year', 'Population']]
# Display the last rows as a quick check of what was loaded.
population_df.tail()

Unnamed: 0,Year,Population
172,2096,10225850874
173,2097,10215549310
174,2098,10204489862
175,2099,10192689066
176,2100,10180160751


In [50]:
# Split the table into plain arrays: years (x) and population counts (y).
population_data_x = population_df['Year'].values
population_data_y = population_df['Population'].values

# Cubic-spline interpolant through the population data; called as P(year).
# NOTE(review): CubicSpline extrapolates outside the data range by default —
# confirm population.csv covers every year in total_years (1500 onward).
P = CubicSpline(population_data_x, population_data_y)

# Dense 500-point grid over the data's year range and the spline evaluated on it.
# NOTE(review): x_smooth / y_smooth are not referenced by any visible cell —
# confirm whether they are still needed.
x_smooth = np.linspace(population_data_x.min(), population_data_x.max(), 500)
y_smooth = P(x_smooth)

# Years at which the models are evaluated: 1500 through 2049 (stop is exclusive).
total_years = np.arange(1500,2050)

# Create the figure
# NOTE(review): this call passes one more positional label than the other
# plot_against_data calls in this notebook, and uses 'Cumulative Population'
# although P interpolates the 'Population' column itself — confirm against the
# helper's signature.
fig = plot_against_data(P, total_years, population_data_x, population_data_y, 'Population', 'Cumulative Population', 'Cumulative Population')

fig.show()

## English-Speaking Fraction

In [60]:
# Years and the corresponding total number of English speakers.
# NOTE(review): the source of these figures is not recorded here — add provenance.
english_speakers_data_x = np.array([1950, 2019, 2020, 2021, 2022, 2023, 2025])
english_speakers_total = np.array([249000000, 1130000000, 1270000000, 1350000000, 1450000000, 1460000000, 1528000000])

# Convert totals into a fraction of the interpolated world population P.
english_speakers_data_y = english_speakers_total / P(english_speakers_data_x)

# Print the fractions computed above instead of re-evaluating P point by point.
# Iterating the array directly also avoids binding `_`, which IPython uses for
# the most recent cell output.
for fraction in english_speakers_data_y:
    print(fraction)
def E(t, E_max=0.33, k_E=0.014, t_E=2016):
    """Logistic model of the literate English-speaking fraction of humanity.

    Args:
        t: Year, or NumPy array of years, at which to evaluate the curve.
        E_max: Upper asymptote of the fraction.
        k_E: Growth rate of the logistic curve.
        t_E: Midpoint year, at which the curve equals ``E_max / 2``.

    Returns:
        The modelled fraction at ``t`` (a scalar for scalar ``t``, otherwise an
        array matching ``t``).
    """
    years_from_midpoint = t - t_E
    decay = np.exp(-k_E * years_from_midpoint)
    return E_max / (1 + decay)

# Create plot
# Pass the model E, the evaluation years and the data-derived fractions to the
# plotting helper, then display the returned figure.
fig = plot_against_data(E, total_years, english_speakers_data_x, english_speakers_data_y, 'Literate English Speaking Fraction', 'Fraction of World Population')
fig.show()

0.09987594521155573
0.14466233779069992
0.16102444506610503
0.1697163580053262
0.18076628911385684
0.18043102332483213
0.18562582898469498


In [47]:
def G(t, G_max=1600, k_G=0.06, t_G=1985, ai_start_year=2023, ai_dampening_rate=0.005, G_base=15):
    """Model the number of words generated per literate person at year(s) ``t``.

    The curve is a logistic term plus a constant baseline, multiplied by a
    dampening factor that compounds once per year after ``ai_start_year``.

    Args:
        t: Year, or sequence/array of years, to evaluate.
        G_max: Height of the logistic term (its upper asymptote).
        k_G: Growth rate of the logistic term.
        t_G: Midpoint year of the logistic term.
        ai_start_year: Last year without dampening; later years are dampened.
        ai_dampening_rate: Fractional reduction applied per year after
            ``ai_start_year``.
        G_base: Constant added to the logistic term before dampening
            (previously hard-coded as 15).

    Returns:
        The modelled word count at ``t`` (a scalar for scalar ``t``, otherwise a
        NumPy array matching ``t``).
    """
    # asarray accepts scalars, lists and arrays and does not copy existing arrays.
    t = np.asarray(t)
    human_words = G_max / (1 + np.exp(-k_G * (t - t_G))) + G_base

    # Exponent is 0 up to and including ai_start_year, so the factor is 1 there.
    years_since_ai = np.maximum(0, t - ai_start_year)
    dampening_factor = (1 - ai_dampening_rate) ** years_since_ai

    return human_words * dampening_factor

# Reference points: years (x) and words per literate person (y).
# NOTE(review): the source of these values is not recorded here — add provenance.
human_words_data_x = [1700, 1840, 1900, 1999, 2005, 2025]
human_words_data_y=[15, 25, 40, 1100, 1400, 1500]

# Pass the model G, the evaluation years and the reference points to the
# plotting helper, then display the returned figure.
fig = plot_against_data(G, total_years, human_words_data_x, human_words_data_y, 'Words per Literate Person', '# Words')
fig.show()

In [45]:
def W(t):
    """Return the modelled number of English words generated at year(s) ``t``.

    This is the product of the interpolated population ``P``, the literate
    English-speaking fraction ``E`` and the words per literate person ``G``,
    each evaluated at ``t``.
    """
    population = P(t)
    english_fraction = E(t)
    words_per_person = G(t)
    return population * english_fraction * words_per_person

# Pass the combined model W and a single reference point (2025, 2.2e12) to the
# plotting helper, then display the returned figure.
# NOTE(review): the origin of the 2.2e12 reference value is not shown — confirm.
fig = plot_against_data(W, total_years, [2025], [2.2e12], "Total English Words Generated", "# English Words Generated")
fig.show()