# Part 1: Estimating the Total Number of English Words Ever Written by Humans
$$w(t_a) = \int_{t_0}^{t_a} W(t)\,dt$$
where:

$t_0$: The start date (in our case 1500)

$t_a$: The end date of interest

$$W(t) = H(t) \cdot E(t) \cdot G(t)$$

where:

$H(t)$: The number of humans (implemented in the code below as `P`)

$E(t)$: The fraction of humans who are literate English speakers

$G(t)$: The number of words generated by the average literate English speaker

## Number of Humans
Thankfully, the human population has been closely tracked in the past, and experts publish projections of its growth into the future. We take advantage of the available data and interpolate it to obtain a function over our date range of interest.

In [101]:
import pandas as pd
import numpy as np
from scipy.interpolate import CubicSpline
import plotly.graph_objects as go

# Read the population table and keep only the year and head-count columns.
population_df = pd.read_csv('../data/population.csv').loc[:, ['Year', 'Population']]
# NOTE(review): this bare expression is followed by more code in the same cell,
# so the notebook does not render it; move it to the end of a cell to see it.
population_df.tail()

def plot_against_data(function, x_range, xdata, ydata, title, yaxis_title, xaxis_title="t"):
    """Overlay a modelled curve on the observations it is compared against.

    Parameters
    ----------
    function : callable
        Model to draw; it is evaluated once as ``function(x_range)``.
    x_range : sequence
        x positions at which the model curve is drawn.
    xdata, ydata : sequence
        Observed points, drawn as markers.
    title : str
        Figure title.
    yaxis_title : str
        Caption of the y-axis.
    xaxis_title : str, optional
        Caption of the x-axis, ``"t"`` unless overridden.

    Returns
    -------
    plotly.graph_objects.Figure
        The configured figure. It is returned, not shown.
    """
    stroke = dict(width=2)

    observed = go.Scatter(
        x=xdata, y=ydata, mode='markers', name='Data Points', line=stroke,
    )
    modelled = go.Scatter(
        x=x_range, y=function(x_range), mode='lines', name='Modelled Function', line=stroke,
    )

    fig = go.Figure()
    for trace in (observed, modelled):
        fig.add_trace(trace)

    # Legend pinned to the top-left corner of the plotting area.
    legend_position = dict(yanchor="top", y=0.99, xanchor="left", x=0.01)
    fig.update_layout(
        title=title,
        xaxis_title=xaxis_title,
        yaxis_title=yaxis_title,
        legend=legend_position,
        hovermode="x unified",
        template="plotly_white",  # another template such as "plotly_dark" also works
        showlegend=True,
    )
    return fig

In [102]:
# Observed (year, population) pairs from the loaded table.
population_data_x = population_df['Year'].values
population_data_y = population_df['Population'].values

# Cubic spline through the observations; P(t) is the modelled population in year t.
# NOTE(review): CubicSpline needs strictly increasing x and, by default,
# extrapolates outside the data range - confirm the CSV is sorted by year and
# covers the years plotted below.
P = CubicSpline(population_data_x, population_data_y)

# Dense sampling of the spline across the observed years only.
x_smooth = np.linspace(population_data_x.min(), population_data_x.max(), 500)
y_smooth = P(x_smooth)

# Years 1500 through 2100 inclusive, the same range the later cells use.
total_years = np.arange(1500, 2101)

# The x-axis is the calendar year and the y-axis the head count; the series is
# the population itself, not a cumulative sum.
fig = plot_against_data(P, total_years, population_data_x, population_data_y, 'Population', 'Population', 'Year')

fig.show()

## English Speaking Fraction

In [107]:
# Years for which an English-speaker head count is recorded.
english_speakers_data_x = np.array(
    [1950, 2019, 2020, 2021, 2022, 2023, 2025]
)
# Head counts, one per year above (same order).
english_speakers_total = np.array([
    249000000,
    1130000000,
    1270000000,
    1350000000,
    1450000000,
    1460000000,
    1528000000,
])
# Convert head counts to a share of the modelled population in each year.
english_speakers_data_y = np.divide(english_speakers_total, P(english_speakers_data_x))
def E(t, E_max=0.33, k_E=0.014, t_E=2016):
    """Logistic model of the literate English-speaking fraction at time ``t``.

    Parameters
    ----------
    t : scalar or numpy array
        Year(s) at which to evaluate the curve.
    E_max : float
        Upper asymptote of the curve.
    k_E : float
        Steepness of the logistic.
    t_E : float
        Midpoint year, where the curve equals ``E_max / 2``.

    Returns
    -------
    Same shape as ``t``: ``E_max / (1 + exp(-k_E * (t - t_E)))``.
    """
    decay = np.exp(-k_E * (t - t_E))
    return E_max / (1 + decay)

# Years 1500 through 2100 inclusive.
total_years = np.arange(1500, 2101)
fractions = E(total_years)

# Draw the logistic model against the observed speaker fractions.
fig = plot_against_data(
    function=E,
    x_range=total_years,
    xdata=english_speakers_data_x,
    ydata=english_speakers_data_y,
    title='Literate English Speaking Fraction',
    yaxis_title='Fraction of World Population',
)
fig.show()

In [119]:
def G(t, G_max=1500, k_G=0.06, t_G=1980, G_min=15):
    """Logistic model of the words generated per literate person at time ``t``.

    Parameters
    ----------
    t : scalar, sequence or numpy array
        Year(s) at which to evaluate the curve; plain lists are accepted.
    G_max : float
        Height of the logistic rise above the baseline.
    k_G : float
        Steepness of the logistic.
    t_G : float
        Midpoint year, where the curve equals ``G_max / 2 + G_min``.
    G_min : float
        Constant baseline added to the logistic (previously a hard-coded 15).

    Returns
    -------
    ``G_max / (1 + exp(-k_G * (t - t_G))) + G_min``, a scalar for scalar ``t``
    and a numpy array otherwise.
    """
    # asarray lets callers pass Python lists as well as scalars and arrays.
    years = np.asarray(t)
    return G_max / (1 + np.exp(-k_G * (years - t_G))) + G_min

# Reference points (year, words per literate person) the model is compared with.
human_words_data_x = [
    1700, 1840, 1900, 1999, 2005, 2025,
]
human_words_data_y = [
    15, 25, 40, 1100, 1400, 1500,
]

fig = plot_against_data(
    function=G,
    x_range=total_years,
    xdata=human_words_data_x,
    ydata=human_words_data_y,
    title='Words per Literate Person',
    yaxis_title='# Words',
)
fig.show()

In [120]:
def W(t):
    """Return the product ``P(t) * E(t) * G(t)`` for the given time(s) ``t``.

    ``P`` is the population spline, ``E`` the literate English-speaking
    fraction and ``G`` the words generated per literate person.
    """
    population = P(t)
    literate_fraction = E(t)
    words_per_person = G(t)
    return population * literate_fraction * words_per_person

# Draw W over the full year range next to the single 2025 reference value.
fig = plot_against_data(
    function=W,
    x_range=total_years,
    xdata=[2025],
    ydata=[2.2e12],
    title="Total English Words Generated",
    yaxis_title="# English Words Generated",
)
fig.show()