# How many people are contributing to OSM?

Go [here](https://nbviewer.org/github/piebro/openstreetmap-statistics/blob/master/src/questions/how_many_people_are_contributing_to_osm/calculations.ipynb) to see the same notebook with rendered outputs.

In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Make the directory two levels above the working directory importable,
# so that the project helper module below can be found.
project_src_dir = (Path.cwd() / ".." / "..").resolve()
sys.path.append(str(project_src_dir))

import new_util as util

# Directory (relative to this notebook) holding the preprocessed data.
DATA_DIR = "../../../temp"

# Month/year axes and lookup dicts provided by the helper module.
MONTHS, YEARS = util.get_months_years(DATA_DIR)
TIME_DICT = util.get_month_year_dicts(DATA_DIR)

# NOTE(review): presumably clears data/plots saved by an earlier run — confirm in new_util.
util.reset_data_and_plots()

## Contributors per Month

In [2]:
# Collect, for every month, the array of distinct users that contributed.
ddf = util.load_ddf(DATA_DIR, "general", ["month_index", "user_index"])
users_grouped_by_month = ddf.groupby(["month_index"], observed=False)["user_index"]
contributors_unique_monthly = users_grouped_by_month.unique().rename("contributors").compute()

# The monthly contributor count is the length of each month's unique-user array.
contributor_count_monthly = contributors_unique_monthly.apply(len)
util.save_data(DATA_DIR, "general_contributor_count_monthly", contributor_count_monthly)

In [3]:
# Build the plot configuration for the dataset saved above, then store and display it.
monthly_contributors_plot = util.get_plot_config(
    "general_contributor_count_monthly",
    "contributors per month",
)
util.save_plot_config(monthly_contributors_plot)
util.show_plot(monthly_contributors_plot)

## New Contributors per Month

In [4]:
# NOTE(review): presumably counts, per month, the users not seen in any earlier month —
# confirm against util.cumsum_new_nunique.
new_contributor_count_monthly = util.cumsum_new_nunique(contributors_unique_monthly)
util.save_data(
    DATA_DIR,
    "general_new_contributor_count_monthly",
    new_contributor_count_monthly,
)

In [5]:
# Plot configuration for the new-contributor dataset, with an explicit y-axis unit.
new_contributors_plot = util.get_plot_config(
    "general_new_contributor_count_monthly",
    "new contributors per month",
    y_unit="new contributors",
)
util.save_plot_config(new_contributors_plot)
util.show_plot(new_contributors_plot)

In [6]:
# Derive the accumulated variant of the new-contributor dataset; the plot below
# reads it under the "_accumulated" suffixed name.
util.save_accumulated("general_new_contributor_count_monthly")

total_contributors_plot = util.get_plot_config(
    "general_new_contributor_count_monthly_accumulated",
    "total contributor count",
    y_unit="total contributor count",
)
util.save_plot_config(total_contributors_plot)
util.show_plot(total_contributors_plot)

## Contributors with more than k edits

In [7]:
# Sum of edits per user over everything that was loaded (no month filter is applied).
ddf = util.load_ddf(DATA_DIR, "general", ["month_index", "edits", "user_index"])
lifetime_edits = ddf.groupby(["user_index"], observed=False)["edits"].sum().compute()

# Turn each month's array of users into a set once, so intersections are cheap.
monthly_contributor_sets = contributors_unique_monthly.apply(set)


def _monthly_count_above(threshold):
    """Return, per month, how many active users have more than `threshold` edits in total."""
    qualifying_users = set(lifetime_edits.loc[lifetime_edits > threshold].index.to_numpy())
    counts = monthly_contributor_sets.apply(lambda active: len(active & qualifying_users))
    return counts.rename(f"more then {threshold} edits")


# One column per threshold, joined side by side on the month index.
threshold_columns = [_monthly_count_above(k) for k in (10, 100, 1_000, 10_000, 100_000)]
util.save_data(
    DATA_DIR,
    "general_contributor_count_more_the_k_edits_monthly",
    pd.concat(threshold_columns, axis=1).reset_index(),
)

In [8]:
# Plot configuration for the per-threshold contributor counts saved above.
threshold_plot = util.get_plot_config(
    "general_contributor_count_more_the_k_edits_monthly",
    "contributors with more than k edits total",
    y_unit="contributors",
)
util.save_plot_config(threshold_plot)
util.show_plot(threshold_plot)

## contributors per month without maps.me

In [9]:
# Count unique contributors per month, ignoring all rows created with MAPS.ME.
created_by_tag_to_index = util.load_tag_to_index(DATA_DIR, "created_by")
editor_indices = np.array([created_by_tag_to_index["MAPS.ME"]])

# Load every column this cell needs into ONE dataframe and mask it with its own
# "created_by" column. Previously the mask was applied to the global `ddf` left over
# from an earlier cell; that name is rebound to a single-month frame further down, so
# re-running this cell out of order silently used the wrong data, and it also relied
# on two independently loaded frames having aligned partitions.
ddf_created_by = util.load_ddf(DATA_DIR, "general", ["month_index", "user_index", "created_by"])
ddf_without_editors = ddf_created_by[~ddf_created_by["created_by"].isin(editor_indices)]

contributors_without_maps_me = (
    ddf_without_editors.groupby(["month_index"], observed=False)["user_index"]
    .nunique()
    .rename("contributors without maps.me")
    .compute()
)
util.save_data(
    DATA_DIR,
    "general_no_maps_me_contributor_count_monthly",
    contributors_without_maps_me,
)

In [10]:
# Plot configuration for the contributor counts that exclude MAPS.ME rows.
no_maps_me_plot = util.get_plot_config(
    "general_no_maps_me_contributor_count_monthly",
    "contributors per month without maps.me contributors",
)
util.save_plot_config(no_maps_me_plot)
util.show_plot(no_maps_me_plot)

## median number of edits per contributor per month

In [11]:
# For each month: load only that month's rows, total the edits per user, and keep
# the median of those per-user totals.
monthly_medians = []
for month_index in range(len(MONTHS)):
    month_ddf = util.load_ddf(
        DATA_DIR,
        "general",
        ("edits", "user_index"),
        [("month_index", "==", month_index)],
    )
    edits_per_user = month_ddf.groupby(["user_index"], observed=False)["edits"].sum().compute()
    monthly_medians.append(edits_per_user.median())

# The list position is the month index; expose it as a named column when saving.
median_table = pd.DataFrame({"median number of edits per contributor": monthly_medians})
median_table = median_table.rename_axis("month_index").reset_index()
util.save_data(DATA_DIR, "general_edit_count_per_contributor_median_monthly", median_table)

In [12]:
# Plot configuration for the monthly median of edits per contributor.
median_edits_plot = util.get_plot_config(
    "general_edit_count_per_contributor_median_monthly",
    "median number of edits per contributor per month",
)
util.save_plot_config(median_edits_plot)
util.show_plot(median_edits_plot)

## further ideas

In [13]:
# Observation: There are a lot of new contributors per month, but they don't seem to stick around for long,
# because each month around a third of all contributors seem to be new. That's strange.
# IDEA: Show this in a plot
# IDEA: yearly attrition rate of contributors?
# IDEA: monthly attrition rate over the last 12 months.

# IDEA: make all the plots also available with "per year" maybe