# How many people are contributing to OSM?

In [49]:
import json
import warnings
from pathlib import Path

import dask.dataframe as dd
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import util

# Silence FutureWarnings emitted from dask's parquet reader module so that
# they do not clutter the notebook output.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    module="dask.dataframe.io.parquet.core",
)

# Directory that holds the preprocessed data; every loader below reads from it.
DATA_DIR = "../temp"

# Month/year labels and index -> label lookups provided by the project's util
# module (their exact shape is defined there).
MONTHS, YEARS = util.get_months_years(DATA_DIR)
TIME_DICT = util.get_month_year_dicts(DATA_DIR)

In [82]:
def load_ddf(tag, columns=None, filters=None):
    """Open the parquet dataset ``<DATA_DIR>/changeset_data/<tag>`` as a dask DataFrame.

    Args:
        tag: Name of the sub-directory below ``changeset_data`` to read.
        columns: Optional column selection, forwarded to ``dd.read_parquet``.
        filters: Optional row filters, forwarded to ``dd.read_parquet``.

    Returns:
        The lazily evaluated dask DataFrame returned by ``dd.read_parquet``.
    """
    parquet_dir = Path(DATA_DIR) / "changeset_data" / tag
    return dd.read_parquet(parquet_dir, columns=columns, filters=filters)

def save_data(name, pd_df_or_series):
    """Write a DataFrame or Series to ``../assets/data/<name>.json``.

    A Series is converted to a DataFrame with ``reset_index()``. If the data
    has a ``month_index`` (or otherwise a ``year_index``) column, that column
    is replaced by a leading ``months`` / ``years`` column obtained by mapping
    the index values through ``TIME_DICT["month"]`` / ``TIME_DICT["year"]``.

    For such time-indexed data the leading rows whose value columns are all
    zero are trimmed: output starts ``start_index_offset`` rows before the
    first row containing a non-zero value (3 rows for monthly data, 0 for
    yearly data). All-zero rows after that point are kept. If every row is
    zero, nothing is trimmed.

    The caller's object is never modified.

    Args:
        name: File stem of the JSON file to write.
        pd_df_or_series: ``pandas.DataFrame`` or ``pandas.Series`` to store.
    """
    if isinstance(pd_df_or_series, pd.Series):
        pd_df = pd_df_or_series.reset_index()
    else:
        # DataFrame.insert below works in place, so operate on a copy to avoid
        # adding a "months"/"years" column to the caller's DataFrame.
        pd_df = pd_df_or_series.copy()

    start_index_offset = None
    if "month_index" in pd_df:
        pd_df.insert(0, "months", pd_df["month_index"].map(TIME_DICT["month"]))
        pd_df = pd_df.drop(columns=["month_index"])
        start_index_offset = 3
    elif "year_index" in pd_df:
        pd_df.insert(0, "years", pd_df["year_index"].map(TIME_DICT["year"]))
        pd_df = pd_df.drop(columns=["year_index"])
        start_index_offset = 0

    # Trim leading rows that contain only zeros (time-indexed data only).
    if start_index_offset is not None:
        value_columns = [c for c in pd_df.columns if c not in ("months", "years")]
        row_has_nonzero = (pd_df[value_columns] != 0).any(axis=1).to_numpy()
        nonzero_positions = np.nonzero(row_has_nonzero)[0]

        if len(nonzero_positions) > 0:
            start_position = max(0, int(nonzero_positions[0]) - start_index_offset)
            # The offset is a row position, so slice positionally; a label
            # based .loc slice is only correct for a default RangeIndex.
            pd_df = pd_df.iloc[start_position:]

    pd_df.to_json(Path("../assets") / "data" / f"{name}.json", orient="split", index=False, indent=1)

def get_plot_config(filename, title, x, y, x_unit=None, y_unit=None):
    """Build a single-trace line plot configuration from a saved data file.

    Reads ``../assets/data/<filename>.json`` (``orient="split"``) and embeds
    the values of columns ``x`` and ``y`` directly in the returned dict.

    Args:
        filename: File stem of the JSON data file; also stored in the config.
        title: Plot title.
        x: Name of the column used for the x values.
        y: Name of the column used for the y values.
        x_unit: x-axis title; defaults to ``x``.
        y_unit: y-axis title and hover-text suffix; defaults to ``y``.

    Returns:
        A dict with the keys ``"filename"``, ``"layout"`` and ``"traces"``.
    """
    data_path = Path("../assets") / "data" / f"{filename}.json"
    df = pd.read_json(data_path, orient="split")

    x_unit = x if x_unit is None else x_unit
    y_unit = y if y_unit is None else y_unit

    layout = {
        "font": {"family": "Times", "size": 15},
        "paper_bgcolor": "#dfdfdf",
        "plot_bgcolor": "#dfdfdf",
        "margin": {"l": 55, "r": 55, "b": 55, "t": 55},
        "title": {"text": title, "x": 0.5, "xanchor": "center"},
        "xaxis": {"title": {"text": x_unit}, "gridcolor": "#d1d1d1", "linecolor": "#d1d1d1"},
        "yaxis": {"title": {"text": y_unit}, "gridcolor": "#d1d1d1", "rangemode": "tozero"},
        "autosize": False,
        "width": 1000,
        "height": 500,
    }
    line_trace = {
        "type": "scatter",
        "x": df[x].tolist(),
        "y": df[y].tolist(),
        "mode": "lines",
        "name": "",
        "hovertemplate": "%{x}<br>%{y:,} " + y_unit,
    }
    return {"filename": filename, "layout": layout, "traces": [line_trace]}

def save_plot_config(config):
    """Insert or replace ``config`` in ``../assets/data/plot_configs.json``.

    The file holds one JSON object that maps a plot's ``"filename"`` to its
    configuration. An existing entry with the same filename is overwritten;
    all other entries are preserved. If the file does not exist yet it is
    created, so the very first call does not fail.

    Args:
        config: Plot configuration dict; must contain the key ``"filename"``.

    Raises:
        KeyError: If ``config`` has no ``"filename"`` key.
        json.JSONDecodeError: If the existing file is not valid JSON.
    """
    configs_path = Path("../assets") / "data" / "plot_configs.json"

    try:
        with configs_path.open("r", encoding="UTF-8") as json_file:
            configs = json.load(json_file)
    except FileNotFoundError:
        # First run: there are no stored configs yet, start a fresh mapping.
        configs = {}

    configs[config["filename"]] = config
    with configs_path.open("w", encoding="UTF-8") as json_file:
        json.dump(configs, json_file, indent=4)
    

def show_plot(plot_config):
    """Display a plot configuration as a plotly figure.

    Only traces whose ``"type"`` is ``"scatter"`` are added to the figure;
    traces of any other type are skipped. The ``"layout"`` entry is applied
    after all traces have been added.

    Args:
        plot_config: Dict with the keys ``"traces"`` and ``"layout"``.
    """
    fig = go.Figure()

    scatter_traces = (t for t in plot_config["traces"] if t["type"] == "scatter")
    for scatter_trace in scatter_traces:
        fig.add_trace(go.Scatter(scatter_trace))

    fig.update_layout(plot_config["layout"])
    fig.show()

## Contributors per Month

In [8]:
# Only the month and the user of each row are needed here.
ddf = load_ddf("general", ["month_index", "user_index"])

# For every month, collect the distinct user_index values. The result is kept
# because later cells reuse it.
contributors_unique_monthly = (
    ddf.groupby(["month_index"], observed=False)["user_index"]
    .unique()
    .rename("contributors")
    .compute()
)

# The number of distinct users per month is the length of each collection.
save_data(
    "general_contributor_count_monthly",
    contributors_unique_monthly.apply(len),
)

In [83]:
# Build, persist and display the plot of contributors per month.
plot_config = get_plot_config(
    "general_contributor_count_monthly",
    "contributors per month",
    x="months",
    y="contributors",
)
save_plot_config(plot_config)
show_plot(plot_config)

## New Contributors per Month

In [86]:
# util.cumsum_new_nunique receives the per-month collections of distinct users;
# presumably it returns the number of first-time contributors per month
# (confirm in util).
save_data(
    "general_new_contributor_count_monthly",
    util.cumsum_new_nunique(contributors_unique_monthly),
)

In [87]:
# Build, persist and display the plot of new contributors per month. The data
# column is still called "contributors"; only the axis/hover label differs.
plot_config = get_plot_config(
    "general_new_contributor_count_monthly",
    "new contributors per month",
    x="months",
    y="contributors",
    y_unit="new contributors",
)
save_plot_config(plot_config)
show_plot(plot_config)

## contributors per month without maps.me

In [59]:
# Look up the indices under which the two MAPS.ME editors are stored in the
# "created_by" column.
created_by_tag_to_index = util.load_tag_to_index(DATA_DIR, "created_by")
editor_indices = np.array([created_by_tag_to_index["MAPS.ME android"], created_by_tag_to_index["MAPS.ME ios"]])

# Load every column this cell needs into ONE frame and filter that frame with
# its own mask. Filtering the global `ddf` with a mask from a separately loaded
# frame only works while both frames have identical partitioning, and it
# silently depends on which cell last rebound `ddf`.
ddf_created_by = load_ddf("general", ["month_index", "user_index", "created_by"])
ddf_without_editors = ddf_created_by[~ddf_created_by["created_by"].isin(editor_indices)]

# Count the distinct users per month among the remaining rows.
save_data(
    "general_no_maps_me_contributor_count_monthly",
    ddf_without_editors.groupby(["month_index"], observed=False)["user_index"].nunique().rename("contributors without maps.me").compute()
)

In [85]:
# Build, persist and display the plot of monthly contributors excluding
# the MAPS.ME editors.
plot_config = get_plot_config(
    "general_no_maps_me_contributor_count_monthly",
    "contributors per month without maps.me contributors",
    x="months",
    y="contributors without maps.me",
)
save_plot_config(plot_config)
show_plot(plot_config)

## contributors with more than k edits

In [61]:
# Sum the edits of every user over the whole dataset.
ddf = load_ddf("general", ["month_index", "edits", "user_index"])
total_edits_of_contributors = ddf.groupby(["user_index"], observed=False)["edits"].sum().compute()

# Turn each month's collection of distinct users into a set so that it can be
# intersected with the set of users above a threshold.
contributors_unique_monthly_set = contributors_unique_monthly.apply(set)

# One Series per threshold k: for every month, the number of that month's
# contributors whose total edit count is strictly greater than k.
contributor_count_more_the_k_edits_monthly = []
for k in (10, 100, 1_000, 10_000, 100_000):
    exceeds_k = total_edits_of_contributors > k
    contributors_with_more_than_k_edits = set(total_edits_of_contributors[exceeds_k].index.to_numpy())
    monthly_count = contributors_unique_monthly_set.apply(
        lambda users: len(users & contributors_with_more_than_k_edits),
    )
    contributor_count_more_the_k_edits_monthly.append(monthly_count.rename(f"more then {k} edits"))

# Combine the per-threshold Series side by side and store them as one table.
save_data(
    "general_contributor_count_more_the_k_edits_monthly",
    pd.concat(contributor_count_more_the_k_edits_monthly, axis=1).reset_index(),
)

In [None]:
# The saved table has a "months" column plus one column per edit threshold and
# no "contributors" column, so asking get_plot_config for y="contributors"
# raises a KeyError. Build the base config from the first threshold column and
# then emit one trace per threshold column.
more_than_k_filename = "general_contributor_count_more_the_k_edits_monthly"
df_more_than_k = pd.read_json(Path("../assets") / "data" / f"{more_than_k_filename}.json", orient="split")
threshold_columns = [column for column in df_more_than_k.columns if column != "months"]

plot_config = get_plot_config(
    more_than_k_filename,
    "contributors with more than k edits total",
    "months",
    threshold_columns[0],
    y_unit="contributors",
)

# Reuse the generated trace (x values, mode, hover template) as a template and
# swap in the y values and a legend name for each threshold.
base_trace = plot_config["traces"][0]
plot_config["traces"] = [
    {**base_trace, "y": df_more_than_k[column].tolist(), "name": column}
    for column in threshold_columns
]

save_plot_config(plot_config)
show_plot(plot_config)

In [None]:
# Observation: There are a lot of new contributors each month, but they don't seem to stick around for long:
# around a third of all contributors in any given month appear to be new. That's strange.
# TODO: Show this in a plot.
# TODO: Yearly attrition rate of contributors?
# TODO: Monthly attrition rate over the last 12 months.