# OpenStreetMap Changeset Discussions

This notebook analyzes discussions and comments on OpenStreetMap changesets. Changeset discussions allow mappers to communicate about specific changes, ask questions, report issues or provide feedback.

In [1]:
import duckdb
import util

# Project-local setup hook; what it configures is defined in `util` (not visible
# here) -- presumably notebook/plot defaults, TODO confirm.
util.init()
# Pin the DuckDB session time zone to UTC.
# NOTE(review): presumably so the YEAR()/MONTH() bucketing of comment dates in
# the queries below does not depend on the machine's local zone -- confirm that
# `date` is a time-zone-aware column.
duckdb.sql("SET TimeZone='UTC'")

## Monthly Discussion Activity

In [2]:
# Monthly changeset-discussion metrics, one row per calendar month:
#   - Comments:                 number of comments posted that month
#   - Commenters:               distinct users who commented that month
#   - New Commenters:           users whose first-ever comment falls in that month
#   - Changesets with Comments: distinct changesets that received a comment
#   - Accumulated Comments / Accumulated Commenters: running totals over time
#
# "Accumulated Commenters" is the running total of *new* commenters, i.e. the
# number of distinct users who have commented at least once up to that month.
# Summing the monthly "Commenters" column instead would count a user once for
# every month in which they were active and overstate the total.
df = duckdb.sql("""
WITH comment_dates AS (
    -- One row per comment, tagged with its calendar month.
    SELECT
        YEAR(date) as year,
        MONTH(date) as month,
        CONCAT(CAST(YEAR(date) AS VARCHAR), '-', LPAD(CAST(MONTH(date) AS VARCHAR), 2, '0')) as months,
        user_name,
        changeset_id
    FROM '../changeset_comments_data/*.parquet'
),
user_first_comment AS (
    -- Rank each user's active months chronologically; rn = 1 is the first one.
    SELECT
        user_name,
        year,
        month,
        ROW_NUMBER() OVER (PARTITION BY user_name ORDER BY year, month) as rn
    FROM (
        SELECT DISTINCT user_name, year, month
        FROM comment_dates
    )
),
first_commenters AS (
    SELECT user_name, year, month
    FROM user_first_comment
    WHERE rn = 1
),
monthly_metrics AS (
    SELECT
        year,
        month,
        months,
        COUNT(*) as "Comments",
        COUNT(DISTINCT user_name) as "Commenters",
        COUNT(DISTINCT changeset_id) as "Changesets with Comments"
    FROM comment_dates
    GROUP BY year, month, months
),
monthly_new_commenters AS (
    SELECT
        year,
        month,
        COUNT(DISTINCT user_name) as "New Commenters"
    FROM first_commenters
    GROUP BY year, month
),
combined_metrics AS (
    -- LEFT JOIN + COALESCE keeps months in which nobody commented for the first time.
    SELECT
        m.year,
        m.month,
        m.months,
        m.Comments,
        m.Commenters,
        COALESCE(n."New Commenters", 0) as "New Commenters",
        m."Changesets with Comments"
    FROM monthly_metrics m
    LEFT JOIN monthly_new_commenters n ON m.year = n.year AND m.month = n.month
)
SELECT
    months,
    Comments,
    Commenters,
    "New Commenters",
    "Changesets with Comments",
    SUM(Comments) OVER (ORDER BY year, month) as "Accumulated Comments",
    -- Running count of distinct users: each user contributes exactly once,
    -- in the month of their first comment.
    SUM("New Commenters") OVER (ORDER BY year, month) as "Accumulated Commenters"
FROM combined_metrics
ORDER BY year, month
""").df()

# One panel per monthly metric, all sharing the "months" axis.
util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Comments",
            label="Comments",
            x_col="months",
            y_col="Comments",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Commenters",
            label="Commenters",
            x_col="months",
            y_col="Commenters",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly New Commenters",
            label="New Commenters",
            x_col="months",
            y_col="New Commenters",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Changesets with Comments",
            label="Changesets",
            x_col="months",
            y_col="Changesets with Comments",
            query_or_df=df,
        ),
    ]
)

## Accumulated Discussion Activity

In [3]:
# Plot the running-total columns of `df` (built in the monthly-activity cell
# above). For these panels the title, legend label and column name coincide.
util.show_figure(
    [
        util.FigureConfig(
            title=column,
            label=column,
            x_col="months",
            y_col=column,
            query_or_df=df,
        )
        for column in ("Accumulated Comments", "Accumulated Commenters")
    ]
)

## Comment Length Distribution

In [4]:
# Per-month average and median of LENGTH(text) over all comments whose text is
# not NULL; both statistics are cast to INTEGER by the query.
df = duckdb.sql("""
WITH comment_lengths AS (
    SELECT
        YEAR(date) as year,
        MONTH(date) as month,
        CONCAT(CAST(YEAR(date) AS VARCHAR), '-', LPAD(CAST(MONTH(date) AS VARCHAR), 2, '0')) as months,
        LENGTH(text) as comment_length
    FROM '../changeset_comments_data/*.parquet'
    WHERE text IS NOT NULL
)
SELECT
    months,
    CAST(AVG(comment_length) as INTEGER) as "Average Comment Length",
    CAST(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY comment_length) as INTEGER) as "Median Comment Length"
FROM comment_lengths
GROUP BY year, month, months
ORDER BY year, month
""").df()

# One panel per statistic: (figure title, legend label, result column).
util.show_figure(
    [
        util.FigureConfig(
            title=panel_title,
            label=panel_label,
            x_col="months",
            y_col=panel_column,
            query_or_df=df,
        )
        for panel_title, panel_label, panel_column in (
            ("Average Comment Length (Characters)", "Average Length", "Average Comment Length"),
            ("Median Comment Length (Characters)", "Median Length", "Median Comment Length"),
        )
    ]
)