In [None]:
import duckdb

result = duckdb.sql("SELECT * FROM 'data/year=*/month=*/*.parquet'")

In [4]:
result = duckdb.sql("DESCRIBE 'data/year=*/month=*/*.parquet'").df()
print(result)

             column_name column_type null   key default extra
0             edit_count     INTEGER  YES  None    None  None
1             user_index     INTEGER  YES  None    None  None
2                  pos_x    SMALLINT  YES  None    None  None
3                  pos_y    SMALLINT  YES  None    None  None
4                 is_bot     BOOLEAN  YES  None    None  None
5            corporation     VARCHAR  YES  None    None  None
6             created_by     VARCHAR  YES  None    None  None
7           imagery_used   VARCHAR[]  YES  None    None  None
8               hashtags   VARCHAR[]  YES  None    None  None
9                 source   VARCHAR[]  YES  None    None  None
10  streetcomplete_quest     VARCHAR  YES  None    None  None
11             mobile_os     VARCHAR  YES  None    None  None
12              all_tags   VARCHAR[]  YES  None    None  None
13                 month      BIGINT  YES  None    None  None
14                  year      BIGINT  YES  None    None  None


In [5]:
import duckdb

# Quick column overview
columns = duckdb.sql("SELECT * FROM 'data/year=*/month=*/*.parquet' LIMIT 0").columns
types = duckdb.sql("SELECT * FROM 'data/year=*/month=*/*.parquet' LIMIT 0").types

for col, dtype in zip(columns, types):
    print(f"{col}: {dtype}")

edit_count: INTEGER
user_index: INTEGER
pos_x: SMALLINT
pos_y: SMALLINT
is_bot: BOOLEAN
corporation: VARCHAR
created_by: VARCHAR
imagery_used: VARCHAR[]
hashtags: VARCHAR[]
source: VARCHAR[]
streetcomplete_quest: VARCHAR
mobile_os: VARCHAR
all_tags: VARCHAR[]
month: BIGINT
year: BIGINT


In [9]:
duckdb.sql("SELECT * FROM 'data/year=*/month=*/*.parquet' LIMIT 1").df()

Unnamed: 0,edit_count,user_index,pos_x,pos_y,is_bot,corporation,created_by,imagery_used,hashtags,source,streetcomplete_quest,mobile_os,all_tags,month,year
0,4,32,180,142,False,,,,,,,,[],10,2005


In [10]:
import duckdb

result = duckdb.sql("""
    SELECT 
        year,
        month,
        COUNT(DISTINCT user_index) as unique_users,
        COUNT(*) as total_records
    FROM 'data/year=*/month=*/*.parquet'
    GROUP BY year, month
    ORDER BY year, month
""").df()

# Add a formatted date column if you want
result['date'] = result['year'].astype(str) + '-' + result['month'].astype(str).str.zfill(2)
print(result)

     year  month  unique_users  total_records     date
0    2005      4             4              8  2005-04
1    2005      5             5             45  2005-05
2    2005      6             8             49  2005-06
3    2005      7            16             97  2005-07
4    2005      8            22            107  2005-08
..    ...    ...           ...            ...      ...
200  2021     12         40051        1202017  2021-12
201  2022      1         41972        1219838  2022-01
202  2022      2         40240        1118106  2022-02
203  2022      3         43967        1218111  2022-03
204  2022      4         35231         847977  2022-04

[205 rows x 5 columns]


In [13]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Convert date column to datetime for better plotting
result['date'] = pd.to_datetime(result['date'])

# Method 1: Simple line plot of unique users over time
fig1 = px.line(result, x='date', y='unique_users', 
               title='Unique Users Over Time',
               labels={'unique_users': 'Number of Unique Users', 'date': 'Date'})
fig1.show()

# Method 2: Dual y-axis plot (unique users + total records)
fig2 = make_subplots(specs=[[{"secondary_y": True}]])

fig2.add_trace(
    go.Scatter(x=result['date'], y=result['unique_users'], name="Unique Users"),
    secondary_y=False,
)

fig2.add_trace(
    go.Scatter(x=result['date'], y=result['total_records'], name="Total Records"),
    secondary_y=True,
)

fig2.update_xaxes(title_text="Date")
fig2.update_yaxes(title_text="Unique Users", secondary_y=False)
fig2.update_yaxes(title_text="Total Records", secondary_y=True)
fig2.update_layout(title_text="User Activity Over Time")
fig2.show()

# Method 3: Bar chart
fig3 = px.bar(result, x='date', y='unique_users',
              title='Monthly Unique Users',
              labels={'unique_users': 'Number of Unique Users'})
fig3.show()

# Method 4: Add average edits per user
result['avg_edits_per_user'] = result['total_records'] / result['unique_users']

fig4 = px.line(result, x='date', y='avg_edits_per_user',
               title='Average Edits per User Over Time',
               labels={'avg_edits_per_user': 'Average Edits per User'})
fig4.show()

In [None]:
import pyarrow.parquet as pq
import pyarrow.compute as pc

# Read as PyArrow table (zero-copy, fastest)
table = pq.read_table('changesets_debug.parquet')

# Schema shows proper types
#print(table.schema)
# year: int16
# imagery_used: list<element: string>
# all_tags: list<element: string>

# Fast filtering and aggregation
large_changesets = table.filter(pc.greater(table['edit_count'], 1000))
total_edits = pc.sum(table['edit_count']).as_py()

# Access list data
tags = table['all_tags'][0].as_py()  # Returns: ['comment', 'created_by']
tags

In [3]:
# use pandas to read the parquet file
import pandas as pd
df = pd.read_parquet("changesets_debug.parquet")
print(df.schema())
# print the first 5 rows
#print(df.head())

# print the column names


AttributeError: 'DataFrame' object has no attribute 'schema'