In [164]:
import polars as pl
import altair as alt
from vega_datasets import data

In [165]:
# Read the cleaned roller-coaster dataset from its Arrow IPC file.
df = pl.read_ipc(source='./data/coaster_db-cleaned.arrow')

# Bare last expression: the notebook renders the frame as this cell's output.
df

name,location,status,material,manufacturer,length_in_m,height_in_m,speed_in_kmh,duration,capacity,g_force,opening_date,closing_year,latitude,longitude
str,str,cat,cat,str,f64,f64,f64,duration[ms],i64,f64,date,i16,f64,f64
"""Switchback Railway""","""Coney Island""","""closed""","""wood""","""LaMarcus Adna Thompson""",180.0,15.0,9.7,1m,1600,2.9,1884-06-16,,40.574,-73.978
"""Flip Flap Railway""","""Sea Lion Park""","""closed""","""wood""","""Lina Beecher""",,,,,,12.0,1895-01-01,1902,40.578,-73.979
"""Switchback Railway (Euclid Bea…","""Cleveland, Ohio, United States""","""closed""","""other""",,,,,,,,,,41.58,-81.57
"""Loop the Loop (Coney Island)""","""Other""","""closed""","""steel""","""Edwin Prescott""",,,,,,,1901-01-01,1910,40.5745,-73.978
"""Loop the Loop (Young's Pier)""","""Other""","""closed""","""steel""","""Edwin Prescott""",,,,,,,1901-01-01,1912,39.3538,-74.4342
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Ice Breaker (roller coaster)""","""SeaWorld Orlando""","""in_production""","""steel""","""Premier Rides""",580.0,28.0,84.0,,,,2022-02-01,,28.4088,-81.4633
"""Leviathan (Sea World)""","""Sea World""","""in_production""","""wood""","""Martin & Vleminckx""",,,79.984198,,720,,2022-01-01,,-27.9574,153.4263
"""Pantheon (roller coaster)""","""Busch Gardens Williamsburg""","""in_production""","""steel""","""Intamin""",1014.0,54.0,117.0,,,,2022-01-01,,37.2339,-76.6426
"""Tumbili""","""Kings Dominion""","""in_production""","""steel""","""S&S – Sansei Technologies""",230.0,34.0,55.0,55s,,,,,,


### Analyze material used


In [166]:
# Pie chart of coaster counts per material: group_by(...).len() yields one row
# per material with its row count in a `len` column, which sizes each arc.
(
    alt.Chart(df.group_by('material').len())
    .mark_arc()
    .encode(
        theta=alt.Theta('len:Q'),
        color=alt.Color('material:N'),
        tooltip=['material', 'len'],
    )
)

### Analyze status


In [167]:
# Bar chart of coaster counts per status: one bar per status value, with the
# bar height taken from the `len` column produced by group_by(...).len().
(
    alt.Chart(df.group_by('status').len())
    .mark_bar()
    .encode(
        x=alt.X('status:N'),
        y=alt.Y('len:Q'),
        color=alt.Color('status:N'),
        tooltip=['status', 'len'],
    )
    .properties(width=500)
)

### Show the top 10 manufacturers by number of roller coasters


In [168]:
# Count coasters per manufacturer. Rows with no recorded manufacturer are
# dropped first; otherwise the null group is counted as if it were a
# manufacturer and can occupy one of the top-10 slots.
dfx = (
    df.drop_nulls(subset=['manufacturer'])
    .group_by('manufacturer')
    .len()
)

# Sort by count, trim to only top 10, & add rank number (starting at 1).
# group_by does not guarantee row order, so ties are broken alphabetically
# to keep the top-10 cut-off reproducible between runs.
dfx = (
    dfx.sort(['len', 'manufacturer'], descending=[True, False])
    .head(10)
    .with_row_index('#', 1)
)

alt.Chart(dfx).mark_bar().encode(
    x=alt.X(field='len', type='quantitative'),
    # sort='-x' orders the bars by descending count.
    y=alt.Y(field='manufacturer', type='nominal', sort='-x'),
    tooltip=['#', 'manufacturer', 'len'],
    color=alt.Color(field='manufacturer', type='nominal'),
).properties(width=500, height=300)

### Analyze roller coaster openings by decade and month


In [169]:
months = [
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december"
]

# Lookup tables in both directions: month number (1-12) <-> month name.
month_names = dict(enumerate(months, start=1))
month_numbers = {name: number for number, name in month_names.items()}

# Derive the decade label (e.g. "1980s") and the month name of each opening.
# Coasters without an opening date are dropped first: they have neither a
# decade nor a month and would otherwise form a null group that the charts
# plot as an extra category.
dfx = df.drop_nulls(subset=['opening_date']).select(
    (
        (pl.col('opening_date').dt.year() // 10 * 10).cast(pl.String) + 's'
    ).alias('decade'),
    # Native lookup instead of a per-row Python lambda.
    pl.col('opening_date').dt.month()
    .replace_strict(month_names, return_dtype=pl.String)
    .alias('month'),
)

# Number of openings per decade; sorted so the table itself is deterministic
# (group_by does not guarantee row order).
decade_agg_df = (
    dfx.group_by('decade')
    .agg(pl.len().alias('count'))
    .sort('decade')
)

# Number of openings per month, ordered by calendar position rather than
# alphabetically by month name.
month_agg_df = (
    dfx.group_by('month')
    .agg(pl.len().alias('count'))
    .sort(pl.col('month').replace_strict(month_numbers, return_dtype=pl.Int8))
)


def display_chart(df: pl.DataFrame, title: str, sort_x: bool):
    """Display a line chart with circle markers built from a two-column frame.

    The first column of ``df`` is encoded on the x axis as a nominal field and
    the second column on the y axis as a quantitative field.

    Args:
        df: Frame whose first column holds the categories and whose second
            column holds the values to plot.
        title: Title shown on the chart.
        sort_x: When true the x encoding is created with ``sort='x'``;
            otherwise it is created with ``sort=None``.
    """
    x_field = df.columns[0]
    y_field = df.columns[1]
    x_sort = 'x' if sort_x else None

    # The line and the markers share the same x/y encoding, so define it once.
    base = alt.Chart(df).encode(
        x=alt.X(field=x_field, type='nominal', sort=x_sort),
        y=alt.Y(field=y_field, type='quantitative'),
    )

    line_layer = base.mark_line(strokeWidth=2.5)

    # Only the marker layer carries the tooltip, title and size.
    point_layer = (
        base.mark_circle(size=60)
        .encode(tooltip=[x_field, y_field])
        .properties(title=title, width=600, height=300)
    )

    display(line_layer + point_layer)


# Decades are passed with sort_x=True; months are passed with sort_x=False
# because month_agg_df was already put in calendar order when it was built.
display_chart(
    decade_agg_df,
    title='Roller Coaster Openings by Decade',
    sort_x=True,
)
display_chart(
    month_agg_df,
    title='Roller Coaster Openings by Month',
    sort_x=False,
)

### Analyze roller coaster durations by minute


In [170]:
# Get the whole minutes of each ride duration.
# - Rows without a duration are dropped so they do not form a null group.
# - The minute is kept as an integer. The previous string labels carried an
#   's' suffix ("1s", "2s", ...), which reads as seconds and orders the axis
#   lexicographically ("10s" before "2s").
# - dt.total_minutes() replaces the per-row Python lambda.
dfx = df.drop_nulls(subset=['duration']).select(
    pl.col('duration').dt.total_minutes().alias('minute')
)

# Aggregate: number of coasters per whole minute, in ascending minute order.
dfx = dfx.group_by('minute').agg(pl.len().alias('count')).sort('minute')

# Layer the point markers over the line; the line layer carries the size/title.
(
    dfx.plot.scatter(x='minute', y='count')
    +
    dfx.plot.line(x='minute', y='count').properties(
        width=600, height=300, title='Roller Coaster Durations by Minute'
    )
)

### Geospatial Map


In [171]:
# Country outlines from the vega_datasets world_110m TopoJSON file.
countries = alt.topo_feature(data.world_110m.url, 'countries')

# Only coasters with both coordinates can be placed on the map.
dfx = (
    df.select('name', 'location', 'status', 'manufacturer', 'latitude', 'longitude')
    .drop_nulls(subset=['latitude', 'longitude'])
)

# Background layer. Named `world_map` rather than `map` so the builtin `map`
# is not shadowed for every cell that runs after this one.
world_map = alt.Chart(countries).mark_geoshape(
    fill='mediumaquamarine',
    stroke='white'
).project(
    "equirectangular"
).properties(
    width=1000,
    height=500
)

selection_point = alt.selection_point()

# One circle per coaster, layered over the background. Points in the selection
# are colored by status; points outside it are drawn light gray.
world_map + alt.Chart(dfx).mark_circle(size=30).encode(
    latitude=alt.Latitude(field='latitude', type='quantitative'),
    longitude=alt.Longitude(field='longitude', type='quantitative'),
    tooltip=[
        'name', 'location', 'status',
        'manufacturer', 'latitude', 'longitude'
    ],
    color=alt.condition(selection_point, 'status', alt.value('lightgray'))
).add_params(
    selection_point
)