In [149]:
import re
import polars as pl

In [150]:
# Read the coaster dataset from the notebook-relative data folder and
# preview the first five rows.
df = pl.read_csv(source=r'./data/coaster_db.csv')
df.head(n=5)

coaster_name,Length,Speed,Location,Status,Opening date,Type,Manufacturer,Height restriction,Model,Height,Inversions,Lift/launch system,Cost,Trains,Park section,Duration,Capacity,G-force,Designer,Max vertical angle,Drop,Soft opening date,Fast Lane available,Replaced,Track layout,Fastrack available,Soft opening date.1,Closing date,Opened,Replaced by,Website,Flash Pass Available,Must transfer from wheelchair,Theme,Single rider line available,Restraint Style,Flash Pass available,Acceleration,Restraints,Name,year_introduced,latitude,longitude,Type_Main,opening_date_clean,speed1,speed2,speed1_value,speed1_unit,speed_mph,height_value,height_unit,height_ft,Inversions_clean,Gforce_clean
str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,f64,f64,str,str,str,str,f64,str,f64,f64,str,f64,i64,f64
"""Switchback Railway""","""600 ft (180 m)""","""6 mph (9.7 km/h)""","""Coney Island""","""Removed""","""June 16, 1884""","""Wood""","""LaMarcus Adna Thompson""",,"""Lift Packed""","""50 ft (15 m)""",,"""gravity""",,,"""Coney Island Cyclone Site""","""1:00""","""1600 riders per hour""","""2.9""","""LaMarcus Adna Thompson""","""30°""","""43 ft (13 m)""",,,,"""Gravity pulled coaster""",,,,,,,,,,,,,,,,1884,40.574,-73.978,"""Wood""","""1884-06-16""","""6 mph ""","""9.7 km/h""",6.0,"""mph""",6.0,50.0,"""ft""",,0,2.9
"""Flip Flap Railway""",,,"""Sea Lion Park""","""Removed""","""1895""","""Wood""","""Lina Beecher""",,,,1.0,,,"""a single car. Riders are arran…",,,,"""12""","""Lina Beecher""",,,,,,,,,"""1902""",,,,,,,,,,,,,1895,40.578,-73.979,"""Wood""","""1895-01-01""",,,,,,,,,1,12.0
"""Switchback Railway (Euclid Bea…",,,"""Cleveland, Ohio, United States""","""Closed""",,"""Other""",,,,,,,,,,,,,,,,,,,,,,,"""1895""",,,,,,,,,,,,1896,41.58,-81.57,"""Other""",,,,,,,,,,0,
"""Loop the Loop (Coney Island)""",,,"""Other""","""Removed""","""1901""","""Steel""","""Edwin Prescott""",,,,1.0,,,"""a single car. Riders are arran…",,,,,"""Edward A. Green""",,,,,"""Switchback Railway""",,,,"""1910""",,"""Giant Racer""",,,,,,,,,,,1901,40.5745,-73.978,"""Steel""","""1901-01-01""",,,,,,,,,1,
"""Loop the Loop (Young's Pier)""",,,"""Other""","""Removed""","""1901""","""Steel""","""Edwin Prescott""",,,,1.0,,,,,,,,"""Edward A. Green""",,,,,,,,,"""1912""",,,,,,,,,,,,,1901,39.3538,-74.4342,"""Steel""","""1901-01-01""",,,,,,,,,1,


### Select the columns to keep


In [151]:
# Narrow the frame to the columns used from here on; every other column
# from the CSV is dropped.
df = df.select([
    'coaster_name',
    'Location',
    'Status',
    'Type_Main',
    'Manufacturer',
    'Length',
    'Height',
    'speed2',
    'Duration',
    'Capacity',
    'Gforce_clean',
    'opening_date_clean',
    'Closing date',
    'latitude',
    'longitude',
])
df.head(1)

coaster_name,Location,Status,Type_Main,Manufacturer,Length,Height,speed2,Duration,Capacity,Gforce_clean,opening_date_clean,Closing date,latitude,longitude
str,str,str,str,str,str,str,str,str,str,f64,str,str,f64,f64
"""Switchback Railway""","""Coney Island""","""Removed""","""Wood""","""LaMarcus Adna Thompson""","""600 ft (180 m)""","""50 ft (15 m)""","""9.7 km/h""","""1:00""","""1600 riders per hour""",2.9,"""1884-06-16""",,40.574,-73.978


### Rename columns


In [152]:
# Two renaming passes in one pipeline:
#   1. snake_case every header (lower-case, spaces -> underscores);
#   2. give selected columns more descriptive names.
# NOTE: the unit suffixes (_in_m, _in_kmh, _in_sec) name the intended
# content; at this point those columns still hold the raw text.
df = (
    df
    .rename(lambda col: col.lower().replace(' ', '_'))
    .rename({
        'coaster_name': 'name',
        'opening_date_clean': 'opening_date',
        'closing_date': 'closing_year',
        'speed2': 'speed_in_kmh',
        'duration': 'duration_in_sec',
        'gforce_clean': 'g_force',
        'length': 'length_in_m',
        'height': 'height_in_m',
    })
)
df.head(1)

name,location,status,type_main,manufacturer,length_in_m,height_in_m,speed_in_kmh,duration_in_sec,capacity,g_force,opening_date,closing_year,latitude,longitude
str,str,str,str,str,str,str,str,str,str,f64,str,str,f64,f64
"""Switchback Railway""","""Coney Island""","""Removed""","""Wood""","""LaMarcus Adna Thompson""","""600 ft (180 m)""","""50 ft (15 m)""","""9.7 km/h""","""1:00""","""1600 riders per hour""",2.9,"""1884-06-16""",,40.574,-73.978


### Change status column's values


In [153]:
def change_status(status: str) -> str:
    """Collapse a raw status string into a canonical snake_case label.

    Known spellings are grouped under a shared label, 'Under construction'
    is folded into 'In Production', and any other value passes through
    unchanged. The chosen label is then lower-cased and its spaces are
    replaced with underscores.

    Args:
        status: Raw status text. A non-string such as None is not handled
            and fails at the final ``.lower()`` call.

    Returns:
        The normalized label, e.g. 'closed', 'sbno' or 'in_production'.
    """
    # (raw spellings, label) pairs, checked in order; the first hit wins.
    status_groups = (
        (
            ('Temporarily closed', 'Temporarily Closed'),
            'temporary_closed',
        ),
        (
            (
                'Closed', 'Closed in 2021', 'Not Currently Operating',
                'Discontinued', 'Removed',
                'Chapter 7 bankruptcy; rides dismantled and sold; property sold',
            ),
            'closed',
        ),
        (
            ('SBNO (Standing But Not Operating)', 'SBNO December 2019'),
            'sbno',
        ),
        (
            (
                'Under Maintenance',
                'closed for maintenance as of july 30 no reopening date known',
            ),
            'under_maintenance',
        ),
        (
            ('Under construction',),
            'In Production',
        ),
    )

    # Fall back to the raw value when no group claims it.
    label = next(
        (name for spellings, name in status_groups if status in spellings),
        status,
    )
    return label.lower().replace(' ', '_')


# Distinct raw labels, shown for comparison with the cleaned ones below.
print('Before:')
display(df['status'].unique().to_list())

# Single pass over the column: nulls become 'unknown', every label is
# normalised by change_status, and the result is stored as a categorical.
df = df.with_columns(
    pl.col('status')
    .fill_null('unknown')
    .map_elements(change_status, return_dtype=pl.String)
    .cast(pl.Categorical)
)

print('After:')
display(df['status'].unique().to_list())

Before:


['Operating',
 'closed for maintenance as of july 30 no reopening date known',
 'Not Currently Operating',
 'Under Maintenance',
 'Temporarily closed',
 'Closed',
 'Chapter 7 bankruptcy; rides dismantled and sold; property sold',
 'SBNO (Standing But Not Operating)',
 'SBNO December 2019',
 'Removed',
 'Closed in 2021',
 'Temporarily Closed',
 None,
 'Under construction',
 'In Production',
 'Discontinued']

After:


['closed',
 'operating',
 'unknown',
 'in_production',
 'under_maintenance',
 'sbno',
 'temporary_closed']

### Extract meters from the length & height columns


In [154]:
def extract_meter(value: str) -> "float | None":
    """Pull the metre figure out of a free-text measurement.

    Two layouts are recognised, in this order:

    1. Metres as the parenthesised conversion: ``"600 ft (180 m)"`` -> 180.0
    2. Metres stated outside the parentheses: ``"1,000 m (3,281 ft)"``
       -> 1000.0

    Thousands separators (``,``) are stripped before conversion.

    Args:
        value: Raw measurement text, or None.

    Returns:
        The metre value as a float, or None when ``value`` is None, holds
        no metre figure, or the matched digits do not form a valid number.
    """
    if value is None:
        return None

    # Preferred form: the metre figure is the parenthesised conversion.
    match = re.search(r"\(([\d.,]+)\s*m\)", value)
    if match is None:
        # Metric-first entries keep the metre figure outside the
        # parentheses. The trailing \b stops "mph" or "min" from being
        # read as a metre unit.
        match = re.search(r"(\d[\d.,]*)\s*m\b", value)
    if match is None:
        return None

    try:
        return float(match.group(1).replace(',', ''))
    except ValueError:
        # Digit/punctuation runs that are not a number, e.g. "1.2.3".
        return None


# Both measurement columns go through the same parser, so one multi-column
# expression covers them; each column keeps its own name.
df = df.with_columns(
    pl.col("length_in_m", "height_in_m")
    .map_elements(extract_meter, return_dtype=pl.Float64)
)
df.head(1)

name,location,status,type_main,manufacturer,length_in_m,height_in_m,speed_in_kmh,duration_in_sec,capacity,g_force,opening_date,closing_year,latitude,longitude
str,str,cat,str,str,f64,f64,str,str,str,f64,str,str,f64,f64
"""Switchback Railway""","""Coney Island""","""closed""","""Wood""","""LaMarcus Adna Thompson""",180.0,15.0,"""9.7 km/h""","""1:00""","""1600 riders per hour""",2.9,"""1884-06-16""",,40.574,-73.978


### Extract km/h from the speed column


In [155]:
def extract_kmh(value: str) -> "float | None":
    """Return the speed in km/h described by a free-text speed string.

    The number is taken from directly in front of its unit, so stray digits
    or punctuation elsewhere in the text cannot be mistaken for the speed.
    A native km/h figure wins over an mph figure when both are present;
    an mph-only value is converted.

    Examples:
        "9.7 km/h"          -> 9.7
        "62 mph"            -> 62 * 1.60934
        "100 km/h (62 mph)" -> 100.0

    Args:
        value: Raw speed text, or None.

    Returns:
        Speed in km/h, or None when ``value`` is None or contains no
        number attached to a "km" or "mp" unit.
    """
    MPH_TO_KMH = 1.60934
    # Integer or decimal number, captured together with the unit after it.
    NUMBER = r"(\d+(?:\.\d*)?)"

    if value is None:
        return None

    # Prefer the figure already given in km/h: no conversion error.
    kmh = re.search(NUMBER + r"\s*km", value)
    if kmh:
        return float(kmh.group(1))

    # Otherwise convert the mph figure.
    mph = re.search(NUMBER + r"\s*mp", value)
    if mph:
        return float(mph.group(1)) * MPH_TO_KMH

    return None


# Overwrite the text speed column with its numeric km/h value.
df = df.with_columns(
    speed_in_kmh=pl.col("speed_in_kmh").map_elements(
        extract_kmh, return_dtype=pl.Float64
    )
)
df.head(1)

name,location,status,type_main,manufacturer,length_in_m,height_in_m,speed_in_kmh,duration_in_sec,capacity,g_force,opening_date,closing_year,latitude,longitude
str,str,cat,str,str,f64,f64,f64,str,str,f64,str,str,f64,f64
"""Switchback Railway""","""Coney Island""","""closed""","""Wood""","""LaMarcus Adna Thompson""",180.0,15.0,9.7,"""1:00""","""1600 riders per hour""",2.9,"""1884-06-16""",,40.574,-73.978
