# Step 0: Imports and Reading Data


In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings('ignore')

# Step 1: Data Understanding

In [28]:
df = pd.read_csv('./coaster_db.csv')


In [29]:
df.shape

(1087, 56)

In [30]:
df.head()


Unnamed: 0,coaster_name,Length,Speed,Location,Status,Opening date,Type,Manufacturer,Height restriction,Model,...,speed1,speed2,speed1_value,speed1_unit,speed_mph,height_value,height_unit,height_ft,Inversions_clean,Gforce_clean
0,Switchback Railway,600 ft (180 m),6 mph (9.7 km/h),Coney Island,Removed,"June 16, 1884",Wood,LaMarcus Adna Thompson,,Lift Packed,...,6 mph,9.7 km/h,6.0,mph,6.0,50.0,ft,,0,2.9
1,Flip Flap Railway,,,Sea Lion Park,Removed,1895,Wood,Lina Beecher,,,...,,,,,,,,,1,12.0
2,Switchback Railway (Euclid Beach Park),,,"Cleveland, Ohio, United States",Closed,,Other,,,,...,,,,,,,,,0,
3,Loop the Loop (Coney Island),,,Other,Removed,1901,Steel,Edwin Prescott,,,...,,,,,,,,,1,
4,Loop the Loop (Young's Pier),,,Other,Removed,1901,Steel,Edwin Prescott,,,...,,,,,,,,,1,


In [33]:
df.columns

Index(['coaster_name', 'Length', 'Speed', 'Location', 'Status', 'Opening date',
       'Type', 'Manufacturer', 'Height restriction', 'Model', 'Height',
       'Inversions', 'Lift/launch system', 'Cost', 'Trains', 'Park section',
       'Duration', 'Capacity', 'G-force', 'Designer', 'Max vertical angle',
       'Drop', 'Soft opening date', 'Fast Lane available', 'Replaced',
       'Track layout', 'Fastrack available', 'Soft opening date.1',
       'Closing date', 'Opened', 'Replaced by', 'Website',
       'Flash Pass Available', 'Must transfer from wheelchair', 'Theme',
       'Single rider line available', 'Restraint Style',
       'Flash Pass available', 'Acceleration', 'Restraints', 'Name',
       'year_introduced', 'latitude', 'longitude', 'Type_Main',
       'opening_date_clean', 'speed1', 'speed2', 'speed1_value', 'speed1_unit',
       'speed_mph', 'height_value', 'height_unit', 'height_ft',
       'Inversions_clean', 'Gforce_clean'],
      dtype='object')

In [34]:
df.describe()

Unnamed: 0,Inversions,year_introduced,latitude,longitude,speed1_value,speed_mph,height_value,height_ft,Inversions_clean,Gforce_clean
count,932.0,1087.0,812.0,812.0,937.0,937.0,965.0,171.0,1087.0,362.0
mean,1.54721,1994.986201,38.373484,-41.595373,53.850374,48.617289,89.575171,101.996491,1.326587,3.824006
std,2.114073,23.475248,15.516596,72.285227,23.385518,16.678031,136.246444,67.329092,2.030854,0.989998
min,0.0,1884.0,-48.2617,-123.0357,5.0,5.0,4.0,13.1,0.0,0.8
25%,0.0,1989.0,35.03105,-84.5522,40.0,37.3,44.0,51.8,0.0,3.4
50%,0.0,2000.0,40.2898,-76.6536,50.0,49.7,79.0,91.2,0.0,4.0
75%,3.0,2010.0,44.7996,2.7781,63.0,58.0,113.0,131.2,2.0,4.5
max,14.0,2022.0,63.2309,153.4265,240.0,149.1,3937.0,377.3,14.0,12.0


# Step 2: Data Prepartion

In [38]:
df= df[['coaster_name', 
    #             'Length', 'Speed', 
                'Location',
               #   'Status', 'Opening date',
    #    'Type'
        'Manufacturer', 
    #    'Height restriction', 'Model', 'Height',
    #    'Inversions', 'Lift/launch system', 'Cost', 'Trains', 'Park section',
    #    'Duration', 'Capacity', 'G-force', 'Designer', 'Max vertical angle',
    #    'Drop', 'Soft opening date', 'Fast Lane available', 'Replaced',
    #    'Track layout', 'Fastrack available', 'Soft opening date.1',
    #    'Closing date'
    #    , 'Opened',
    #     #  'Replaced by', 'Website',
    #    'Flash Pass Available', 'Must transfer from wheelchair', 'Theme',
    #    'Single rider line available', 'Restraint Style',
    #    'Flash Pass available', 'Acceleration', 'Restraints', 'Name',\

       'year_introduced', 'latitude', 'longitude', 'Type_Main',
       'opening_date_clean', 
    #    'speed1', 'speed2', 'speed1_value', 'speed1_unit',
       'speed_mph',
        #  'height_value', 'height_unit',
           'height_ft',
       'Inversions_clean', 'Gforce_clean']].copy()

In [40]:
df.head()
df.shape


(1087, 12)

In [None]:
df['opening_date_clean'] = pd.to_datetime(df['opening_date_clean'])

In [46]:
# Rename our columns
df.rename(columns={'coaster_name' : 'Coaster_Name','year_introduced':'Year_Introduced',
                   'opening_date_clean':'Opening_Date',
                   'speed_mph':'Speed_mph',
                   'height_ft':'Height_ft',
                   'Inversions_clean':'Inversions',
                   'Gforce_clean':'Gforce'})

Unnamed: 0,Coaster_Name,Location,Manufacturer,Year_Introduced,latitude,longitude,Type_Main,Opening_Date,Speed_mph,Height_ft,Inversions,Gforce
0,Switchback Railway,Coney Island,LaMarcus Adna Thompson,1884,40.5740,-73.9780,Wood,1884-06-16,6.0,,0,2.9
1,Flip Flap Railway,Sea Lion Park,Lina Beecher,1895,40.5780,-73.9790,Wood,1895-01-01,,,1,12.0
2,Switchback Railway (Euclid Beach Park),"Cleveland, Ohio, United States",,1896,41.5800,-81.5700,Other,NaT,,,0,
3,Loop the Loop (Coney Island),Other,Edwin Prescott,1901,40.5745,-73.9780,Steel,1901-01-01,,,1,
4,Loop the Loop (Young's Pier),Other,Edwin Prescott,1901,39.3538,-74.4342,Steel,1901-01-01,,,1,
...,...,...,...,...,...,...,...,...,...,...,...,...
1082,American Dreier Looping,Other,Anton Schwarzkopf,2022,,,Steel,NaT,53.0,,3,4.7
1083,Pantheon (roller coaster),Busch Gardens Williamsburg,Intamin,2022,37.2339,-76.6426,Steel,2022-01-01,73.0,,2,
1084,Tron Lightcycle Power Run,Other,Vekoma,2022,,,Steel,2016-06-16,59.3,,0,4.0
1085,Tumbili,Kings Dominion,S&S – Sansei Technologies,2022,,,Steel,NaT,34.0,,0,


In [48]:
df.isna().sum()


coaster_name            0
Location                0
Manufacturer           59
year_introduced         0
latitude              275
longitude             275
Type_Main               0
opening_date_clean    250
speed_mph             150
height_ft             916
Inversions_clean        0
Gforce_clean          725
dtype: int64

In [49]:
df = df.loc[~df.duplicated(subset=['Coaster_Name','Location','Opening_Date'])] \
    .reset_index(drop=True).copy()

KeyError: Index(['Opening_Date', 'Coaster_Name'], dtype='object')