In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Introduction to EDA**

## Step 0: Importing and reading Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
# Notebook-wide display settings: show up to 200 columns when a DataFrame is
# rendered, and draw all matplotlib figures with the 'ggplot' style sheet.
pd.options.display.max_columns = 200
plt.style.use('ggplot')

In [None]:
# Read the coaster table from the Kaggle dataset mount into `df`.
coaster_csv_path = '/kaggle/input/rollercoaster-database/coaster_db.csv'
df = pd.read_csv(coaster_csv_path)

## Step 1: Data understanding 
* Dataframe shape
* head and tail
* dtypes
* describe

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# First five rows, to see what the values look like.
df.head(5)

In [None]:
# All column labels of the loaded table.
df.columns

In [None]:
# Data type pandas inferred for each column.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) — by default pandas
# reports these for the numeric columns only.
df.describe()

## Step 2: Data Preparation
* Dropping irrelevant columns and rows
* Identifying duplicated rows
* Renaming columns
* Feature creation

In [None]:
# Example of dropping a column: drop() returns a new frame without
# 'Opening date'. The result is only displayed, not assigned, so `df` itself
# is left unchanged.
df.drop(columns=['Opening date'])

In [None]:
# Keep only the columns used in the rest of the analysis; every other column
# of the loaded table is left out. `.copy()` makes the subset an independent
# frame rather than a view of the original.
columns_to_keep = [
    'coaster_name',
    'Location',
    'Status',
    'Manufacturer',
    'year_introduced',
    'latitude',
    'longitude',
    'Type_Main',
    'opening_date_clean',
    'speed_mph',
    'height_ft',
    'Inversions_clean',
    'Gforce_clean',
]
df = df[columns_to_keep].copy()

In [None]:
# Shape again, to confirm the column count after the selection.
df.shape

In [None]:
# Parse the opening-date column into pandas datetimes and store it back.
parsed_opening_dates = pd.to_datetime(df['opening_date_clean'])
df['opening_date_clean'] = parsed_opening_dates

In [None]:
# Rename the kept columns to a consistent Capitalised_Snake style.
# (Run `df.columns` to list the current names.)
column_renames = {
    'coaster_name': 'Coaster_Name',
    'year_introduced': 'Year_Introduced',
    'opening_date_clean': 'Opening_Date',
    'speed_mph': 'Speed_mph',
    'height_ft': 'Height_ft',
    'Inversions_clean': 'Inversions',
    'Gforce_clean': 'Gforce',
}
df = df.rename(columns=column_renames)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Rows that repeat an earlier row in every column.
is_full_duplicate = df.duplicated()
df[is_full_duplicate]

In [None]:
# First five rows whose coaster name already appeared in an earlier row.
is_repeated_name = df.duplicated(subset=['Coaster_Name'])
df[is_repeated_name].head()

In [None]:
# Inspect one duplicated coaster name with DataFrame.query to see how its
# rows differ from each other.
example_filter = 'Coaster_Name=="Crystal Beach Cyclone"'
df.query(example_filter)

In [None]:
# Column labels after the rename, for reference when picking the duplicate keys.
df.columns

In [None]:
# Treat rows sharing the same name, location and opening date as the same
# coaster: keep the first occurrence of each combination and renumber the
# index from 0.
duplicate_keys = ['Coaster_Name', 'Location', 'Opening_Date']
df = df.drop_duplicates(subset=duplicate_keys).reset_index(drop=True)

## Step 3 : Feature Understanding
(univariate analysis)

* Plotting Feature Distributions
* Histograms
* KDE
* Boxplot

In [None]:
# Number of coasters per introduction year. Use it to answer: which years had
# the highest and the lowest number of roller coasters introduced?
df['Year_Introduced'].value_counts()

In [None]:
# Bar chart of the ten most common introduction years. The matplotlib Axes
# returned by the plot call is kept so the axis labels can be set on it.
top_years = df['Year_Introduced'].value_counts().head(10)
ax = top_years.plot.bar(title='Top 10 Years Coasters Introduced')

ax.set_xlabel('Year Introduced')
ax.set_ylabel('Count')

In [None]:
# Largest value in the Speed_mph column.
df['Speed_mph'].max()

In [None]:
# Histogram of the Speed_mph column using 10 bins.
speeds = df['Speed_mph']
ax = speeds.plot.hist(bins=10, title='Coaster speed (mph)')

ax.set_xlabel('Speed (mph)')

In [None]:
# Kernel density estimate (KDE) of the Speed_mph column.
speeds = df['Speed_mph']
ax = speeds.plot.kde(title='Coaster speed (mph)')

ax.set_xlabel('Speed (mph)')

In [None]:
# Pie chart of coaster status, restricted to status values that occur more
# than twice so that very rare labels do not clutter the chart.
status_count_df = df['Status'].value_counts()
frequent_statuses = status_count_df[status_count_df > 2]
frequent_statuses.plot.pie(title='Current Status of Rollercoasters')

In [None]:
# Bar chart of how many coasters fall under each Type_Main value.
type_counts = df['Type_Main'].value_counts()
type_counts.plot.bar(title='Main Type of construction')

In [None]:
# Histogram of the Height_ft column using 20 bins.
heights = df['Height_ft']
heights.plot.hist(bins=20, title='Height(ft) of Roller coasters')
plt.show()

## Step 4 : Feature Relationships
* Scatterplot
* Heatmap Correlation
* Pairplot
* Groupby comparisons

In [None]:
# Column labels, for reference when choosing which features to compare.
df.columns

In [None]:
# Scatter plot of speed (x) against height (y), one point per coaster row.
df.plot.scatter(
    x='Speed_mph',
    y='Height_ft',
    title='Coaster Speed vs Height',
)
plt.show()

In [None]:
# Show a single row as a reminder of the available columns and values.
df.head(1)

In [None]:
# Seaborn scatter of speed against height, with each point coloured by the
# year the coaster was introduced.
ax1 = sns.scatterplot(
    data=df,
    x='Speed_mph',
    y='Height_ft',
    hue='Year_Introduced',
)
ax1.set_title('Coaster Speed vs Height')
plt.show()

In [None]:
# Pair plot of the numeric features, coloured by Type_Main.
#
# Calling `pd.option_context(...)` on its own only builds a context manager;
# without a `with` block the option is never applied, so it has no effect on
# the plot. The 'mode.use_inf_as_na' option is also deprecated in recent
# pandas releases. Infinite values are therefore converted to NaN explicitly
# on the plotted columns, which works regardless of the pandas version.
pair_vars = ['Year_Introduced', 'Speed_mph', 'Height_ft', 'Inversions', 'Gforce']
pair_hue = 'Type_Main'

# Work on a copy limited to the plotted columns so `df` is not modified.
pair_data = df[pair_vars + [pair_hue]].copy()
pair_data[pair_vars] = pair_data[pair_vars].replace([np.inf, -np.inf], np.nan)

sns.pairplot(pair_data, vars=pair_vars, hue=pair_hue)
plt.show()

In [None]:
# Correlation matrix of the numeric features. dropna() first removes every
# row that is missing any of these five values, so the correlations are
# computed on complete rows only.
numeric_features = ['Year_Introduced', 'Speed_mph', 'Height_ft', 'Inversions', 'Gforce']
complete_rows = df[numeric_features].dropna()
df_corr = complete_rows.corr()
df_corr

In [None]:
# Heatmap of the correlation matrix with each coefficient written in its cell.
sns.heatmap(data=df_corr, annot=True)

## Step 5 : Ask questions about the Data
* Try to answer a question you have about the data using a plot or statistic

Which locations have the fastest roller coasters on average, considering only locations with at least 10 coasters?


In [None]:
# Number of rows for each Location value.
df['Location'].value_counts()


In [None]:
# Mean speed per location, ignoring the catch-all "Other" location.
# 'count' is the number of non-missing Speed_mph values at each location;
# only locations with at least 10 of them are kept, sorted by mean speed.
speed_by_location = (
    df.query('Location!="Other"')
    .groupby('Location')['Speed_mph']
    .agg(['mean', 'count'])
)
loc_stat = speed_by_location.query('count>=10').sort_values('mean')

# Horizontal bar chart of the mean speed for the remaining locations.
ax = loc_stat['mean'].plot.barh(
    figsize=(8, 5),
    title='Average Coaster speed by Location',
)
ax.set_xlabel('Average Coaster speed')
plt.show()