# EDA
Here, we'll conduct all of the EDA for this project.

## Next steps
1. Basic instructions visualizations
 - most common lines
 - dist of each lines' duration
 - compare number of steps 
 - breakdown by time of day

2. Breakdown by time of day for trip durations

In [295]:
import mysql.connector 
import config
from datetime import datetime
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import helpers
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [209]:
# Establish connection to MySQL db and connect to db `google_maps`
cnx = mysql.connector.connect(
    host = config.host,
    user = config.user,
    passwd = config.password
)
cursor = cnx.cursor()
cursor.execute("USE google_maps")

# Trip Duration

In [296]:
# Get all data after 2/18 and store as a df
cursor.execute("""SELECT * FROM trips WHERE DAY(departure_time) > 18;""")
df = pd.DataFrame(cursor.fetchall())
df.columns = [x[0] for x in cursor.description]
df.head()

Unnamed: 0,departure_time,trip_direction,trip_duration
0,2020/02/22 10:05,0,66
1,2020/02/22 10:05,1,55
2,2020/02/22 10:10,0,66
3,2020/02/22 10:10,1,58
4,2020/02/22 10:15,0,55


In [211]:
#inspect columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
departure_time    500 non-null object
trip_direction    500 non-null int64
trip_duration     500 non-null int64
dtypes: int64(2), object(1)
memory usage: 11.8+ KB


In [307]:
# Use helpers.py to engineer several features and convert departure_time to datetime
trip_df, trips_dfs = helpers.create_features(df)

In [308]:
trip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
departure_time         500 non-null datetime64[ns]
trip_direction         500 non-null int64
trip_duration          500 non-null int64
time_of_day            500 non-null object
day_of_week            500 non-null int64
weekday                500 non-null int64
trip_direction_text    500 non-null object
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 27.5+ KB


In [309]:
trip_df.head()

Unnamed: 0,departure_time,trip_direction,trip_duration,time_of_day,day_of_week,weekday,trip_direction_text
0,2020-02-22 10:05:00,0,66,Morning,5,0,Trip to gf
1,2020-02-22 10:05:00,1,55,Morning,5,0,Trip to me
2,2020-02-22 10:10:00,0,66,Morning,5,0,Trip to gf
3,2020-02-22 10:10:00,1,58,Morning,5,0,Trip to me
4,2020-02-22 10:15:00,0,55,Morning,5,0,Trip to gf


## Visualizations

In [305]:
fig = go.Figure()

for trip in trips_dfs:
    
    fig.add_trace(go.Box(
        y = trip.trip_duration,
        name = trip.trip_direction_text.values[0],
        boxpoints = 'all',
        jitter = 0.3
    )
    )

fig.update_layout(
    title = 'Transit Trip Duration',
    yaxis_title = 'Minutes',
    template = 'plotly_white',
    showlegend = False
)
fig.show()

We notice a couple things:
1. Trips to gf are on average longer. The min and max are also longer!
2. Trip times are discrete values from 47 - 70 minutes.

In [306]:
fig = go.Figure()

for trip in trips_dfs:

    fig.add_trace(go.Scatter(
        x = trip.departure_time,
        y = trip.trip_duration,
        name = trip['trip_direction_text'].values[0],
#         mode = 'markers'
    ))
    
fig.update_layout(
    title = 'Transit Trip Duration',
    yaxis_title = 'Minutes',
    xaxis_title = 'Departure Time',
    template = 'plotly_white'
)

fig.show()

# Trip Instructions

In [310]:
# Get all data after 2/18 and store as a df
cursor.execute("""SELECT * FROM instructions WHERE DAY(departure_time) > 18;""")
instr_df = pd.DataFrame(cursor.fetchall())
instr_df.columns = [x[0] for x in cursor.description]
instr_df.head()

Unnamed: 0,departure_time,trip_direction,trip_step,transit_line,step_duration
0,2020/02/22 10:05,0,1,L,4
1,2020/02/22 10:05,0,2,G,9
2,2020/02/22 10:05,0,3,E,23
3,2020/02/22 10:05,1,1,R,7
4,2020/02/22 10:05,1,2,Q59,22


In [311]:
instr_df, instr_dfs = helpers.create_features(instr_df)

In [312]:
instr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1150 entries, 0 to 1149
Data columns (total 9 columns):
departure_time         1150 non-null datetime64[ns]
trip_direction         1150 non-null int64
trip_step              1150 non-null int64
transit_line           1150 non-null object
step_duration          1150 non-null int64
time_of_day            1150 non-null object
day_of_week            1150 non-null int64
weekday                1150 non-null int64
trip_direction_text    1150 non-null object
dtypes: datetime64[ns](1), int64(5), object(3)
memory usage: 81.0+ KB


In [313]:
instr_df.head()

Unnamed: 0,departure_time,trip_direction,trip_step,transit_line,step_duration,time_of_day,day_of_week,weekday,trip_direction_text
0,2020-02-22 10:05:00,0,1,L,4,Morning,5,0,Trip to gf
1,2020-02-22 10:05:00,0,2,G,9,Morning,5,0,Trip to gf
2,2020-02-22 10:05:00,0,3,E,23,Morning,5,0,Trip to gf
3,2020-02-22 10:05:00,1,1,R,7,Morning,5,0,Trip to me
4,2020-02-22 10:05:00,1,2,Q59,22,Morning,5,0,Trip to me


In [269]:
# The no. of steps for each departure time
instr_df_to_gf.groupby(by='departure_time').count()

Unnamed: 0_level_0,trip_direction,trip_step,transit_line,step_duration,trip_direction_text,time_of_day
departure_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-02-22 10:05:00,3,3,3,3,3,3
2020-02-22 10:10:00,3,3,3,3,3,3
2020-02-22 10:15:00,2,2,2,2,2,2
2020-02-22 10:20:00,3,3,3,3,3,3
2020-02-22 10:25:00,3,3,3,3,3,3
...,...,...,...,...,...,...
2020-02-23 09:20:00,3,3,3,3,0,3
2020-02-23 09:25:00,3,3,3,3,0,3
2020-02-23 09:30:00,3,3,3,3,0,3
2020-02-23 09:35:00,3,3,3,3,0,3


In [208]:
cursor.close()
cnx.close()