# EDA
Here, we'll conduct all of the EDA for this project.

## Next steps
1. Basic instructions visualizations
 - dist of each lines' duration

2. Breakdown by time of day for trip durations

In [295]:
import mysql.connector 
import config
from datetime import datetime
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import helpers
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [440]:
# Establish connection to MySQL db and connect to db `google_maps`
cnx = mysql.connector.connect(
    host = config.host,
    user = config.user,
    passwd = config.password
)
cursor = cnx.cursor()
cursor.execute("USE google_maps")

# Trip Duration

In [441]:
# Get all data after 2/18 and store as a df
cursor.execute("""SELECT * FROM trips WHERE DAY(departure_time) > 18;""")
df = pd.DataFrame(cursor.fetchall())
df.columns = [x[0] for x in cursor.description]
df.head()

Unnamed: 0,departure_time,trip_direction,trip_duration
0,2020/02/22 10:05,0,66
1,2020/02/22 10:05,1,55
2,2020/02/22 10:10,0,66
3,2020/02/22 10:10,1,58
4,2020/02/22 10:15,0,55


In [442]:
df.tail()

Unnamed: 0,departure_time,trip_direction,trip_duration
1599,2020/02/25 13:33,1,55
1600,2020/02/25 13:38,0,54
1601,2020/02/25 13:38,1,55
1602,2020/02/25 13:43,0,56
1603,2020/02/25 13:43,1,44


In [413]:
#inspect columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1192 entries, 0 to 1191
Data columns (total 7 columns):
departure_time         1192 non-null datetime64[ns]
trip_direction         1192 non-null int64
trip_duration          1192 non-null int64
time_of_day            1192 non-null object
day_of_week            1192 non-null int64
weekday                1192 non-null int64
trip_direction_text    1192 non-null object
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 65.3+ KB


In [443]:
# Use helpers.py to engineer several features and convert departure_time to datetime
trip_df, trips_dfs = helpers.create_features(df)

In [406]:
trip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132 entries, 0 to 1131
Data columns (total 7 columns):
departure_time         1132 non-null datetime64[ns]
trip_direction         1132 non-null int64
trip_duration          1132 non-null int64
time_of_day            1132 non-null object
day_of_week            1132 non-null int64
weekday                1132 non-null int64
trip_direction_text    1132 non-null object
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 62.0+ KB


In [394]:
trip_df.head()

Unnamed: 0,departure_time,trip_direction,trip_duration,time_of_day,day_of_week,weekday,trip_direction_text
0,2020-02-22 10:05:00,0,66,Morning,5,0,Trip to gf
1,2020-02-22 10:05:00,1,55,Morning,5,0,Trip to me
2,2020-02-22 10:10:00,0,66,Morning,5,0,Trip to gf
3,2020-02-22 10:10:00,1,58,Morning,5,0,Trip to me
4,2020-02-22 10:15:00,0,55,Morning,5,0,Trip to gf


## Visualizations

In [444]:
fig = go.Figure()

for trip in trips_dfs:
    
    fig.add_trace(go.Box(
        y = trip.trip_duration,
        name = trip.trip_direction_text.values[0],
#         boxpoints = 'all',
#         jitter = 0.3
    )
    )

fig.update_layout(
    title = 'Transit Trip Duration',
    yaxis_title = 'Minutes',
    template = 'plotly_white',
    showlegend = False
)
fig.show()

We notice a couple things:
1. Trips to gf are on average longer. The min and max are also longer!
2. Trip times are discrete values from 47 - 70 minutes.

In [445]:
fig = go.Figure()

for trip in trips_dfs:

    fig.add_trace(go.Scatter(
        x = trip.departure_time,
        y = trip.trip_duration,
        name = trip['trip_direction_text'].values[0],
#         mode = 'markers'
    ))
    
fig.update_layout(
    title = 'Transit Trip Duration',
    yaxis_title = 'Minutes',
    xaxis_title = 'Departure Time',
    template = 'plotly_white'
)

fig.show()

# Trip Instructions

In [446]:
# Get all data after 2/18 and store as a df
cursor.execute("""SELECT * FROM instructions WHERE DAY(departure_time) > 18;""")
instr_df = pd.DataFrame(cursor.fetchall())
instr_df.columns = [x[0] for x in cursor.description]
instr_df.head()

Unnamed: 0,departure_time,trip_direction,trip_step,transit_line,step_duration
0,2020/02/22 10:05,0,1,L,4
1,2020/02/22 10:05,0,2,G,9
2,2020/02/22 10:05,0,3,E,23
3,2020/02/22 10:05,1,1,R,7
4,2020/02/22 10:05,1,2,Q59,22


In [459]:
instr_df, instr_dfs = helpers.create_features(instr_df)

In [432]:
instr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3605 entries, 0 to 3604
Data columns (total 9 columns):
departure_time         3605 non-null datetime64[ns]
trip_direction         3605 non-null int64
trip_step              3605 non-null int64
transit_line           3605 non-null object
step_duration          3605 non-null int64
time_of_day            3605 non-null object
day_of_week            3605 non-null int64
weekday                3605 non-null int64
trip_direction_text    3605 non-null object
dtypes: datetime64[ns](1), int64(5), object(3)
memory usage: 253.6+ KB


In [433]:
instr_df.head()

Unnamed: 0,departure_time,trip_direction,trip_step,transit_line,step_duration,time_of_day,day_of_week,weekday,trip_direction_text
0,2020-02-22 10:05:00,0,1,L,4,Morning,5,0,Trip to gf
1,2020-02-22 10:05:00,0,2,G,9,Morning,5,0,Trip to gf
2,2020-02-22 10:05:00,0,3,E,23,Morning,5,0,Trip to gf
3,2020-02-22 10:05:00,1,1,R,7,Morning,5,0,Trip to me
4,2020-02-22 10:05:00,1,2,Q59,22,Morning,5,0,Trip to me


In [478]:
# The no. of steps for each departure time for to my gf's apartment
instr_to_gf_transfers = instr_dfs[0].groupby(by='departure_time')['trip_direction'].count() -1

# The no. of steps for each departure time for trips to my apartment
instr_to_me_transfers = instr_dfs[1].groupby(by='departure_time')['trip_direction'].count() -1

In [508]:
fig = go.Figure()

fig.add_trace(go.Histogram(
    x = instr_to_gf_transfers.values,
    name = 'Trips to me'
))

fig.add_trace(go.Histogram(
    x = instr_to_me_transfers.values,
    name = 'Trips to gf'
))

fig.update_layout(
    title = 'Number of Transfers per Trip',
    xaxis_title = 'No. of Transfers',
    template = 'plotly_white'
)

In [436]:
transit_lines = instr_df_to_gf.groupby(by='transit_line').count().sort_values(by='trip_direction',ascending=False).iloc[:,0]

In [437]:
fig = go.Figure()

fig.add_trace(go.Bar(
    x=transit_lines.index,
    y=transit_lines.values
))

fig.update_layout(
    title = 'Transit Line Recommendation Frequency',
    yaxis_title = 'No. of Times Recommended to Take',
    xaxis_title = 'Transit Line (Subway or Bus)',
    template = 'plotly_white'
)

fig.show()

In [488]:
trips_dfs[0].weekday.value_counts()

0    417
1    385
Name: weekday, dtype: int64

In [491]:
morning_df = instr_dfs[0][instr_dfs[0]['time_of_day'] == 'Morning']
afternoon_df = trips_dfs[0][trips_dfs[0]['time_of_day'] == 'Afternoon']
evening_df = trips_dfs[0][trips_dfs[0]['time_of_day'] == 'Afternoon']
early_morn_df = afternoon_df = trips_dfs[0][trips_dfs[0]['time_of_day'] == 'Early Morning']

weekday_df = trips_dfs[0][trips_dfs[0]['weekday'] == 1]
weekend_df = trips_dfs[0][trips_dfs[0]['weekday'] == 0]

In [492]:
morning_df

Unnamed: 0,departure_time,trip_direction,trip_step,transit_line,step_duration,time_of_day,day_of_week,weekday,trip_direction_text
0,2020-02-22 10:05:00,0,1,L,4,Morning,5,0,Trip to gf
1,2020-02-22 10:05:00,0,2,G,9,Morning,5,0,Trip to gf
2,2020-02-22 10:05:00,0,3,E,23,Morning,5,0,Trip to gf
5,2020-02-22 10:10:00,0,1,L,4,Morning,5,0,Trip to gf
6,2020-02-22 10:10:00,0,2,G,9,Morning,5,0,Trip to gf
...,...,...,...,...,...,...,...,...,...
3845,2020-02-25 11:49:00,0,1,L,14,Morning,1,1,Trip to gf
3846,2020-02-25 11:49:00,0,2,E,29,Morning,1,1,Trip to gf
3849,2020-02-25 11:55:00,0,1,L,4,Morning,1,1,Trip to gf
3850,2020-02-25 11:55:00,0,2,G,8,Morning,1,1,Trip to gf


In [439]:
cursor.close()
cnx.close()