# 2.5A Data Wrangling for Dashboard

In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import os
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from datetime import datetime as dt
from streamlit_keplergl import keplergl_static

In [2]:
# Folder that holds the processed data files. The default is the original
# local path; set the CITIBIKE_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
folderpath = os.environ.get(
    'CITIBIKE_DATA_DIR',
    r'/Users/matthewjones/Documents/CareerFoundry/Data Visualization with Python/Achievement 2/NY-CitiBike/2. Data/Processed Data',
)

# NOTE(review): unpickling can execute code embedded in the file, so only
# load pickles that were produced by this project.
df = pd.read_pickle(os.path.join(folderpath, 'cleaned_nyc_bike_weather_data.pkl'))

In [3]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,end_lng,member_casual,date,avgTemp,trip_duration,month,season,value,bike_rides_daily,trip_duration_outliers
0,D09109AEB47FEEA9,Classic Bike,2022-01-01 16:31:02.241,2022-02-03 20:27:00.358,Picnic Point,4374.01,E 16 St & 5 Ave,6022.04,40.685126,-74.025353,...,-73.99239,Casual,2022-01-01,11.6,14158,1,Winter,1,20428,True
1,ECDFB9C97FAF0F35,Classic Bike,2022-01-01 13:38:37.927,2022-01-01 13:45:58.200,W 13 St & 7 Ave,6030.04,Great Jones St,5636.11,40.737815,-73.999947,...,-73.99379,Casual,2022-01-01,11.6,440,1,Winter,1,20428,False
2,22F675B5C5A666FD,Classic Bike,2022-01-01 00:18:07.680,2022-01-01 01:19:41.047,W 42 St & Dyer Ave,6644.07,West St & Chambers St,5329.03,40.758985,-73.9938,...,-74.013221,Casual,2022-01-01,11.6,3693,1,Winter,1,20428,True
3,39DBCC00E8BCCFCF,Classic Bike,2022-01-01 22:40:32.125,2022-01-01 22:46:53.380,8 Ave & W 27 St,6297.07,E 31 St & 3 Ave,6239.08,40.747968,-73.996637,...,-73.979661,Member,2022-01-01,11.6,381,1,Winter,1,20428,False
4,FB8B1C38D9B41889,Electric Bike,2022-01-01 14:46:57.116,2022-01-01 14:53:41.673,University Pl & E 14 St,5905.14,E 31 St & 3 Ave,6239.08,40.734814,-73.992085,...,-73.979661,Member,2022-01-01,11.6,404,1,Winter,1,20428,False


In [6]:
# Columns removed to slim the frame before the df1-based wrangling.
columns_to_drop = [
    'ride_id',
    'started_at',
    'ended_at',
    'start_station_id',
    'end_station_id',
    'start_lat',
    'start_lng',
    'end_lat',
    'end_lng',
    'member_casual',
    'month',
    'trip_duration_outliers',
]
df1 = df.drop(columns = columns_to_drop)

In [7]:
# Preview the slimmed frame to confirm which columns remain.
df1.head()

Unnamed: 0,rideable_type,start_station_name,end_station_name,date,avgTemp,trip_duration,season,value,bike_rides_daily
0,Classic Bike,Picnic Point,E 16 St & 5 Ave,2022-01-01,11.6,14158,Winter,1,20428
1,Classic Bike,W 13 St & 7 Ave,Great Jones St,2022-01-01,11.6,440,Winter,1,20428
2,Classic Bike,W 42 St & Dyer Ave,West St & Chambers St,2022-01-01,11.6,3693,Winter,1,20428
3,Classic Bike,8 Ave & W 27 St,E 31 St & 3 Ave,2022-01-01,11.6,381,Winter,1,20428
4,Electric Bike,University Pl & E 14 St,E 31 St & 3 Ave,2022-01-01,11.6,404,Winter,1,20428


In [8]:
# (rows, columns) of the slimmed frame.
df1.shape

(29838166, 9)

### DATA WRANGLING

In [15]:
def _first_mode(values):
    """Return the first mode of *values* (modes come back sorted), or NaN if there is none."""
    modes = values.mode()
    return modes.iat[0] if not modes.empty else np.nan


# One temperature per date. avgTemp appears to be constant within a date (see
# the preview rows), so its mode is that value. Series.mode returns every
# tied value, which would put an array in the cell, so keep only the first.
df_temp = df1[['date', 'avgTemp']]
df_temp = df_temp.groupby('date', as_index = False).agg(_first_mode)

In [40]:
# Total rides per day (each row of df1 carries value == 1 in the preview).
# The 'value' column is selected explicitly: the first positional parameter
# of GroupBy.sum is numeric_only, so the earlier .sum('value') did not pick a
# column, it just passed a truthy string as numeric_only.
df_daily_rides = df1.groupby('date', as_index = False)['value'].sum()
df_daily_rides = df_daily_rides.rename(columns = {'value': 'Daily Rides'})

In [39]:
# Rides per day and bike type, then keep only the Classic Bike rows.
# 'value' is selected explicitly because GroupBy.sum's first positional
# parameter is numeric_only, not a column name.
df_classic_daily_rides = df1.groupby(['date', 'rideable_type'], as_index = False)['value'].sum()
df_classic_daily_rides = df_classic_daily_rides.loc[df_classic_daily_rides['rideable_type'] == 'Classic Bike']
df_classic_daily_rides = df_classic_daily_rides.rename(columns = {'value': 'Daily Classic Rides'})

In [33]:
# Start the line-chart table: daily temperature joined with total daily rides.
line_chart_data = pd.merge(df_temp, df_daily_rides, how = 'left', on = 'date')

In [37]:
# Add the Classic Bike daily counts; only the join key and the count column
# are taken from the classic-rides frame.
classic_counts = df_classic_daily_rides[['date', 'Daily Classic Rides']]
line_chart_data = pd.merge(line_chart_data, classic_counts, how = 'left', on = 'date')

In [42]:
# Presentation-friendly column names for the dashboard.
display_names = {'date' : 'Date', 'avgTemp' : 'Average Temperature'}
line_chart_data = line_chart_data.rename(columns = display_names)

In [43]:
# Check the assembled line-chart table.
line_chart_data.head()

Unnamed: 0,Date,Average Temperature,Daily Rides,Daily Classic Rides
0,2022-01-01,11.6,20428,11399
1,2022-01-02,11.4,43009,26857
2,2022-01-03,1.4,33189,19583
3,2022-01-04,-2.7,36842,22476
4,2022-01-05,3.2,34230,20676


In [44]:
# Columns used to build the station bar charts.
bar_columns = ['start_station_name', 'end_station_name', 'rideable_type', 'season', 'value']
df_bar = df1[bar_columns]

In [49]:
# Ride totals per start station, then the 20 busiest stations.
# 'value' is selected explicitly: GroupBy.sum's first positional parameter is
# numeric_only, so .sum('value') only worked because the truthy string caused
# the text columns to be dropped.
df_bar_start = df_bar.groupby('start_station_name', as_index=False)['value'].sum()
df_top_start = df_bar_start.nlargest(20, 'value')
top_start_list = df_top_start['start_station_name'].tolist()

In [53]:
# Restrict to rides that begin at a top-20 start station, then total them per
# station, bike type and season. 'value' is selected explicitly because
# GroupBy.sum's first positional parameter is numeric_only, not a column name.
top_start_data = df_bar.loc[df_bar['start_station_name'].isin(top_start_list)]
grouped_start = top_start_data.groupby(['start_station_name', 'rideable_type', 'season'],
                                       as_index=False)['value'].sum()
grouped_start = grouped_start.rename(columns = {'value' : 'Total'})

In [55]:
# Ride totals per end station, then the 20 busiest stations.
# 'value' is selected explicitly: GroupBy.sum's first positional parameter is
# numeric_only, so .sum('value') only worked because the truthy string caused
# the text columns to be dropped.
df_bar_end = df_bar.groupby('end_station_name', as_index=False)['value'].sum()
df_top_end = df_bar_end.nlargest(20, 'value')
top_end_list = df_top_end['end_station_name'].tolist()

In [62]:
# Restrict to rides that finish at a top-20 end station, then total them per
# station, bike type and season. 'value' is selected explicitly because
# GroupBy.sum's first positional parameter is numeric_only, not a column name.
top_end_data = df_bar.loc[df_bar['end_station_name'].isin(top_end_list)]
grouped_end = top_end_data.groupby(['end_station_name', 'rideable_type', 'season'],
                                   as_index=False)['value'].sum()
grouped_end = grouped_end.rename(columns = {'value' : 'Total'})

In [65]:
# Attach each station's overall ride total (the 'value' column of the top-20
# frames) to its per-type, per-season rows.
# NOTE: re-running this cell without rebuilding grouped_start / grouped_end
# merges 'value' in a second time.
start_totals = df_top_start[['start_station_name', 'value']]
end_totals = df_top_end[['end_station_name', 'value']]
grouped_start = pd.merge(grouped_start, start_totals, how='left', on='start_station_name')
grouped_end = pd.merge(grouped_end, end_totals, how='left', on='end_station_name')

In [67]:
# The merged-in 'value' column is the station-wide total; label it as such in
# both frames.
grand_total_label = {'value' : 'Grand Total'}
grouped_start = grouped_start.rename(columns = grand_total_label)
grouped_end = grouped_end.rename(columns = grand_total_label)

In [68]:
# Check the start-station bar-chart table.
grouped_start.head()

Unnamed: 0,start_station_name,rideable_type,season,Total,Grand Total
0,1 Ave & E 68 St,Classic Bike,Fall,14736,104856
1,1 Ave & E 68 St,Classic Bike,Spring,15016,104856
2,1 Ave & E 68 St,Classic Bike,Summer,19145,104856
3,1 Ave & E 68 St,Classic Bike,Winter,7940,104856
4,1 Ave & E 68 St,Electric Bike,Fall,14289,104856


In [69]:
# Check the end-station bar-chart table.
grouped_end.head()

Unnamed: 0,end_station_name,rideable_type,season,Total,Grand Total
0,1 Ave & E 68 St,Classic Bike,Fall,14813,105121
1,1 Ave & E 68 St,Classic Bike,Spring,15068,105121
2,1 Ave & E 68 St,Classic Bike,Summer,19085,105121
3,1 Ave & E 68 St,Classic Bike,Winter,7786,105121
4,1 Ave & E 68 St,Electric Bike,Fall,14425,105121


In [75]:
# Taken from the full frame (df): member_casual was dropped from df1.
payment_columns = ['member_casual', 'value']
df_payment = df[payment_columns]

In [76]:
# Ride totals per rider type, indexed by member_casual. 'value' is selected
# explicitly (as a one-column frame, so the result stays a DataFrame) because
# GroupBy.sum's first positional parameter is numeric_only, not a column name.
df_payment_pie = df_payment.groupby('member_casual')[['value']].sum()

In [77]:
# Display the rider-type totals.
df_payment_pie

Unnamed: 0_level_0,value
member_casual,Unnamed: 1_level_1
Casual,6580665
Member,23257501


In [83]:
# Taken from the full frame (df): trip_duration_outliers was dropped from df1.
duration_columns = ['trip_duration', 'value', 'season', 'trip_duration_outliers']
df_duration = df[duration_columns]

In [84]:
# Keep only trips that are not flagged as duration outliers.
# Assumes trip_duration_outliers is a boolean column (the preview shows
# True/False). The earlier `~col == True` parsed as `(~col) == True`; the
# comparison was redundant and would silently give an empty frame for a
# non-boolean dtype.
df_duration_clean = df_duration.loc[~df_duration['trip_duration_outliers']]

In [85]:
# Confirm that only non-outlier rows remain.
df_duration_clean.head()

Unnamed: 0,trip_duration,value,season,trip_duration_outliers
1,440,1,Winter,False
3,381,1,Winter,False
4,404,1,Winter,False
5,1786,1,Winter,False
6,1758,1,Winter,False


In [105]:
# Seed NumPy's global generator so the random mask below is reproducible.
np.random.seed(32)
# One uniform draw in [0, 1) per row; True where the draw is at most 0.97.
row_count = len(df_duration_clean)
red = np.random.rand(row_count) <= 0.97

In [106]:
# Keep the rows where the mask is False, i.e. the small remaining sample.
keep_mask = np.logical_not(red)
small = df_duration_clean[keep_mask]

In [107]:
# (rows, columns) of the down-sampled duration frame.
small.shape

(849848, 4)

### EXPORT DATA

In [70]:
# Write the line-chart table to the data folder; index=True also writes the
# row index as the first CSV column.
line_chart_path = os.path.join(folderpath, 'DB_line_chart_data.csv')
line_chart_data.to_csv(line_chart_path, index=True)

In [71]:
# Write the start-station bar-chart table; index=True also writes the row
# index as the first CSV column.
bar_start_path = os.path.join(folderpath, 'DB_bar_chart_start.csv')
grouped_start.to_csv(bar_start_path, index=True)

In [72]:
# Write the end-station bar-chart table; index=True also writes the row
# index as the first CSV column.
bar_end_path = os.path.join(folderpath, 'DB_bar_chart_end.csv')
grouped_end.to_csv(bar_end_path, index=True)

In [86]:
# Export the member/casual ride totals for the pie chart. The frame is named
# df_payment_pie; df_payment_hist is not defined anywhere in the visible
# notebook and raises NameError on a fresh kernel. index=True is needed here
# because member_casual is the index of this frame.
df_payment_pie.to_csv(os.path.join(folderpath, 'DB_pie_payment.csv'), index=True)

In [108]:
# Write the down-sampled duration data for the histogram; index=True also
# writes the row labels as the first CSV column.
hist_duration_path = os.path.join(folderpath, 'DB_hist_duration.csv')
small.to_csv(hist_duration_path, index=True)