In [None]:
from IPython.display import HTML

# Inject notebook-wide CSS overrides: widen the notebook container to 80%,
# stretch the DataTables elements (table, scroll head, scroll body) to full
# width, float the DataTables filter box to the right, and render text at 125%.
# Being the last expression of the cell, the HTML object is what gets displayed.
HTML("""
<style>
    #notebook-container {
    width: 80% !important;
}
.dataTable {
    width: 100% !important;
}
.dataTables_scrollBody {
    width: 100% !important;
}
.dataTables_scrollHead {
    width: 100% !important;
}
.dataTables_filter {
   float: right !important;
}
.output_html {
    max-width: calc(100%) !important;
}
.rendered{
    font-size: 125%;
}
</style>
""")

In [None]:
# Banner image for the top of the report. The file is fetched from a remote
# URL, so it only renders when the viewer has network access.
# NOTE(review): auto left/right margins centre an image only when it is laid
# out as a block element - confirm the banner actually appears centred.
html1 = """
    <img src="https://bartable.bart.gov/sites/default/files/styles/body_width_/public/bike%20the%20bridge.jpg" style="margin-left:auto; margin-right:auto"/>
"""
HTML(html1)

# GoBike Data Analysis

## Goals
In this analysis, I aim to uncover any patterns, trends, or insights from a spreadsheet containing 519,000+ rows of data pertaining to bike rentals. The data contains basic information like:
* Start & End times
* Start & End locations & IDs
* Bike IDs
* User information - gender, birth year, customer type

## Some Questions Worth Answering
* Customer demographics
    * What age group uses bikes more often?
    * Which gender rents more bikes?
* Ride statistics
    * Average ride length?
    * When were the most popular times to rent a bike?
* Location demographics
    * Do any stations see more traffic?
    * What are the least used stations?

In [None]:
#IMPORT TOOLS & LIBRARIES
import pandas as pd
import numpy as np
import seaborn as sns
import os
import itables
from plotly.subplots import make_subplots

import func

# Data Exploration

In [None]:
# Read the ride data in 100,000-row chunks and combine them with ONE concat.
# Calling pd.concat inside the loop re-copies every previously loaded row on
# each iteration; handing the chunk iterator straight to pd.concat copies each
# chunk exactly once and yields the same frame (same rows, same index).
df = pd.concat(pd.read_csv('goBike.csv', chunksize=100000))

# Preview the first rows.
df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Summary statistics (count, mean, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Number of rides recorded for each reported gender.
df.groupby('member_gender').size()

### Observations
* **Duration**: The average bike ride was ~1100 seconds, or just over 18 minutes
* **Birth Year**: The average birth year is 1980, and 75% of customers were born before 1988
    * Outliers: The oldest birth year is reported as 1886 - user error?
* **Gender**: There are about 3.5x as many males as there are females/others

### Checking for Missing Data
As we can see below, two columns are missing ~12.8% of their values. For simplicity's sake, I removed the rows with missing values.

In [None]:
# Parse both timestamp columns into pandas datetimes so the .dt accessor
# can be used on them in later cells.
for time_col in ('start_time', 'end_time'):
    df[time_col] = pd.to_datetime(df[time_col])

# Report on missing values using the helper from the local `func` module.
func.calc_missing(df)

In [None]:
# Drop every row that contains at least one NaN. The frame is modified in
# place, so all later cells only see complete rows.
df.dropna(inplace=True)

## Customer Demographics
### Single-Time Users vs. GoBike Subscribers

In [None]:
# Ride counts per user type, pivoted so that each gender becomes a column.
# The positional reorder swaps the first two gender columns.
# NOTE(review): presumably the gender values sort as Female, Male, Other, which
# would make the final order user_type, Male, Female, Other - confirm.
dfu = (
    df.groupby(['user_type', 'member_gender'])
      .size()
      .unstack()
      .reset_index()
      .iloc[:, [0, 2, 1, 3]]
)

title = "Customer vs. Subscriber Gender Distribution"
sub = "There are far more male subscribers than any other group"
func.plot_gender_dist(dfu, title, sub)

### Age & Gender Distribution

In [None]:
# Rider age at the time of the ride: ride year minus reported birth year.
df['member_age'] = df['start_time'].dt.year - df['member_birth_year']

# Remove implausible ages (e.g. birth years in the 1800s). The explicit copy
# makes the filtered frame independent of the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
df = df[df['member_age'] <= 100].copy()

# right=False makes every bin closed on the left and open on the right
# ([18, 25), [25, 35), ...), which is what the labels describe. With the
# default right=True an 18-year-old falls outside every bin and a 25-year-old
# is labelled "18-24".
df["age_group"] = pd.cut(x=df['member_age'],
                         bins=[18,25,35,45,55,65,75,85,95,130],
                         right=False,
                         labels=["18-24","25-34","35-44","45-54","55-64","65-74","75-84","85-94","95+"])

# Only gender and age group are needed for the distribution plot.
dfa = df[['member_gender', 'age_group']]

title = "User Age & Gender Distribution"
sub = "Males between the ages of 25-44 make up the overwhelming majority of users"
func.plot_gender_age(dfa, title, sub)

### Age & Gender vs. Ride Duration

In [None]:
# Re-applying the age filter is a no-op once the previous cell has run; the
# explicit copy guarantees that the column assignment below writes to an
# independent frame rather than a slice (avoids SettingWithCopyWarning).
df = df[df['member_age'] <= 100].copy()

# Ride length in whole minutes: floor division discards the leftover seconds.
df['duration_minute'] = df['duration_sec'] // 60

# Mean ride length and number of rides for every (gender, age) pair.
# The reducers are given by name: passing NumPy callables such as np.mean to
# agg() raises a FutureWarning in recent pandas releases, and the string forms
# compute the same values.
dfr = (df.groupby(['member_gender', 'member_age'])
       .agg(avg_duration=('duration_minute', 'mean'),
            num_riders=('member_age', 'size'))
       .reset_index())

title = "Age & Gender vs. Ride Duration"
sub = "Women tend to ride bikes longer than men across most age groups"
func.plot_gender_dur(dfr, title, sub)

### Observations
* Men between 25-44 are the main customers
* People in their 30s-50s tend to ride for 12-15 minutes - commuting to/from work?
* Women tend to have longer ride durations compared to men
* Younger folk and seniors have more varied ride durations - fewer commitments, more free time, etc.

## Date & Time
### Rides by Month/Time

In [None]:
# Time span covered by the cleaned data set.
ride_starts = df["start_time"]
print(f'Earliest date: {ride_starts.min()}')
print(f'  Latest date: {ride_starts.max()}')

After grouping data by the hour, I decided to bin the hours like so:
* Early Morning: 3:00 - 5:59am
* Morning: 6:00 - 11:59am
* Afternoon: 12:00pm - 4:59pm
* Evening: 5:00 - 8:59pm
* Night: 9:00 - 11:59pm
* Late Night: 12:00am - 2:59am

In [None]:
# Count rides per (month, hour of day). Moving the month level to the columns,
# filling empty combinations with 0 and transposing gives one row per month
# and one column per hour.
ride_month = df['start_time'].dt.month
ride_hour = df['start_time'].dt.hour
dfp = (
    df.groupby([ride_month, ride_hour])['duration_sec']
      .count()
      .unstack(0)
      .fillna(0)
      .T
)

title = "Total Amounts of Rides by Month"
sub = "As temperatures cool down, the number of rides increases - until winter"
func.plot_month(dfp, title, sub)

### Observations
* Fall seems to be the most popular time to rent bikes - temperatures cool down, but not too chilly yet
* Mornings and evenings see the most rentals for each month - probably due to commute?
* Very few night/late night rentals - weather & safety concerns?

## Location Statistics
### Most Used Stations

In [None]:
# Coordinates and names of the start and end station of every ride. The
# selection gets its own name so that `sub` is only ever the subtitle string.
station_fields = [
    'start_station_latitude', 'start_station_longitude',
    'end_station_latitude', 'end_station_longitude',
    'start_station_name', 'end_station_name',
]
dfs = func.comb_stations(df[station_fields])

title = "Map of Most Used Bike Stations"
sub = "Stations closer to the water or BART stations were more popular"
func.plot_stations(dfs, title, sub)

### Most Common Pairs of Start-End Stations

In [None]:
# Number of rides for every (start station, end station) combination.
pair_counts = df.groupby(['start_station_name', 'end_station_name']).size()

# Keep the 20 most frequent pairs, most frequent first.
dfl = (
    pair_counts
    .to_frame(name='Count')
    .sort_values('Count', ascending=False)
    .reset_index()
    .head(20)
)

title = "Most Common Start-End Pairs"
sub = "The Ferry Building to The Embarcadero is a path along the water"
func.plot_station_pairs(dfl, title, sub)

### Observations
* Most of the traffic is in San Francisco - Market Street
    * More rentals took place near BART stations, and the water - tourism?
    * Ferry Building -> The Embarcadero
* Oakland saw significantly fewer rentals - maybe due to locals and fewer tourists?

# Summary
* People between 25-44 tend to rent bikes more often, but ride them for shorter durations.
* People outside that age range tend to rent bikes less often, but ride them for longer durations.
* Women typically ride bikes for longer durations compared to men.
* Mornings and evenings see the most rentals, probably due to people's work commutes.
* More bike rentals occur as summer becomes fall, but slowly drop as fall becomes winter.
* The bike stations closer to major points of interest (BART stations, popular tourist areas) see more rentals.

# Possible Next Steps
* Further explore relationship between gender, age, and duration
    * More granular analysis? Breakdown by weekday?
* Pull in weather data and see if correlation exists between rentals
* Analyze start-end paths for genders?