# Google Data Analytics capstone project
Goal: analyze the bike rides dataset from https://divvy-tripdata.s3.amazonaws.com/index.html and present your findings.

## Set up the environment:

In [1]:
# Import packages
# standard
import datetime
import io
from urllib.parse import urljoin
import zipfile
# external
import geojson
import numpy
import plotly.graph_objects
import plotly.subplots
import pandas
import requests
import scipy.stats
import tqdm

In [3]:
# Define functions
def haversine(x: float) -> float:
    """Return hav(x) = sin^2(x / 2) for an angle ``x`` expressed in radians."""
    half_angle_sine = numpy.sin(x / 2)
    return half_angle_sine ** 2


def haversine_distance(longitude1_deg: float, latitude1_deg: float, longitude2_deg: float,  latitude2_deg: float, radius_km: float = 6378) -> float:
    """
    Calculate the great-circle distance between two geographical points given in degrees.
    https://en.wikipedia.org/wiki/Haversine_formula

    Arguments are (longitude, latitude) of the first point followed by
    (longitude, latitude) of the second point, all in degrees.
    The result is expressed in the same unit as ``radius_km``.
    NOTE(review): the default of 6378 km looks like Earth's equatorial radius;
    the mean radius is about 6371 km - confirm which one is intended.

    Only numpy operations are used, so equal-length arrays can be passed
    instead of scalars to compute many distances in one call.
    """
    latitude1_rad, longitude1_rad, latitude2_rad, longitude2_rad = numpy.radians([
        latitude1_deg,
        longitude1_deg,
        latitude2_deg,
        longitude2_deg])

    latitude_difference = latitude1_rad - latitude2_rad
    longitude_difference = longitude1_rad - longitude2_rad

    haversine_theta = (
        haversine(latitude_difference)
        + numpy.cos(latitude1_rad) * numpy.cos(latitude2_rad) * haversine(longitude_difference))
    # Floating point rounding can push the value marginally outside [0, 1]
    # (e.g. for near-antipodal points), which would make arcsin return NaN.
    # NaN inputs (missing coordinates) still propagate as NaN.
    haversine_theta = numpy.clip(haversine_theta, 0, 1)

    distance = 2 * radius_km * numpy.arcsin(numpy.sqrt(haversine_theta))
    return distance


def get_pdf_values(values: list | pandas.core.series.Series) -> tuple[numpy.ndarray, numpy.ndarray]:
    """
    Get numpy histogram with resolution based on interquantile range.
    Used to plot a probability density function (PDF).
    First value in return tuple gives the edges of bins (PDF x values).
    Second value in return tuple gives the number of data points in each bin (PDF y values).
    """
    # Set resolution
    n_bins_in_interquantile_range = 50

    values = pandas.Series(values)
    values_range = values.max() - values.min()
    values_interquantile_range = values.quantile(0.75) - values.quantile(0.25)

    n_bins = int(values_range / (values_interquantile_range / n_bins_in_interquantile_range))
    histogram = numpy.histogram(values, bins=n_bins, density=True)

    return (histogram[1], histogram[0])


def add_pdf_plot(
        figure: plotly.graph_objs._figure.Figure,
        x_values: (list | pandas.core.series.Series),
        y_values: (list | pandas.core.series.Series),
        color: str,
        cutoffs: tuple[float, float] = (0, 0),
        legend_name: str = None,
        subplot_row: int = None,
        subplot_column: int = None) -> plotly.graph_objs._figure.Figure:
    """
    Draw a PDF curve on a plotly figure and shade the interval between the outlier cutoffs.

    The curve is a line trace of ``y_values`` over ``x_values`` in ``color``,
    labelled ``legend_name`` in the legend.
    The vertical rectangle spans ``cutoffs[0]`` to ``cutoffs[1]`` and is filled with
    the same ``color`` at 20 % opacity and no border; the default ``(0, 0)`` gives it zero width.
    ``subplot_row`` and ``subplot_column`` are forwarded to plotly for both the trace and the rectangle.
    Returns the figure object handed back by the plotly calls.
    """
    line_style = {"color": color}
    pdf_trace = plotly.graph_objects.Scatter(
        x=x_values,
        y=y_values,
        mode="lines",
        name=legend_name,
        line=line_style)

    # Curve first, then the shaded cutoff band on the same subplot
    figure = figure.add_trace(pdf_trace, row=subplot_row, col=subplot_column)
    figure = figure.add_vrect(
        x0=cutoffs[0],
        x1=cutoffs[1],
        fillcolor=color,
        opacity=0.2,
        line_width=0,
        row=subplot_row,
        col=subplot_column)
    return figure

## Download the data:

In [6]:
# Input
data_base_url = "https://divvy-tripdata.s3.amazonaws.com/"
start_month = "2022_01"
end_month = "2022_12"


# Download data
# freq="MS" (month start) yields the first day of every month from start_month
# to end_month inclusive, so no padding of the end date is needed
months = pandas.date_range(
    start=datetime.datetime.strptime(start_month, "%Y_%m"),
    end=datetime.datetime.strptime(end_month, "%Y_%m"),
    freq="MS")

file_names = [f"{month.strftime('%Y%m')}-divvy-tripdata" for month in months]

# Collect the monthly frames and concatenate once at the end;
# concatenating inside the loop copies the growing frame on every iteration
monthly_data = []
for file_name in tqdm.tqdm(file_names, desc="Downloading data"):
    response = requests.get(
        urljoin(data_base_url, f"{file_name}.zip"),
        allow_redirects=True,
        timeout=60)
    # Fail with the HTTP error instead of a confusing BadZipFile on an error page
    response.raise_for_status()
    # ZipFile object accepts file-like objects
    # Use io.BytesIO to turn requests.Response.content to file-like object
    with zipfile.ZipFile(io.BytesIO(response.content)) as data_zipfile:
        csv_name = f"{file_name}.csv"
        # Some zipped files have a different naming convention
        if csv_name not in data_zipfile.namelist():
            csv_name = csv_name.replace("tripdata", "publictripdata")
        with data_zipfile.open(csv_name) as data_raw_csv:
            monthly_data.append(pandas.read_csv(data_raw_csv))

# pandas.concat raises on an empty list, so keep an empty frame for an empty month range
data = pandas.concat(monthly_data) if monthly_data else pandas.DataFrame()

Downloading data:: 100%|██████████| 12/12 [02:28<00:00, 12.39s/it]


## Add derived variables to data
Calculate ride durations to analyse if members have longer rides.

Calculate straight-line ride distances to analyse regular and electric bike usage.
Distances are great-circle (haversine) distances between the start and end coordinates, because no GPS trajectory data is available.

In [8]:
# Add ride durations
data[["started_at", "ended_at"]] = data[["started_at", "ended_at"]].astype("datetime64[ns]")
# Whole-column timedelta arithmetic instead of a Python loop over every ride.
# to_numpy() assigns by position, like the list it replaces, so index labels play no role.
ride_durations = data["ended_at"] - data["started_at"]
data["duration_minutes"] = (ride_durations.dt.total_seconds() / 60).round(2).to_numpy()

# Add ride direct line distances
data["longitude_delta"] = (data["start_lng"] - data["end_lng"]).abs()
data["latitude_delta"] = (data["start_lat"] - data["end_lat"]).abs()
# haversine_distance only uses numpy operations, so one call on the full
# coordinate columns replaces a call per row
data["direct_distance_km"] = numpy.round(
    haversine_distance(
        data["start_lng"].to_numpy(),
        data["start_lat"].to_numpy(),
        data["end_lng"].to_numpy(),
        data["end_lat"].to_numpy()),
    2)

## Ride duration analysis
1. Plot distance PDFs
2. Bootstrap the effect