# Understanding Hired Rides in NYC

_[Project prompt](https://docs.google.com/document/d/1VERPjEZcC1XSs4-02aM-DbkNr_yaJVbFjLJxaYQswqA/edit#)_

_This scaffolding notebook may be used to help set up your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish._

_Anything in italics (prose) or comments (in code) is meant to provide you with guidance. **Remove the italic lines and provided comments** before submitting the project, if you choose to use this scaffolding. We don't need the guidance when grading._

_**All code below should be considered "pseudo-code" - not functional by itself, and only a suggestion at the approach.**_

## Project Setup

In [2]:
# all import statements needed for the project, for example:

import os

import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db

In [3]:
# Project-wide constants. Values left as "" are placeholders that still
# need to be filled in.

# Page that is scraped for trip-record links
TLC_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

# NOTE(review): while TAXI_ZONES_DIR is "", the shapefile path below
# evaluates to "/taxi_zones.shp" (filesystem root) - fill in the directory.
TAXI_ZONES_DIR = ""
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
WEATHER_CSV_DIR = ""

CRS = 4326  # coordinate reference system (presumably the EPSG code for WGS 84 - confirm)

# Bounding boxes as ((lat, lon), (lat, lon)); in every box the first pair
# holds the smaller latitude and longitude, the second pair the larger ones.
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

# SQLite database URL, schema output file, and the directory for query files
DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [6]:
# Make sure the QUERY_DIRECTORY exists.
# exist_ok=True turns "directory already exists" into a no-op, while every
# other failure (e.g. permissions) still raises. This replaces a broad
# `except Exception` that read `e.errno`, which itself fails with
# AttributeError for exceptions that carry no errno.
os.makedirs(QUERY_DIRECTORY, exist_ok=True)

## Part 1: Data Preprocessing

### 1. Downloading Parquet Files

In [26]:

import requests
from bs4 import BeautifulSoup
import re
import os
from datetime import datetime

def get_parquet_links(url, regex_pattern):
    """Return the link targets on a web page that match a regex.

    Args:
        url: Address of the HTML page to scan.
        regex_pattern: Pattern applied with ``re.search`` to every ``href``.

    Returns:
        A list of matching ``href`` values, in page order, with surrounding
        whitespace removed.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # A timeout keeps the notebook from hanging on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly rather than silently parsing an error page into zero links.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    pattern = re.compile(regex_pattern)

    # hrefs may carry stray whitespace (the captured download log shows a
    # space after ".parquet"). Left in place it ends up in both the request
    # URL and the saved file name, so strip it here.
    hrefs = (anchor['href'].strip() for anchor in soup.find_all('a', href=True))
    return [href for href in hrefs if pattern.search(href)]

def download_parquet_files(links, save_dir):
    """Download every URL in *links* into *save_dir*.

    Each file is saved under the last path component of its URL. A URL that
    answers with an HTTP error status is reported and skipped, so an error
    page is never stored under a ``.parquet`` name.

    Args:
        links: Iterable of file URLs.
        save_dir: Target directory; created if it does not exist.

    Raises:
        requests.RequestException: On connection problems or timeout.
    """
    os.makedirs(save_dir, exist_ok=True)
    for link in links:
        # Guard against whitespace around scraped hrefs.
        link = link.strip()
        file_name = os.path.join(save_dir, os.path.basename(link))
        # Stream into a temporary name first so an interrupted transfer does
        # not leave a truncated file that looks complete.
        partial_name = file_name + ".part"

        # The context manager releases the connection even on early exit.
        with requests.get(link, stream=True, timeout=60) as response:
            if not response.ok:
                print(f"Failed to download {link}: HTTP {response.status_code}")
                continue
            with open(partial_name, 'wb') as file:
                # 1 MiB chunks: far fewer write calls than 1 KiB chunks.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)

        os.replace(partial_name, file_name)
        print(f"Downloaded: {file_name}")

# Filter links in date range
def filter_links_by_date(links, start_date, end_date):
    """Keep the links whose embedded ``YYYY-MM`` lies in a date range.

    The first ``YYYY-MM`` found in a link is interpreted as the first day of
    that month and compared against the inclusive range
    ``[start_date, end_date]``.

    Args:
        links: Iterable of link strings.
        start_date: Lower bound (``datetime``), inclusive.
        end_date: Upper bound (``datetime``), inclusive.

    Returns:
        A list of the links inside the range, in input order. Links with no
        ``YYYY-MM`` token, or with one that is not a real calendar month
        (e.g. ``2020-13``), are dropped.
    """
    filtered_links = []
    for link in links:
        match = re.search(r'(\d{4})-(\d{2})', link)
        if not match:
            continue
        year, month = int(match.group(1)), int(match.group(2))
        try:
            date = datetime(year, month, 1)
        except ValueError:
            # Digits matched the pattern but are not a valid year/month;
            # treat the link like one without a date instead of crashing.
            continue
        if start_date <= date <= end_date:
            filtered_links.append(link)
    return filtered_links

# Define date range: a file is kept when the first day of its YYYY-MM month
# lies between these two dates (both ends inclusive, see filter_links_by_date)
start_date = datetime(2020, 1, 1)
end_date = datetime(2024, 8, 1)

# NOTE(review): this repeats the TLC page address with a different host
# ("www.nyc.gov") than the TLC_URL constant ("www1.nyc.gov") - confirm which
# one is intended and keep a single definition.
url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
# File-name patterns for the two datasets: <prefix>_tripdata_YYYY-MM.parquet
yellow_regex = r'yellow_tripdata_\d{4}-\d{2}\.parquet'
fhvhv_regex = r'fhvhv_tripdata_\d{4}-\d{2}\.parquet'

# Fetch links (each call requests the page again, so it is downloaded twice)
yellow_links = get_parquet_links(url, yellow_regex)
fhvhv_links = get_parquet_links(url, fhvhv_regex)

# Filter links by date
yellow_links_filtered = filter_links_by_date(yellow_links, start_date, end_date)
fhvhv_links_filtered = filter_links_by_date(fhvhv_links, start_date, end_date)

# Download filtered files into one directory per dataset
download_parquet_files(yellow_links_filtered, "yellow_taxi_data")
download_parquet_files(fhvhv_links_filtered, "fhvhv_data")

Downloading Yellow Taxi files...
Failed to download https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet : HTTP 403
Failed to download https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet : HTTP 403
Failed to download https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet : HTTP 403
Failed to download https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet  after 3 attempts.
Failed to download https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet : HTTP 403
Failed to download https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet : HTTP 403
Failed to download https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet : HTTP 403
Failed to download https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet  after 3 attempts.
Failed to download https://d37ci6vzurychx.cloudfront.net/trip-data/yell

### 2. Sampling with Cochran's Formula



In [28]:
import pandas as pd
import math
import os
import glob

#### 2.1 Define the Sampling Function


In [31]:
def cochran_sample_size(population_size, confidence_level=0.95, p=0.5, margin_of_error=0.05):
    """Compute a sample size with Cochran's formula.

    Args:
        population_size: Number of items in the population. A value <= 0
            means "unknown" and skips the finite-population correction.
        confidence_level: Desired confidence, strictly between 0 and 1.
        p: Estimated proportion of the attribute of interest.
        margin_of_error: Acceptable margin of error (must be non-zero).

    Returns:
        The required sample size, rounded up to an integer.

    Raises:
        ValueError: If ``confidence_level`` is not strictly between 0 and 1.
    """
    # Conventional rounded z-scores. They are kept so results for the common
    # confidence levels stay exactly what they were before other levels were
    # supported.
    z_scores = {0.9: 1.645, 0.95: 1.96, 0.99: 2.576}
    z = z_scores.get(confidence_level)
    if z is None:
        if not 0 < confidence_level < 1:
            raise ValueError(
                f"confidence_level must be between 0 and 1, got {confidence_level!r}"
            )
        # Any other level: two-sided z-score from the standard normal.
        from statistics import NormalDist
        z = NormalDist().inv_cdf((1 + confidence_level) / 2)

    # Cochran's initial sample size (infinite population)
    n_0 = (z**2 * p * (1 - p)) / (margin_of_error**2)

    # Adjust sample size for finite population
    if population_size > 0:
        n = n_0 / (1 + (n_0 - 1) / population_size)
    else:
        n = n_0  # Default to initial sample size if population size is unknown

    return math.ceil(n)

#### 2.2 Sampling for Both Datasets

In [36]:
# Function to sample a monthly dataset
def sample_monthly_data(file_path, sample_size):
    """Read one parquet file and return a reproducible random sample of rows.

    Args:
        file_path: Path of the parquet file to read.
        sample_size: Number of rows wanted.

    Returns:
        A DataFrame with ``sample_size`` rows, or every row of the file when
        it holds fewer than ``sample_size`` rows.
    """
    df = pd.read_parquet(file_path)
    # Sampling without replacement cannot return more rows than exist;
    # cap the request so small or empty files do not raise ValueError.
    rows_to_draw = min(sample_size, len(df))
    # Fixed seed keeps the sample identical between runs.
    sampled_df = df.sample(n=rows_to_draw, random_state=42)
    return sampled_df

# Sampling logic for Yellow Taxi and FHVHV datasets
def process_data(data_path_pattern, output_dir, dataset_type):
    """Sample every monthly parquet file matching a glob pattern.

    For each file the Cochran sample size is computed from its row count
    (95% confidence, 5% margin of error), that many rows are drawn with a
    fixed seed, and the sample is written to *output_dir* under the same
    file name.

    Args:
        data_path_pattern: Glob pattern selecting the input parquet files.
        output_dir: Directory for the sampled files; created if missing.
        dataset_type: Label used only in the progress messages.
    """
    os.makedirs(output_dir, exist_ok=True)  # Ensure output directory exists
    file_paths = sorted(glob.glob(data_path_pattern))  # Get all matching files

    for file_path in file_paths:
        # Read each month once and reuse the frame for both the row count
        # and the sampling; loading it a second time doubles I/O and memory.
        df = pd.read_parquet(file_path)
        population_size = len(df)

        # Calculate sample size
        sample_size = cochran_sample_size(population_size, confidence_level=0.95, margin_of_error=0.05)
        print(f"{dataset_type} - {os.path.basename(file_path)}: Population size = {population_size}, Sample size = {sample_size}")

        # Sample data; capped at the row count so an empty or tiny file does
        # not make DataFrame.sample raise. Same seed as sample_monthly_data.
        sampled_data = df.sample(n=min(sample_size, population_size), random_state=42)

        # Save sampled data
        output_file = os.path.join(output_dir, os.path.basename(file_path))
        sampled_data.to_parquet(output_file)
        print(f"Sampled data saved to: {output_file}")


# Process both datasets
# Sample every downloaded 202x month of each dataset; the samples are written
# to a separate "*_sampled_data" directory so the raw downloads stay untouched.
process_data("yellow_taxi_data/yellow_tripdata_202*.parquet", "yellow_taxi_sampled_data", "Yellow Taxi")
process_data("fhvhv_data/fhvhv_tripdata_202*.parquet", "fhvhv_sampled_data", "FHVHV")

Yellow Taxi - yellow_tripdata_2020-01.parquet: Population size = 6405008, Sample size = 385
Sampled data saved to: yellow_taxi_sampled_data/yellow_tripdata_2020-01.parquet
Yellow Taxi - yellow_tripdata_2020-02.parquet: Population size = 6299367, Sample size = 385
Sampled data saved to: yellow_taxi_sampled_data/yellow_tripdata_2020-02.parquet
Yellow Taxi - yellow_tripdata_2020-03.parquet: Population size = 3007687, Sample size = 385
Sampled data saved to: yellow_taxi_sampled_data/yellow_tripdata_2020-03.parquet
Yellow Taxi - yellow_tripdata_2020-04.parquet: Population size = 238073, Sample size = 384
Sampled data saved to: yellow_taxi_sampled_data/yellow_tripdata_2020-04.parquet
Yellow Taxi - yellow_tripdata_2020-05.parquet: Population size = 348415, Sample size = 384
Sampled data saved to: yellow_taxi_sampled_data/yellow_tripdata_2020-05.parquet
Yellow Taxi - yellow_tripdata_2020-06.parquet: Population size = 549797, Sample size = 384
Sampled data saved to: yellow_taxi_sampled_data/yel

In [46]:
import pandas as pd

# Path to one sampled FHVHV month, relative to the notebook's working
# directory. It is the directory process_data() writes the FHVHV samples to,
# so the cell no longer depends on one user's absolute home path.
file_path = os.path.join("fhvhv_sampled_data", "fhvhv_tripdata_2022-06.parquet")

# Read the file
df1 = pd.read_parquet(file_path)

# Display the sampled dataframe (rows and columns)
df1

Unnamed: 0,hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,...,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
9141359,HV0003,B03404,B03404,2022-06-16 12:51:16,2022-06-16 12:52:50,2022-06-16 12:52:58,2022-06-16 13:05:22,242,185,2.330,...,1.01,0.00,0.0,0.00,9.27,N,N,,N,N
6125422,HV0003,B03404,B03404,2022-06-11 09:01:19,2022-06-11 09:08:04,2022-06-11 09:08:23,2022-06-11 09:34:46,17,39,5.380,...,2.59,0.00,0.0,0.00,26.38,N,N,,N,N
11227438,HV0005,B03406,,2022-06-19 16:30:56,NaT,2022-06-19 16:40:00,2022-06-19 16:58:08,234,142,2.742,...,2.13,2.75,0.0,0.00,14.84,N,N,N,N,N
10779540,HV0003,B03404,B03404,2022-06-18 22:34:47,2022-06-18 22:41:40,2022-06-18 22:42:11,2022-06-18 22:47:50,197,134,1.280,...,0.70,0.00,0.0,0.00,6.94,N,N,,N,N
9767195,HV0003,B03404,B03404,2022-06-17 12:17:01,2022-06-17 12:21:21,2022-06-17 12:23:12,2022-06-17 12:45:46,61,39,3.720,...,2.45,0.00,0.0,0.00,21.84,N,N,,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1373264,HV0003,B03404,B03404,2022-06-03 11:04:14,2022-06-03 11:04:37,2022-06-03 11:06:36,2022-06-03 11:08:59,79,113,0.280,...,0.68,2.75,0.0,0.00,8.37,N,N,,N,N
5689953,HV0003,B03404,B03404,2022-06-10 16:54:35,2022-06-10 17:06:35,2022-06-10 17:06:50,2022-06-10 17:50:58,69,182,5.310,...,3.06,0.00,0.0,0.00,31.02,N,N,,N,N
6906917,HV0003,B03404,B03404,2022-06-12 12:03:36,2022-06-12 12:11:27,2022-06-12 12:13:28,2022-06-12 13:38:38,246,129,27.500,...,8.35,2.75,0.0,0.00,78.30,N,N,,N,N
9156960,HV0005,B03406,,2022-06-16 12:39:43,NaT,2022-06-16 12:41:30,2022-06-16 13:11:25,244,163,7.110,...,2.63,2.75,0.0,5.39,24.19,N,N,N,N,N


### Load Taxi Zones

In [None]:
def load_taxi_zones(shapefile):
    """Placeholder; judging by its name, meant to load the taxi zones from *shapefile*.

    Not implemented yet: every call raises NotImplementedError.
    """
    raise NotImplementedError()

In [None]:
def lookup_coords_for_taxi_zone_id(zone_loc_id, loaded_taxi_zones):
    """Placeholder; judging by its name, meant to map a zone id to coordinates.

    Not implemented yet: every call raises NotImplementedError.
    """
    raise NotImplementedError()

### Calculate Sample Size

In [None]:
def calculate_sample_size(population):
    """Placeholder for a sample-size calculation based on *population*.

    Not implemented yet: every call raises NotImplementedError.
    NOTE(review): cochran_sample_size() in this notebook already computes a
    sample size - confirm whether this stub is still needed.
    """
    raise NotImplementedError()

### Common Functions

In [None]:
def get_all_urls_from_tlc_page(taxi_page):
    """Placeholder; judging by its name, meant to collect the URLs found on *taxi_page*.

    Not implemented yet: every call raises NotImplementedError.
    """
    raise NotImplementedError()

In [None]:
def filter_parquet_urls(all_urls):
    """Placeholder; judging by its name, meant to keep only the parquet URLs in *all_urls*.

    Not implemented yet: every call raises NotImplementedError.
    """
    raise NotImplementedError()

### Process Taxi Data

In [None]:
def get_and_clean_taxi_month(url):
    """Placeholder; judging by its name, meant to fetch and clean one month of taxi data from *url*.

    Not implemented yet: every call raises NotImplementedError.
    """
    raise NotImplementedError()

In [None]:
def get_and_clean_taxi_data(parquet_urls):
    """Build one taxi dataframe from a list of monthly parquet URLs.

    Args:
        parquet_urls: Iterable of URLs, each handed to
            ``get_and_clean_taxi_month``.

    Returns:
        The monthly dataframes concatenated in input order, or an empty
        DataFrame when *parquet_urls* is empty.
    """
    all_taxi_dataframes = []

    for parquet_url in parquet_urls:
        # TODO: check whether this exact file was already downloaded and
        # saved, and reuse it instead of fetching it again.
        # The per-month helper defined in this notebook is
        # get_and_clean_taxi_month; `get_and_clean_month` does not exist.
        dataframe = get_and_clean_taxi_month(parquet_url)
        # TODO: save the file if it was not cached, so a re-run can skip
        # the download.

        all_taxi_dataframes.append(dataframe)

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not all_taxi_dataframes:
        return pd.DataFrame()

    # create one gigantic dataframe with data from every month needed
    # (pd.concat - `pd.contact` is not a pandas function)
    taxi_data = pd.concat(all_taxi_dataframes)
    return taxi_data

In [None]:
def get_taxi_data():
    """Return the combined, cleaned taxi dataframe for all months.

    Chains the helpers defined in this notebook: collect the URLs on the TLC
    page, narrow them to parquet files, then download, clean and concatenate
    the months.

    Returns:
        Whatever ``get_and_clean_taxi_data`` returns for the parquet URLs.
    """
    # The helper names used before (get_all_urls_from_taxi_page,
    # find_taxi_parquet_urls) are not defined anywhere in this notebook;
    # these are the matching helpers that are.
    all_urls = get_all_urls_from_tlc_page(TLC_URL)
    all_parquet_urls = filter_parquet_urls(all_urls)
    taxi_data = get_and_clean_taxi_data(all_parquet_urls)
    return taxi_data

In [None]:
# Build the taxi dataframe once; later cells read the module-level taxi_data
taxi_data = get_taxi_data()

In [None]:
taxi_data.head()

In [None]:
taxi_data.info()

In [None]:
taxi_data.describe()

### Processing Uber Data

In [None]:
def get_and_clean_uber_month(url):
    """Placeholder; judging by its name, meant to fetch and clean one month of Uber data from *url*.

    Not implemented yet: every call raises NotImplementedError.
    """
    raise NotImplementedError()

In [None]:
def get_and_clean_uber_data(parquet_urls):
    """Build one Uber dataframe from a list of monthly parquet URLs.

    Args:
        parquet_urls: Iterable of URLs, each handed to
            ``get_and_clean_uber_month``.

    Returns:
        The monthly dataframes concatenated in input order, or an empty
        DataFrame when *parquet_urls* is empty.
    """
    all_uber_dataframes = []

    for parquet_url in parquet_urls:
        # TODO: check whether this exact file was already downloaded and
        # saved, and reuse it instead of fetching it again.
        dataframe = get_and_clean_uber_month(parquet_url)
        # TODO: save the file if it was not cached, so a re-run can skip
        # the download.

        all_uber_dataframes.append(dataframe)

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not all_uber_dataframes:
        return pd.DataFrame()

    # create one gigantic dataframe with data from every month needed
    # (pd.concat - `pd.contact` is not a pandas function)
    uber_data = pd.concat(all_uber_dataframes)
    return uber_data

In [None]:
def load_and_clean_uber_data():
    """Placeholder with no parameters.

    Not implemented yet: every call raises NotImplementedError.
    NOTE(review): nothing in the visible notebook calls this function -
    confirm whether it is still needed alongside get_uber_data().
    """
    raise NotImplementedError()

In [None]:
def get_uber_data():
    """Return the combined, cleaned Uber dataframe for all months.

    Chains the helpers defined in this notebook: collect the URLs on the TLC
    page, narrow them to parquet files, then download, clean and concatenate
    the months.

    Returns:
        Whatever ``get_and_clean_uber_data`` returns for the parquet URLs.
    """
    all_urls = get_all_urls_from_tlc_page(TLC_URL)
    # `find_parquet_urls` is not defined in this notebook; filter_parquet_urls
    # is the parquet filter that is.
    all_parquet_urls = filter_parquet_urls(all_urls)
    # Named for what it holds (this local used to be called taxi_data).
    uber_data = get_and_clean_uber_data(all_parquet_urls)
    return uber_data

In [None]:
# Build the Uber dataframe once; later cells read the module-level uber_data
uber_data = get_uber_data()

In [None]:
uber_data.head()

In [None]:
uber_data.info()

In [None]:
uber_data.describe()

### Processing Weather Data

In [None]:
def get_all_weather_csvs(directory):
    """Placeholder; judging by its name, meant to list the weather CSV files in *directory*.

    load_and_clean_weather_data() iterates over the return value and passes
    each item on as ``csv_file``.

    Not implemented yet: every call raises NotImplementedError.
    """
    raise NotImplementedError()

In [None]:
def clean_month_weather_data_hourly(csv_file):
    """Placeholder; judging by its name, meant to produce cleaned hourly weather data from *csv_file*.

    load_and_clean_weather_data() passes the results to ``pd.concat``.

    Not implemented yet: every call raises NotImplementedError.
    """
    raise NotImplementedError()

In [None]:
def clean_month_weather_data_daily(csv_file):
    """Placeholder; judging by its name, meant to produce cleaned daily weather data from *csv_file*.

    load_and_clean_weather_data() passes the results to ``pd.concat``.

    Not implemented yet: every call raises NotImplementedError.
    """
    raise NotImplementedError()

In [None]:
def load_and_clean_weather_data():
    """Load all weather CSVs and return ``(hourly_data, daily_data)``.

    Every file reported by ``get_all_weather_csvs(WEATHER_CSV_DIR)`` is
    cleaned twice - once per granularity - and the per-file results are
    concatenated into one hourly and one daily dataframe.
    """
    hourly_frames, daily_frames = [], []

    for weather_csv in get_all_weather_csvs(WEATHER_CSV_DIR):
        # hourly first, then daily, for each file
        hourly_frames.append(clean_month_weather_data_hourly(weather_csv))
        daily_frames.append(clean_month_weather_data_daily(weather_csv))

    # one dataframe per granularity, covering every file
    return pd.concat(hourly_frames), pd.concat(daily_frames)

In [None]:
# Unpack the (hourly, daily) pair returned by load_and_clean_weather_data()
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [None]:
hourly_weather_data.head()

In [None]:
hourly_weather_data.info()

In [None]:
hourly_weather_data.describe()

In [None]:
daily_weather_data.head()

In [None]:
daily_weather_data.info()

In [None]:
daily_weather_data.describe()

## Part 2: Storing Cleaned Data

In [None]:
# SQLAlchemy engine for DATABASE_URL; reused by the cells below
engine = db.create_engine(DATABASE_URL)

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the commands
# to create your 4 tables/dataframes.
# NOTE: all four strings are still "TODO" placeholders. They are written
# verbatim to DATABASE_SCHEMA_FILE by the next cell, so that file will not
# contain usable SQL until they are filled in.
HOURLY_WEATHER_SCHEMA = """
TODO
"""

DAILY_WEATHER_SCHEMA = """
TODO
"""

TAXI_TRIPS_SCHEMA = """
TODO
"""

UBER_TRIPS_SCHEMA = """
TODO
"""

In [None]:
# create that required schema.sql file
# Write the four schema strings, in this order, to the schema file
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.writelines(
        (
            HOURLY_WEATHER_SCHEMA,
            DAILY_WEATHER_SCHEMA,
            TAXI_TRIPS_SCHEMA,
            UBER_TRIPS_SCHEMA,
        )
    )

In [None]:
# create the tables with the schema files
# Placeholder: opens a connection and closes it again without executing
# anything, so no tables are created by this cell yet.
with engine.connect() as connection:
    pass

### Add Data to Database

In [None]:
def write_dataframes_to_table(table_to_df_dict):
    """Placeholder for writing dataframes to database tables.

    Args:
        table_to_df_dict: The call site in this notebook passes a dict
            mapping table names to dataframes.

    Raises:
        NotImplementedError: Always; the function is not implemented yet.
    """
    # NotImplementedError is the exception class. `NotImplemented` is a
    # comparison sentinel, and calling it fails with a confusing TypeError.
    raise NotImplementedError()

In [None]:
# Table name -> dataframe to store in it.
# The weather frames are bound at module level as hourly_weather_data and
# daily_weather_data; `hourly_data` / `daily_data` exist only as locals inside
# load_and_clean_weather_data() and would raise NameError here.
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_weather_data,
    "daily_weather": daily_weather_data,
}

In [None]:
# Hand the table-name -> dataframe mapping to write_dataframes_to_table()
write_dataframes_to_table(map_table_name_to_dataframe)

## Part 3: Understanding the Data

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    """Placeholder; judging by its name, meant to save *query* to the file *outfile*.

    Not implemented yet: every call raises NotImplementedError.
    """
    raise NotImplementedError()

### Query 1

In [None]:
# Placeholders: both the output file name and the SQL text still need to be
# filled in before the cells below can run meaningfully.
QUERY_1_FILENAME = ""

QUERY_1 = """
TODO
"""

In [None]:
# Two alternative ways to run QUERY_1 are shown; keep whichever you prefer.

# execute query either via sqlalchemy (fetchall returns all result rows)
with engine.connect() as con:
    results = con.execute(db.text(QUERY_1)).fetchall()
results

# or via pandas (returns the result as a dataframe)
pd.read_sql(QUERY_1, con=engine)

In [None]:
# Save QUERY_1 under QUERY_1_FILENAME via the helper defined above
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    """Template for a matplotlib plot; still contains "..." placeholders.

    NOTE: *dataframe* is not used yet - ``values`` and the second argument of
    ``axes.plot`` are placeholder strings that must be replaced with real
    data and a real format before the plot is meaningful.
    """
    # one figure with a single axes, 20x10 inches
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # placeholder: pull the values to plot out of the dataframe
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    """Placeholder for fetching the data that plot_visual_1() should draw.

    Not implemented yet: every call raises NotImplementedError.
    """
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
# Fetch the data for the first visualization, then draw it
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)

### 3. Cleaning & Filtering Data

In [None]:

import geopandas as gpd

def clean_and_filter_data(df, zones_shapefile, bounding_box):
    """Attach zone centroids to trips and keep pickups inside a lat/lon box.

    Args:
        df: Trip dataframe with ``PULocationID`` and ``DOLocationID`` columns.
            It is not modified; a copy is filtered and returned.
        zones_shapefile: Path of the taxi-zone shapefile; it must provide a
            ``LocationID`` column.
        bounding_box: ``(min_lat, min_lon, max_lat, max_lon)`` in degrees.

    Returns:
        A new dataframe with ``pickup_location`` and ``dropoff_location``
        centroid columns added, rows dropped where either zone id has no
        centroid, restricted to pickups inside the box (bounds inclusive),
        and with all column names lower-cased.
    """
    zones = gpd.read_file(zones_shapefile)

    # Centroids are taken in the shapefile's own CRS and then converted to
    # EPSG:4326 so x/y are longitude/latitude, the units of bounding_box.
    # Without this, a projected shapefile yields coordinates that are not
    # degrees and the box comparison is meaningless.
    centroids = zones.set_index('LocationID').geometry.centroid
    if centroids.crs is not None:
        centroids = centroids.to_crs(epsg=4326)
    # Series.map needs a unique lookup index; keep the first entry per id.
    # NOTE(review): presumably only matters if the shapefile repeats ids.
    centroids = centroids[~centroids.index.duplicated(keep='first')]

    # Work on a copy so the caller's dataframe is left untouched.
    df = df.copy()
    df['pickup_location'] = df['PULocationID'].map(centroids)
    df['dropoff_location'] = df['DOLocationID'].map(centroids)
    df = df.dropna(subset=['pickup_location', 'dropoff_location'])

    min_lat, min_lon, max_lat, max_lon = bounding_box
    # A dataframe column is a plain pandas Series with no .x / .y accessors
    # (those belong to GeoSeries), so look the coordinates up per zone id.
    pickup_lat = df['PULocationID'].map(centroids.y)
    pickup_lon = df['PULocationID'].map(centroids.x)
    in_box = (
        pickup_lat.between(min_lat, max_lat)
        & pickup_lon.between(min_lon, max_lon)
    )
    # NOTE(review): only the pickup point is checked, as before - confirm
    # whether drop-offs outside the box should be removed as well.
    df = df[in_box]

    df.columns = df.columns.str.lower()
    return df

# Example usage
# Box as (min_lat, min_lon, max_lat, max_lon); the numbers are the same four
# values as NEW_YORK_BOX_COORDS, flattened
bounding_box = (40.560445, -74.242330, 40.908524, -73.717047)
zones_shapefile = "taxi_zones/taxi_zones.shp"
# NOTE(review): `sampled_data` is not assigned at module level anywhere in the
# visible notebook (it is only a local inside process_data) - load a sampled
# dataframe into it before running this cell.
cleaned_data = clean_and_filter_data(sampled_data, zones_shapefile, bounding_box)
