In [1]:
import os

# Move the working directory two levels up from wherever the kernel started.
# NOTE(review): presumably this points at the repository root so that the
# `taxi` package and the relative config/data paths resolve - confirm.
# The path is relative, so re-running this cell without restarting the kernel
# climbs another two levels; run it exactly once per kernel session.
os.chdir("../../")

In [2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from taxi.utils.utils import *
from taxi.configs.config import *
from taxi.utils.helpers import *
from pathlib import Path
from sklearn.model_selection import train_test_split
%matplotlib inline

[32m2024-07-08 02:03:35.530[0m | [1mINFO    [0m | [36mtaxi.utils.utils[0m:[36mread_yaml[0m:[36m29[0m - [1myaml file: config.yaml loaded successfully[0m
[32m2024-07-08 02:03:35.537[0m | [1mINFO    [0m | [36mtaxi.utils.utils[0m:[36mread_yaml[0m:[36m29[0m - [1myaml file: params.yaml loaded successfully[0m


**Column Descriptions**

`VendorID`: Identifier for the TPEP provider supplying the record.
- 1 = Creative Mobile Technologies, LLC
- 2 = VeriFone Inc.

`tpep_pickup_datetime`: The date and time when the meter was activated.

`tpep_dropoff_datetime`: The date and time when the meter was turned off.

`Passenger_count`: The number of passengers in the vehicle, as entered by the driver.

`Trip_distance`: The distance of the trip in miles, as recorded by the taximeter.

`PULocationID`: TLC Taxi Zone where the meter was engaged.

`DOLocationID`: TLC Taxi Zone where the meter was disengaged.

`RateCodeID`: The applicable rate code at the end of the trip.
- 1 = Standard rate
- 2 = JFK
- 3 = Newark
- 4 = Nassau or Westchester
- 5 = Negotiated fare
- 6 = Group ride

`Store_and_fwd_flag`: Indicates if the trip record was stored in the vehicle's memory before transmission to the vendor due to lack of server connection.
- Y = Store and forward trip
- N = Not a store and forward trip

`Payment_type`: How the passenger paid for the trip, represented by a numeric code.
- 1 = Credit card
- 2 = Cash
- 3 = No charge
- 4 = Dispute
- 5 = Unknown
- 6 = Voided trip

`Fare_amount`: The fare as calculated by the meter based on time and distance.

`Extra`: Additional charges, currently including only the \$0.50 and \$1 rush hour and overnight charges.

`MTA_tax`: A $0.50 tax automatically added based on the metered rate.

`Improvement_surcharge`: A \$0.30 surcharge added at the start of the trip; it has been levied since 2015.

`Tip_amount`: Credit card tip amounts. (Note: Cash tips are not recorded here.)

`Tolls_amount`: Total tolls paid during the trip.

`Total_amount`: The total charge to passengers, excluding cash tips.

In [8]:
class Data:
    """Loads the trip dataset and derives grouped percentile summaries from it.

    File locations are read from the project-level ``CONFIG`` object and column
    lists from ``PARAMS``; both must already be in scope (the notebook obtains
    them through its import cell).
    """

    def __init__(self):
        self.config = CONFIG

    def read_dataset(self):
        """Load the dataset CSV into ``self.df`` and return it.

        If the CSV ``DATA_DIR/DATA_FILE_NAME`` does not exist yet, the member
        ``DATA_FILE_NAME`` is first extracted from the archive at
        ``DATA_DIR_ZIP`` into ``DATA_DIR``. After loading, the columns listed in
        ``PARAMS.DATASET.COLUMNS_TO_DROP`` are removed.

        Returns:
            pandas.DataFrame: the loaded frame (the same object as ``self.df``).
        """
        data_dir = self.config.Data.DATA_DIR
        file_name = self.config.Data.DATA_FILE_NAME
        # Built once so the existence check and the read cannot drift apart.
        csv_path = f"{data_dir}/{file_name}"

        if not os.path.exists(csv_path):
            # The context manager closes the archive; no explicit close() needed.
            with zipfile.ZipFile(self.config.Data.DATA_DIR_ZIP, "r") as zip_ref:
                zip_ref.extract(file_name, data_dir)

        self.df = pd.read_csv(csv_path).drop(columns=PARAMS.DATASET.COLUMNS_TO_DROP)
        return self.df

    @staticmethod
    def _percentiles_by(frame, by):
        """Apply ``calculate_percentiles`` to each group of ``frame`` and flatten the index."""
        return (
            frame.groupby(by)
            .apply(calculate_percentiles, include_groups=False)
            .reset_index()
        )

    def calculate_percentiles_for_each_group(self, distance_threshold=2.8):
        """Build one table of percentile rows for several groupings of ``self.df``.

        Rows are produced, in this order, for every value of ``VendorID``,
        ``passenger_count`` and ``payment_type`` (labelled ``"<column>_<value>"``),
        then for trips above ``distance_threshold`` and trips at or below it.

        Side effects:
            * adds a ``trip_distance_bucket`` column to the current ``self.df``
              in place (so any other reference to that frame sees it too);
            * afterwards rebinds ``self.df`` to the
              ``PARAMS.DATASET.COLUMNS_TO_USE`` subset.

        Args:
            distance_threshold: split point for the two ``trip_distance``
                buckets. Defaults to 2.8, which yields the labels
                ``"trip_distance<=2.8"`` and ``"trip_distance>2.8"``.

        Returns:
            pandas.DataFrame: the stacked per-group results of
            ``calculate_percentiles``, indexed by the group label.
        """
        ## Question A
        # Collect the pieces and concatenate once at the end; growing a frame
        # from an empty DataFrame inside the loop is slower and relies on
        # concat-with-empty behaviour that recent pandas versions deprecate.
        parts = []
        for group_col in ("VendorID", "passenger_count", "payment_type"):
            summary = self._percentiles_by(self.df, group_col)
            # Prefix each value with its column name so rows from different
            # groupings stay distinguishable once stacked.
            summary[group_col] = summary[group_col].map(
                lambda value: f"{group_col}_{value}"
            )
            parts.append(summary.set_index(group_col))

        ####### Question A.1: percentiles for the two trip_distance buckets
        short_label = f"trip_distance<={distance_threshold}"
        long_label = f"trip_distance>{distance_threshold}"
        # NOTE(review): a missing trip_distance compares False against the
        # threshold and therefore lands in the "greater than" bucket.
        self.df["trip_distance_bucket"] = np.where(
            self.df["trip_distance"] <= distance_threshold, short_label, long_label
        )

        # Long trips first, then short ones, to keep the established row order.
        for label in (long_label, short_label):
            subset = self.df[self.df["trip_distance_bucket"] == label]
            bucket_summary = self._percentiles_by(subset, ["trip_distance_bucket"])
            parts.append(bucket_summary.set_index("trip_distance_bucket"))

        percentile_results = pd.concat(parts)
        self.df = self.df[PARAMS.DATASET.COLUMNS_TO_USE]
        return percentile_results

    @staticmethod
    def save_csv(df):
        """Write ``df`` as CSV to the location configured under ``CONFIG.QA``.

        The target is ``PERCENTILE_DATAFRAME_PATH/PERCENTILE_DATAFRAME_FILE``.
        The directory is created when missing, and the file is written on every
        call. Previously the write only happened when the directory had to be
        created, so later calls silently saved nothing.
        """
        out_dir = f"{CONFIG.QA.PERCENTILE_DATAFRAME_PATH}"
        os.makedirs(out_dir, exist_ok=True)
        df.to_csv(f"{out_dir}/{CONFIG.QA.PERCENTILE_DATAFRAME_FILE}")

In [9]:
# Pipeline: load the dataset, compute the percentile table, hand it to save_csv, display it.
data_obj = Data()
# `df` refers to the same frame object that data_obj.df points to at this moment.
df = data_obj.read_dataset()
# This call adds a `trip_distance_bucket` column to that frame in place and then
# rebinds data_obj.df to the PARAMS.DATASET.COLUMNS_TO_USE subset, so `df` and
# data_obj.df are different objects afterwards.
percentiles = data_obj.calculate_percentiles_for_each_group()
# save_csv targets the CSV path configured under CONFIG.QA.
data_obj.save_csv(percentiles)
# A bare last expression makes the notebook render the table as the cell output.
percentiles

Unnamed: 0,fare_amount_p_5,fare_amount_p_50,fare_amount_p_95,tip_amount_p_5,tip_amount_p_50,tip_amount_p_95,total_amount_p_5,total_amount_p_50,total_amount_p_95
VendorID_1,4.5,9.5,36.0,0.0,1.86,6.55,8.3,14.75,48.3
VendorID_2,4.5,9.5,40.0,0.0,1.96,7.36,8.3,14.76,52.7
VendorID_4,4.0,9.0,35.275,0.0,1.96,7.1175,8.3,14.75,48.65
passenger_count_0,4.0,9.0,37.5,0.0,1.85,6.65,8.3,14.72,49.8
passenger_count_1,4.5,9.5,37.0,0.0,1.95,7.0,8.3,14.75,49.9
passenger_count_2,4.5,9.5,42.0,0.0,1.86,7.2,8.3,14.8,54.5
passenger_count_3,4.5,9.5,42.0,0.0,1.85,7.0,8.3,14.8,54.537
passenger_count_4,4.5,10.0,50.0,0.0,1.58,6.99,8.75,15.3,58.92
passenger_count_5,4.5,9.5,37.5,0.0,1.96,7.18,8.3,14.76,50.52
passenger_count_6,4.5,9.5,38.0,0.0,1.96,7.28,8.3,14.76,51.0
