## Import the required packages

In [43]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tkinter import Tk
from tkinter.filedialog import askopenfilename
from tkinter.filedialog import asksaveasfilename

## I. Data Cleaning

#### Load the raw CSV file

In [2]:
# Verify your working directory
print("Current working directory:", os.getcwd())

# Select your raw CSV for taxi trip records.
# The hidden Tk root exists only to host the file dialog; it is destroyed right
# after the dialog closes so no invisible window outlives this cell.
root = Tk()
root.withdraw()  # Hide the root window
try:
    file_path = askopenfilename(
        parent=root,
        title="Select your local taxi data CSV (ensure it's the raw data)",
    )
finally:
    root.destroy()

# Validate the selection BEFORE deriving anything from it: a cancelled dialog
# returns an empty value, and that case must surface as the FileNotFoundError
# below rather than as an unrelated error from os.path.basename.
if not file_path or not os.path.exists(file_path):
    raise FileNotFoundError("File not found or not selected.")

# Base name of the chosen file; the save cell builds its default output name from it.
original_filename = os.path.basename(file_path)

# Load the raw trip records and report what was read
df = pd.read_csv(file_path)
print("Loaded file:", file_path)
print("Initial shape:", df.shape)

Current working directory: /
Loaded file: /Users/elliekavanagh/Downloads/Data for Taxi Project/Jan_Feb_2023_Yellow_Taxi_Trip_Data.csv
Initial shape: (5980122, 19)


#### Initial Inspection

In [3]:
# Print a labelled summary (columns, dtypes, memory usage) of the raw frame
print("\nData Info:")
df.info()


Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5980122 entries, 0 to 5980121
Data columns (total 19 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   VendorID               int64  
 1   tpep_pickup_datetime   object 
 2   tpep_dropoff_datetime  object 
 3   passenger_count        float64
 4   trip_distance          float64
 5   RatecodeID             float64
 6   store_and_fwd_flag     object 
 7   PULocationID           int64  
 8   DOLocationID           int64  
 9   payment_type           int64  
 10  fare_amount            float64
 11  extra                  float64
 12  mta_tax                float64
 13  tip_amount             float64
 14  tolls_amount           float64
 15  improvement_surcharge  float64
 16  total_amount           float64
 17  congestion_surcharge   float64
 18  airport_fee            float64
dtypes: float64(12), int64(4), object(3)
memory usage: 866.9+ MB


In [4]:
# Show the column labels of the loaded frame
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

In [5]:
# Preview the first few rows of the dataframe to eyeball the raw values
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,01/01/2023 12:00:00 AM,01/01/2023 12:08:00 AM,,1.53,,,42,41,0,12.98,0.0,0.5,0.0,0.0,1.0,14.48,,
1,2,01/01/2023 12:00:05 AM,01/01/2023 12:26:27 AM,1.0,1.32,1.0,N,249,186,2,21.9,1.0,0.5,0.0,0.0,1.0,26.9,2.5,0.0
2,2,01/01/2023 12:00:06 AM,01/01/2023 12:05:44 AM,1.0,1.7,1.0,N,125,68,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
3,2,01/01/2023 12:00:08 AM,01/01/2023 12:11:24 AM,1.0,3.1,1.0,N,42,244,2,16.3,1.0,0.5,0.0,0.0,1.0,18.8,0.0,0.0
4,2,01/01/2023 12:00:09 AM,01/01/2023 12:15:10 AM,1.0,3.8,1.0,N,79,231,1,19.8,1.0,0.5,7.44,0.0,1.0,32.24,2.5,0.0


#### Standardized Timestamps

In [6]:
# Parse the pickup/dropoff strings into datetimes.
# Raw values look like '01/01/2023 12:00:00 AM', which corresponds to the
# parse pattern '%m/%d/%Y %I:%M:%S %p' (12-hour clock with AM/PM marker).
# errors='coerce' turns any value that does not match the pattern into NaT.
for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df[ts_col] = pd.to_datetime(
        df[ts_col],
        format='%m/%d/%Y %I:%M:%S %p',
        errors='coerce',
    )

In [7]:
# Localize both timestamp columns to the NYC timezone.
#   ambiguous='NaT'             -> ambiguous wall-clock times become NaT
#   nonexistent='shift_forward' -> nonexistent wall-clock times move forward
#                                  to the closest existing time
for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df[ts_col] = df[ts_col].dt.tz_localize(
        'America/New_York',
        ambiguous='NaT',
        nonexistent='shift_forward',
    )

In [8]:
# Confirm that timestamps are now timezone-aware.
# Only the pickup column is printed here; the dropoff column was localized
# with the same settings in the previous cell.
print("Timezone info (pickup):", df['tpep_pickup_datetime'].dt.tz)

Timezone info (pickup): America/New_York


#### Trip Duration Calculation

In [9]:
# Trip duration in minutes: (dropoff - pickup) expressed in seconds, divided by 60
df['trip_duration_min'] = (
    df['tpep_dropoff_datetime']
    .sub(df['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

In [10]:
# Keep only rows with a usable duration: present (not NaN) and strictly
# positive. Note this removes zero-length trips as well as negative ones.
df = df[df['trip_duration_min'].notna() & df['trip_duration_min'].gt(0)]

#### Data Type Conversion 

In [11]:
# Numeric conversion: force each column to a numeric dtype; values that cannot
# be parsed become NaN (errors='coerce')
num_cols = ['trip_distance', 'fare_amount', 'tip_amount', 'total_amount']
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Categorical conversion: ID/code columns are stored as pandas categories
cat_cols = ['RatecodeID', 'payment_type', 'VendorID']
for col in cat_cols:
    df[col] = df[col].astype('category')

#### Filter Out Invalid/Extreme Values

In [12]:
# Keep a trip only when ALL of the following hold:
#   - trip distance is strictly between 0 and 100 miles
#   - fare amount is strictly greater than 0
#   - trip duration is under 4 hours (240 minutes)
df = df[
    df['trip_distance'].gt(0)
    & df['trip_distance'].lt(100)
    & df['fare_amount'].gt(0)
    & df['trip_duration_min'].lt(240)
]

#### Add Date and Time Features

In [13]:
# Add pickup date and time features
df['pickup_date'] = df['tpep_pickup_datetime'].dt.date
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
df['pickup_day_of_week'] = df['tpep_pickup_datetime'].dt.dayofweek  # 0 = Monday

# Add the dropoff date.
# This must be taken from the DROPOFF timestamp: a trip that crosses midnight
# ends on a different calendar date than it starts, so reusing the pickup
# timestamp here would just duplicate pickup_date.
# NOTE: the column name 'droppoff_date' (sic) is kept as-is because a later
# cell converts this column by that exact name.
df['droppoff_date'] = df['tpep_dropoff_datetime'].dt.date

#### Check Pickup and Dropoff Location IDs

In [None]:
# Filter invalid PULocationID and DOLocationID: only IDs in the inclusive
# range 1-263 are kept for both pickup and dropoff.
initial_rows = df.shape[0]  # row count before filtering, used for the report below
df = df[(df['PULocationID'].between(1, 263)) & (df['DOLocationID'].between(1, 263))]
print(f"Rows removed by location ID filtering: {initial_rows - df.shape[0]}")
print(f"Shape after location ID filtering: {df.shape}")

#### Load Taxi Zone Lookup File

In [14]:
# Ask the user for the taxi zone lookup CSV via a native file dialog.
# The hidden Tk root exists only to host the dialog; it is destroyed right
# after the dialog closes so no invisible window outlives this cell.
root = Tk()
root.withdraw()  # Hide the root window
try:
    zone_file_path = askopenfilename(parent=root, title="Select the taxi zone lookup CSV")
finally:
    root.destroy()

In [15]:
# Fail fast when the dialog was cancelled or the chosen path does not exist
if not (zone_file_path and os.path.exists(zone_file_path)):
    raise FileNotFoundError("Zone lookup file not found or not selected.")

# Read the lookup table; skipinitialspace=True ignores spaces after delimiters
zones = pd.read_csv(zone_file_path, keep_default_na=True, delimiter=',', skipinitialspace=True)
# Work on a copy so `zones` keeps the frame exactly as loaded
zone_lookup = zones.copy()
print("Loaded zone lookup file:", zone_file_path)
print("Zone file shape:", zones.shape)

Loaded zone lookup file: /Users/elliekavanagh/Downloads/Data for Taxi Project/taxi_zone_lookup.csv
Zone file shape: (265, 4)


In [16]:
# Attach pickup zone attributes: rename the lookup columns to pickup-specific
# names, then left-join on PULocationID so every trip row is preserved.
pickup_lookup = zone_lookup.rename(columns={
    "LocationID": "PULocationID",
    "Zone": "pickup_zone",
    "Borough": "pickup_borough",
    "service_zone": "pickup_service_zone",
})
df = pd.merge(df, pickup_lookup, how="left", on="PULocationID")

In [17]:
# Attach dropoff zone attributes: rename the lookup columns to dropoff-specific
# names, then left-join on DOLocationID so every trip row is preserved.
dropoff_lookup = zone_lookup.rename(columns={
    "LocationID": "DOLocationID",
    "Zone": "dropoff_zone",
    "Borough": "dropoff_borough",
    "service_zone": "dropoff_service_zone",
})
df = pd.merge(df, dropoff_lookup, how="left", on="DOLocationID")

#### Drop Columns

In [18]:
# Columns removed from the cleaned dataset
columns_to_drop = [
    'VendorID',
    'RatecodeID',
    'store_and_fwd_flag',
    'passenger_count',
    'payment_type',
    'extra',
    'mta_tax',
    'tip_amount',
    'tolls_amount',
    'improvement_surcharge',
    'congestion_surcharge',
    'airport_fee',
    'total_amount',   # derived field
    'PULocationID',   # location IDs are dropped since zone/borough
    'DOLocationID',   # columns were merged in above
]
df = df.drop(columns_to_drop, axis=1)

#### Add Additional Features

In [19]:
# Target variable: fare amount divided by trip duration in minutes
df['fare_per_minute'] = df['fare_amount'].div(df['trip_duration_min'])

# Trim extremes: keep rows whose fare_per_minute is strictly positive and
# strictly below the 99th percentile (computed before any row is removed here)
fpm_cutoff = df['fare_per_minute'].quantile(0.99)
df = df[df['fare_per_minute'].gt(0) & df['fare_per_minute'].lt(fpm_cutoff)]

In [20]:
# Trip speed: distance over duration gives miles per minute; times 60 gives mph
df['trip_speed'] = df['trip_distance'].div(df['trip_duration_min'])  # miles per minute
df['trip_speed_mph'] = df['trip_speed'].mul(60)
# Keep only plausible speeds: strictly between 1 and 60 mph
df = df[df['trip_speed_mph'].gt(1) & df['trip_speed_mph'].lt(60)]

In [21]:
# Derive a time-of-day segment label from the pickup hour
def time_of_day(hour):
    """Map an hour of the day to a named time-of-day segment.

    Segments are half-open intervals [start, stop):
        [0, 5)   -> 'Early Morning'
        [5, 10)  -> 'Morning Rush'
        [10, 15) -> 'Midday'
        [15, 19) -> 'Evening Rush'
    Any value that falls in none of these intervals yields 'Night'.
    """
    segments = (
        (0, 5, 'Early Morning'),
        (5, 10, 'Morning Rush'),
        (10, 15, 'Midday'),
        (15, 19, 'Evening Rush'),
    )
    for start, stop, label in segments:
        if start <= hour < stop:
            return label
    return 'Night'
# Label every trip with the segment of its pickup hour
df['time_of_day'] = df['pickup_hour'].map(time_of_day)

In [24]:
# Weekday/weekend label: day-of-week values 5 and above are 'Weekend',
# everything else is 'Weekday'
df['day_type'] = df['pickup_day_of_week'].ge(5).map({True: 'Weekend', False: 'Weekday'})
# Boolean weekend flag (Saturday = 5, Sunday = 6)
df['is_weekend'] = df['pickup_day_of_week'].isin([5, 6])

#### Inspect Cleaned Data Before Saving

In [27]:
# Summary statistics of fare_amount after the cleaning filters
df['fare_amount'].describe()

count    5.781177e+06
mean     1.821540e+01
std      1.571209e+01
min      1.000000e-02
25%      9.300000e+00
50%      1.280000e+01
75%      1.980000e+01
max      5.133000e+02
Name: fare_amount, dtype: float64

In [36]:
# Column and dtype overview of the cleaned frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5781177 entries, 0 to 5842103
Data columns (total 21 columns):
 #   Column                 Dtype                           
---  ------                 -----                           
 0   tpep_pickup_datetime   datetime64[ns, America/New_York]
 1   tpep_dropoff_datetime  datetime64[ns, America/New_York]
 2   trip_distance          float64                         
 3   fare_amount            float64                         
 4   trip_duration_min      float64                         
 5   pickup_date            datetime64[ns]                  
 6   pickup_hour            int32                           
 7   pickup_day_of_week     int32                           
 8   droppoff_date          datetime64[ns]                  
 9   pickup_borough         category                        
 10  pickup_zone            category                        
 11  pickup_service_zone    category                        
 12  dropoff_borough        category  

#### Convert Data Types Again

In [31]:
# Text label columns (zone lookup fields and derived labels) to store as
# pandas categories
cat_cols = [
    'pickup_borough', 'pickup_zone', 'pickup_service_zone',
    'dropoff_borough', 'dropoff_zone', 'dropoff_service_zone',
    'time_of_day', 'day_type',
]

df[cat_cols] = df[cat_cols].astype("category")

In [38]:
# The date columns were created with .dt.date (plain Python date objects);
# convert them to pandas datetimes
for date_col in ('pickup_date', 'droppoff_date'):
    df[date_col] = pd.to_datetime(df[date_col])

#### Save Cleaned Dataframe to CSV

In [44]:
# Suggested output name: the raw file's name prefixed with "Clean_"
default_filename = "Clean_" + original_filename

# Open the save dialog (no default path logic).
# The hidden Tk root exists only to host the dialog; it is destroyed right
# after the dialog closes so no invisible window outlives this cell.
root = Tk()
root.withdraw()  # Hide the root Tkinter window
try:
    save_path = asksaveasfilename(
        parent=root,
        initialfile=default_filename,
        title="Select where to save cleaned taxi data CSV",
        defaultextension=".csv",
        filetypes=[("CSV files", "*.csv")]
    )
finally:
    root.destroy()

# An empty save_path means the dialog was cancelled; nothing is written then
if save_path:
    df.to_csv(save_path, index=False)  # index=False: do not write the row index
    print(f"Cleaned dataset saved to: {save_path}")
    print(f"Final shape: {df.shape}")
else:
    print("Save cancelled, file was not saved.")

Cleaned dataset saved to: /Users/elliekavanagh/Downloads/Data for Taxi Project/Clean_Jan_Feb_2023_Yellow_Taxi_Trip_Data.csv
Final shape: (5781177, 21)
