In [1]:
import pandas as pd
import holidays

In [2]:
# === STEP 1: Define file paths ===
file_1 = "/Users/elliekavanagh/Downloads/Clean_Jan_Feb_Taxi.csv"
file_2 = "/Users/elliekavanagh/Downloads/Clean_March_April_Taxi.csv"

# === STEP 2: Load both files ===
df1 = pd.read_csv(file_1, parse_dates=['tpep_pickup_datetime'])
df2 = pd.read_csv(file_2, parse_dates=['tpep_pickup_datetime'])

In [3]:
# === STEP 3: Combine them ===
df = pd.concat([df1, df2], ignore_index=True)

In [4]:
df

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount,trip_duration_min,pickup_date,pickup_hour,pickup_day_of_week,droppoff_date,pickup_borough,...,pickup_service_zone,dropoff_borough,dropoff_zone,dropoff_service_zone,fare_per_minute,trip_speed,trip_speed_mph,time_of_day,day_type,is_weekend
0,2023-02-01 00:00:00-05:00,2023-02-01 00:15:00-05:00,3.10,16.83,15.000000,2023-02-01,0,2,2023-02-01,Manhattan,...,Yellow Zone,Manhattan,East Village,Yellow Zone,1.122000,0.206667,12.400000,Early Morning,Weekday,False
1,2023-02-01 00:00:01-05:00,2023-02-01 00:33:41-05:00,17.31,70.00,33.666667,2023-02-01,0,2,2023-02-01,Queens,...,Airports,Manhattan,Murray Hill,Yellow Zone,2.079208,0.514158,30.849505,Early Morning,Weekday,False
2,2023-02-01 00:00:02-05:00,2023-02-01 00:11:08-05:00,1.91,12.80,11.100000,2023-02-01,0,2,2023-02-01,Manhattan,...,Yellow Zone,Manhattan,Clinton East,Yellow Zone,1.153153,0.172072,10.324324,Early Morning,Weekday,False
3,2023-02-01 00:00:04-05:00,2023-02-01 00:25:20-05:00,6.40,29.60,25.266667,2023-02-01,0,2,2023-02-01,Manhattan,...,Yellow Zone,Brooklyn,Park Slope,Boro Zone,1.171504,0.253298,15.197889,Early Morning,Weekday,False
4,2023-02-01 00:00:07-05:00,2023-02-01 00:03:10-05:00,1.12,6.50,3.050000,2023-02-01,0,2,2023-02-01,Manhattan,...,Yellow Zone,Manhattan,East Village,Yellow Zone,2.131148,0.367213,22.032787,Early Morning,Weekday,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12060286,2023-03-31 23:59:56-04:00,2023-04-01 00:09:33-04:00,1.47,10.70,9.616667,2023-03-31,23,4,2023-03-31,Manhattan,...,Yellow Zone,Manhattan,Midtown Center,Yellow Zone,1.112652,0.152860,9.171577,Night,Weekday,False
12060287,2023-03-31 23:59:58-04:00,2023-04-01 00:18:19-04:00,2.73,18.40,18.350000,2023-03-31,23,4,2023-03-31,Manhattan,...,Yellow Zone,Manhattan,Garment District,Yellow Zone,1.002725,0.148774,8.926431,Night,Weekday,False
12060288,2023-03-31 23:59:59-04:00,2023-04-01 00:10:01-04:00,0.94,10.00,10.033333,2023-03-31,23,4,2023-03-31,Manhattan,...,Yellow Zone,Manhattan,West Village,Yellow Zone,0.996678,0.093688,5.621262,Night,Weekday,False
12060289,2023-03-31 23:59:59-04:00,2023-04-01 00:22:25-04:00,5.20,25.40,22.433333,2023-03-31,23,4,2023-03-31,Manhattan,...,Yellow Zone,Manhattan,Upper East Side North,Yellow Zone,1.132244,0.231798,13.907875,Night,Weekday,False


In [5]:
# === STEP 4: Clean and convert ===
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'], errors='coerce')
df = df.dropna(subset=['tpep_pickup_datetime'])



In [6]:
print(df['tpep_pickup_datetime'].dtype)  # Should now say datetime64[ns]



datetime64[ns, UTC-05:00]


In [7]:
# === STEP 5: Create date and hour columns ===
df['pickup_date'] = df['tpep_pickup_datetime'].dt.date
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
df['pickup_month'] = pd.to_datetime(df['pickup_date']).dt.month
df['pickup_day_of_week'] = pd.to_datetime(df['pickup_date']).dt.dayofweek

# === 4. Add weekend, holiday, day_type ===
df['is_weekend'] = df['pickup_day_of_week'].isin([5, 6])  # Sat/Sun
us_holidays = holidays.US(years=[2023])
df['is_holiday'] = df['pickup_date'].isin(us_holidays)
df['day_type'] = df['is_weekend'].map({True: 'Weekend', False: 'Weekday'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pickup_date'] = df['tpep_pickup_datetime'].dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pickup_month'] = pd.to_datetime(df['pickup_date']).dt.month
A value is trying to be set on 

In [8]:
# === 5. Add time_of_day buckets ===
def get_time_of_day(hour):
    if 0 <= hour < 5:
        return 'Early Morning'
    elif 5 <= hour < 10:
        return 'Morning Rush'
    elif 10 <= hour < 15:
        return 'Midday'
    elif 15 <= hour < 29:
        return 'Evening Rush'
    else:
        return 'Night'


df['time_of_day'] = df['pickup_hour'].apply(get_time_of_day)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time_of_day'] = df['pickup_hour'].apply(get_time_of_day)


In [9]:
# === STEP 6: Group by zone-date-hour ===
trip_counts = df.groupby([
    'pickup_date', 'pickup_hour', 'pickup_zone', 'pickup_borough',
    'day_type', 'pickup_month', 'pickup_day_of_week', 'is_weekend',
    'time_of_day', 'is_holiday'
]).size().reset_index(name='trip_count')
trip_counts['pickup_date'] = pd.to_datetime(trip_counts['pickup_date'])
trip_counts['pickup_month'] = trip_counts['pickup_date'].dt.to_period('M')

# === STEP 7: Save combined CSV ===
output_path = "/Users/elliekavanagh/Downloads/Trip_Counts_Jan_to_Apr.csv"
trip_counts.to_csv(output_path, index=False)

In [10]:

print(f"✅ Combined and aggregated data saved to: {output_path}")

✅ Combined and aggregated data saved to: /Users/elliekavanagh/Downloads/Trip_Counts_Jan_to_Apr.csv


In [11]:
trip_counts

Unnamed: 0,pickup_date,pickup_hour,pickup_zone,pickup_borough,day_type,pickup_month,pickup_day_of_week,is_weekend,time_of_day,is_holiday,trip_count
0,2023-01-01,0,Alphabet City,Manhattan,Weekend,2023-01,6,True,Early Morning,True,19
1,2023-01-01,0,Astoria,Queens,Weekend,2023-01,6,True,Early Morning,True,3
2,2023-01-01,0,Battery Park,Manhattan,Weekend,2023-01,6,True,Early Morning,True,1
3,2023-01-01,0,Battery Park City,Manhattan,Weekend,2023-01,6,True,Early Morning,True,14
4,2023-01-01,0,Bloomingdale,Manhattan,Weekend,2023-01,6,True,Early Morning,True,20
...,...,...,...,...,...,...,...,...,...,...,...
147206,2023-03-12,1,Williamsburg (South Side),Brooklyn,Weekend,2023-03,6,True,Early Morning,False,10
147207,2023-03-12,1,Woodside,Queens,Weekend,2023-03,6,True,Early Morning,False,2
147208,2023-03-12,1,World Trade Center,Manhattan,Weekend,2023-03,6,True,Early Morning,False,11
147209,2023-03-12,1,Yorkville East,Manhattan,Weekend,2023-03,6,True,Early Morning,False,13
