In [58]:
import pandas as pd
import numpy as np

In [59]:
df = pd.read_csv(r'..\..\datasets\1. Originales\TLC Aggregated Data\data_reports_monthly.csv')

In [60]:
df.columns = [
    'date', 'industry', 'trips_per_day', 'farebox_per_day',
    'unique_drivers', 'unique_vehicles', 'vehicles_per_day',
    'avg_days_vehicles_on_road', 'avg_hours_per_day_per_vehicle',
    'avg_days_drivers_on_road', 'avg_hours_per_day_per_driver',
    'avg_minutes_per_trip', 'percent_of_trips_paid_with_credit_card',
    'trips_per_day_shared'
]

In [61]:
# Columnas incuidas
columns_to_replace = df.columns.difference(['date', 'industry'])
# Reemplazar '-' por '' en las columnas seleccionadas
df[columns_to_replace] = df[columns_to_replace].replace({'-':np.nan,',':'','%': ''}, regex=True)

In [62]:
df['date'] = pd.to_datetime(df['date'], format='%Y-%m')
df['industry'] = df['industry'].astype('category')
df['trips_per_day'] = df['trips_per_day'].astype('Int64')
df['farebox_per_day'] = df['farebox_per_day'].astype('Int64')
df['unique_drivers'] = df['unique_drivers'].astype('Int64')
df['unique_vehicles'] = df['unique_vehicles'].astype('Int64')
df['vehicles_per_day'] = df['vehicles_per_day'].astype('Int64')
df['percent_of_trips_paid_with_credit_card'] = pd.to_numeric(df['percent_of_trips_paid_with_credit_card'], errors='coerce') / 100
df['trips_per_day_shared'] = pd.to_numeric(df['trips_per_day_shared'], errors='coerce') 

In [63]:
# Reemplazar 'Green' y 'Yellow' para igual al datset diario
df['industry'] = df['industry'].replace({'Green': 'Green Taxi', 'Yellow': 'Yellow Taxi'})

  df['industry'] = df['industry'].replace({'Green': 'Green Taxi', 'Yellow': 'Yellow Taxi'})


In [64]:
# Agregar "FHV - Other" a las categorías de la columna 'industry' para reemplazar las que no están en el datset diario  
df['industry'] = df['industry'].cat.add_categories('FHV - Other')

# Luego, realizar la asignación de las industrias
other_industries = ['FHV - Black Car', 'FHV - Livery', 'FHV - Lux Limo']
df.loc[df['industry'].isin(other_industries), 'industry'] = 'FHV - Other'

In [65]:
# Agrupar por 'month_year' y 'industry', y agregar 'FHV - Other' por mes
df_other = df[df['industry'] == 'FHV - Other'].groupby('date').agg({
    'trips_per_day': 'sum',
    'farebox_per_day': 'sum',
    'unique_drivers': 'sum',
    'unique_vehicles': 'sum',
    'vehicles_per_day': 'sum',
    'avg_days_vehicles_on_road': 'mean',
    'avg_hours_per_day_per_vehicle': 'mean',
    'avg_days_drivers_on_road': 'mean',
    'avg_hours_per_day_per_driver': 'mean',
    'avg_minutes_per_trip': 'mean',
    'percent_of_trips_paid_with_credit_card': 'mean',
    'trips_per_day_shared': 'sum'
}).reset_index()

# Reasignar índices y juntar los DataFrames
df_other['industry'] = 'FHV - Other'  # Asegurarse de que la columna 'industry' sea consistente
df = pd.concat([df[df['industry'] != 'FHV - Other'], df_other], ignore_index=True)

In [66]:
df[df['date'] == '2024-08-01']

Unnamed: 0,date,industry,trips_per_day,farebox_per_day,unique_drivers,unique_vehicles,vehicles_per_day,avg_days_vehicles_on_road,avg_hours_per_day_per_vehicle,avg_days_drivers_on_road,avg_hours_per_day_per_driver,avg_minutes_per_trip,percent_of_trips_paid_with_credit_card,trips_per_day_shared
0,2024-08-01,FHV - High Volume,617043,,81306,80570,55030,21.2,6.3,21.3,6.3,19.0,,10185.0
1,2024-08-01,Green Taxi,1654,38632.0,728,694,394,17.6,3.5,17.0,3.4,14.9,0.72,
2,2024-08-01,Yellow Taxi,94732,2360656.0,12276,9125,7822,26.6,7.6,22.6,6.6,16.2,0.81,
540,2024-08-01,FHV - Other,47825,0.0,19842,19783,8831,12.966667,3.533333,13.0,3.466667,38.666667,,0.0


In [67]:
df.to_csv(r'..\..\datasets\2. Depurados\TLC Aggregated Data\data_reports_monthly.csv',index=False)  

---
---
Junto los dos datasets en un solo archivo

In [None]:
df1 = pd.read_csv(r'..\..\datasets\2. Depurados\TLC Aggregated Data\TLC Trip Record Data_viajes_by_industry.csv')
df2 = pd.read_csv(r'..\..\datasets\2. Depurados\TLC Aggregated Data\data_reports_monthly.csv')

In [69]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176 entries, 0 to 175
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   year               176 non-null    int64  
 1   month              176 non-null    int64  
 2   industry           176 non-null    object 
 3   total_trips        176 non-null    int64  
 4   passenger_count    176 non-null    int64  
 5   trip_distance      176 non-null    float64
 6   trip_duration      176 non-null    float64
 7   avg_trip_distance  176 non-null    float64
 8   avg_trip_duration  176 non-null    float64
 9   fare_amount        176 non-null    float64
 10  total_amount       176 non-null    float64
 11  shared_match_flag  176 non-null    int64  
dtypes: float64(6), int64(5), object(1)
memory usage: 16.6+ KB


In [70]:
# Acululamos desde el 2021
df2 = df2[df2['date'] >= '2021-01-01']

In [71]:
# Creamos el campo que vincula los df
df1['date'] = pd.to_datetime(df1['year'].astype(str) + '-' + df1['month'].astype(str), format='%Y-%m')
df1['date'] = pd.to_datetime(df1['date'], errors='coerce')
df2['date'] = pd.to_datetime(df2['date'], errors='coerce')

In [72]:
# Hacemos la únión la unión
df = pd.merge(df2,df1, on=['date', 'industry'], how='outer')

In [73]:
df.head()

Unnamed: 0,date,industry,trips_per_day,farebox_per_day,unique_drivers,unique_vehicles,vehicles_per_day,avg_days_vehicles_on_road,avg_hours_per_day_per_vehicle,avg_days_drivers_on_road,...,month,total_trips,passenger_count,trip_distance,trip_duration,avg_trip_distance,avg_trip_duration,fare_amount,total_amount,shared_match_flag
0,2021-01-01,FHV - High Volume,383951,,47623,47594,28584,18.6,6.9,18.7,...,1,11507828,0,40319280.0,182400000.0,3.64048,15.850081,163194200.0,191768800.0,0
1,2021-01-01,FHV - Other,36850,0.0,10255,10128,5402,14.7,3.833333,14.633333,...,1,181335,0,600032.6,3517330.0,3.633016,19.396861,0.0,0.0,0
2,2021-01-01,Green Taxi,2467,56409.0,991,982,491,15.5,4.1,15.5,...,1,75094,47164,242119.4,1280683.0,3.459244,17.054398,1096283.0,1320949.0,3794
3,2021-01-01,Yellow Taxi,44052,688808.0,5396,4625,3256,21.8,7.5,19.4,...,1,1338358,1752377,3168197.0,15526940.0,2.428338,11.601482,13452060.0,20120130.0,271651
4,2021-02-01,FHV - High Volume,414782,,47208,47272,28639,17.0,7.1,17.1,...,2,11185277,0,38734310.0,188035600.0,3.600413,16.810996,176469800.0,207024500.0,0


In [74]:

# Obtener la cantidad de días en el mes de cada registro
df['days_in_month'] = df['date'].dt.days_in_month
# Calcular total_trips como trips_per_day * days_in_month
df['total_trips'] = df['trips_per_day'] * df['days_in_month']

# Crear el nuevo campo 'farebox_per_day' con valores diferenciados
df['farebox_per_day'] = np.where(
    df['industry'] == 'FHV - High Volume',
    df['total_amount'] / df['days_in_month'],
    df['farebox_per_day']
)

# Crear el nuevo campo 'total_amount' con valores diferenciados
df['total_amount'] = np.where(
    df['industry'] == 'FHV - High Volume',
    df['total_amount'],
    df['farebox_per_day'] * df['days_in_month']
)

# Lo mismo pero para cantidad de viajes compartidos
df['shared_match_flag'] = np.where(
    df['industry'] == 'FHV - High Volume',
    df['trips_per_day_shared'] * df['days_in_month'],
    df['shared_match_flag']
)

# Lo mismo pero para cantidad de viajes compartidos
df['trips_per_day_shared'] = np.where(
    df['industry'] == 'FHV - High Volume',
    df['trips_per_day_shared'],
    df['shared_match_flag'] / df['days_in_month']
)

df['farebox_per_day_per_distance']=df['farebox_per_day']/df['avg_trip_distance']

df['total_co2_emission']=df['avg_trip_distance'] * df['total_trips'] * 400 / 1000000

In [None]:
# Redondear las columnas de tipo DECIMAL antes de cargarlas en SQL
df['farebox_per_day'] = df['farebox_per_day'].round(2)
df['avg_days_vehicles_on_road'] = df['avg_days_vehicles_on_road'].round(2)
df['avg_hours_per_day_per_vehicle'] = df['avg_hours_per_day_per_vehicle'].round(2)
df['avg_days_drivers_on_road'] = df['avg_days_drivers_on_road'].round(2)
df['avg_hours_per_day_per_driver'] = df['avg_hours_per_day_per_driver'].round(2)
df['percent_of_trips_paid_with_credit_card'] = df['percent_of_trips_paid_with_credit_card'].round(2)
df['avg_trip_distance'] = df['avg_trip_distance'].round(2)
df['avg_trip_duration'] = df['avg_trip_duration'].round(2)
df['total_amount'] = df['total_amount'].round(2)
df['farebox_per_day_per_distance'] = df['farebox_per_day_per_distance'].round(4)
df['total_co2_emission'] = df['total_co2_emission'].round(4)

In [76]:
# Empezamos a trabajar con el dataset eliminando columnas o componiendo datos
df = df.drop(columns=['year', 'month','trip_distance','trip_duration','fare_amount','avg_minutes_per_trip'])
df['farebox_per_day'] = df['farebox_per_day'].replace(0, np.nan) 
df['trips_per_day_shared'] = df['trips_per_day_shared'].replace(0, np.nan) 
df['passenger_count'] = df['passenger_count'].replace(0, np.nan) 
df['total_amount'] = df['total_amount'].replace(0, np.nan) 
df['shared_match_flag'] = df['shared_match_flag'].replace(0, np.nan)
df['farebox_per_day_per_distance'] = df['farebox_per_day_per_distance'].replace(0, np.nan)

In [77]:
# Eliminar columnas duplicadas
df = df.loc[:, ~df.columns.duplicated()]
# Guardamos el DataFrame combinado en un archivo CSV
df.to_csv(r'..\..\datasets\2. Depurados\TLC Aggregated Data\merged_taxi_data.csv', index=False)