In [409]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import KNNImputer
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# 1. CARGA Y EXPLORACIÓN INICIAL DEL DATASET
# ============================================================================
# Banner for the loading / initial-exploration stage.
print("="*60)
print("1. CARGA Y EXPLORACIÓN INICIAL")
print("="*60)

# Load the raw data
df = pd.read_csv('uber_fares.csv')
print(f"Shape del dataset: {df.shape}")
print("\nPrimeras 5 filas:")
print(df.head())
print("\nInformación del dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
print("\nEstadísticas descriptivas:")
print(df.describe())

# TODO: add a more detailed exploratory analysis
# - Check for null values
print("\nValores nulos por columna:")
print(df.isnull().sum())
print("\nEliminado de valores faltantes:")
# Drop every row that contains at least one missing value (reassignment
# instead of inplace=True; the resulting frame is the same).
df = df.dropna()
print(f"Nuevo shape del dataset: {df.shape}")
# - Detect outliers
# - Initial visualizations

# ============================================================================
# 2. PROCESAMIENTO Y FEATURE ENGINEERING
# ============================================================================
# Banner for the preprocessing / feature-engineering stage.
print("\n" + "="*60)
print("2. PROCESAMIENTO Y FEATURE ENGINEERING")
print("="*60)

# 2.1 Date handling: parse pickup_datetime and truncate it to whole seconds.
df['date'] = pd.to_datetime(df['pickup_datetime']).dt.floor('s')

# Basic calendar components extracted from the parsed timestamp.
df['hour'] = df['date'].dt.hour
df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month

# Cyclic encoding: project each periodic component onto the unit circle so the
# end of a cycle sits next to its start. `offset` shifts 1-based months to 0-based.
for column, period, offset in (('hour', 24, 0), ('day_of_week', 7, 0), ('month', 12, 1)):
    angle = 2 * np.pi * (df[column] - offset) / period
    df[f'{column}_sin'] = np.sin(angle)
    df[f'{column}_cos'] = np.cos(angle)

# Context flag: 1 when day_of_week is 5 or 6, otherwise 0.
df['is_weekend'] = df['day_of_week'].isin((5, 6)).astype(int)

# Demand per hour of day: count the trips in each hour and attach that count
# to every row through a left merge on 'hour'.
# NOTE(review): the counts are computed over the whole frame at this point,
# i.e. before any later filtering or train/test split — confirm that is intended.
hourly_trip_counts = (
    df.groupby('hour')
      .size()
      .reset_index(name='trips_per_hour')
)
df = df.merge(hourly_trip_counts, on='hour', how='left')

# 33rd / 66th percentiles of the hourly counts; used as demand-tier cut-offs.
q33, q66 = hourly_trip_counts['trips_per_hour'].quantile([0.33, 0.66])

def classify_demand(trips):
    """
    Map an hourly trip count to a demand tier.

    Returns 'low' when trips <= q33, 'medium' when trips <= q66 and 'high'
    otherwise. q33 and q66 are read from module-level variables.
    """
    # Check the tiers from the lowest upper bound upwards; the first bound that
    # is not exceeded decides the label.
    for upper_bound, label in ((q33, 'low'), (q66, 'medium')):
        if trips <= upper_bound:
            return label
    return 'high'

# Label every trip with the demand tier of its pickup hour.
df['demand_level'] = df['trips_per_hour'].map(classify_demand)

# One-hot encode the tier into demand_<tier> columns and append them to df.
demand_dummies = pd.get_dummies(df['demand_level'], prefix='demand')
df = pd.concat((df, demand_dummies), axis='columns')

# Boolean flag: the pickup hour's trip count is above the 66th-percentile cut-off.
df['is_peak_hour'] = df['trips_per_hour'].gt(q66)

# 2.2 Cálculo de distancia (Fórmula de Haversine)
def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Great-circle distance between two points on Earth (Haversine formula).

    Parameters
    ----------
    lat1, lon1 : scalar or array-like
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : scalar or array-like
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    Distance in kilometres (scalar or array, following the inputs).
    """
    R = 6371  # Earth radius in km

    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
    # points, and arcsin of a value > 1 is NaN. Cap it at 1; NaN and negative
    # values (invalid inputs) are deliberately left untouched.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    distance = R * c

    return distance

# Haversine distance between each trip's pickup and dropoff coordinates.
df['distance'] = haversine_distance(
    lat1=df['pickup_latitude'],
    lon1=df['pickup_longitude'],
    lat2=df['dropoff_latitude'],
    lon2=df['dropoff_longitude'],
)



1. CARGA Y EXPLORACIÓN INICIAL
Shape del dataset: (200000, 9)

Primeras 5 filas:
        key                           date  fare_amount  \
0  24238194    2015-05-07 19:52:06.0000003          7.5   
1  27835199    2009-07-17 20:04:56.0000002          7.7   
2  44984355   2009-08-24 21:45:00.00000061         12.9   
3  25894730    2009-06-26 08:22:21.0000001          5.3   
4  17610152  2014-08-28 17:47:00.000000188         16.0   

           pickup_datetime  pickup_longitude  pickup_latitude  \
0  2015-05-07 19:52:06 UTC        -73.999817        40.738354   
1  2009-07-17 20:04:56 UTC        -73.994355        40.728225   
2  2009-08-24 21:45:00 UTC        -74.005043        40.740770   
3  2009-06-26 08:22:21 UTC        -73.976124        40.790844   
4  2014-08-28 17:47:00 UTC        -73.925023        40.744085   

   dropoff_longitude  dropoff_latitude  passenger_count  
0         -73.999512         40.723217                1  
1         -73.994710         40.750325                1  

In [410]:
# If this is an NYC dataset, filter by valid geographic ranges
def filtrar_coordenadas_nyc(df, lon_min=-74.3, lon_max=-73.7, lat_min=40.5, lat_max=40.9):
    """
    Keep only the rows whose pickup AND dropoff points lie inside a bounding box.

    Parameters
    ----------
    df : DataFrame with the columns pickup_longitude, pickup_latitude,
        dropoff_longitude and dropoff_latitude.
    lon_min, lon_max : inclusive longitude limits (defaults: -74.3 and -73.7).
    lat_min, lat_max : inclusive latitude limits (defaults: 40.5 and 40.9).

    Returns
    -------
    The subset of `df` (original index preserved) where all four coordinates
    are within the limits. Rows with a missing coordinate are excluded because
    the range check is False for NaN.
    """
    # Series.between is inclusive on both ends by default, matching >= / <=.
    mask = (
        df['pickup_longitude'].between(lon_min, lon_max)
        & df['pickup_latitude'].between(lat_min, lat_max)
        & df['dropoff_longitude'].between(lon_min, lon_max)
        & df['dropoff_latitude'].between(lat_min, lat_max)
    )
    return df[mask]

# Keep only the trips whose pickup and dropoff both fall inside the bounding box.
df = filtrar_coordenadas_nyc(df)

In [411]:
# plt.figure(figsize=(10,10))
# sns.lmplot(x='fare_amount',y='distance',data=df)
# plt.show()

In [412]:
# Discard trips longer than 75 km
df = df.loc[df['distance'].le(75)]
print(f"Shape después de filtrar distancias absurdas: {df.shape}")

# Discard trips shorter than 0.1 km
df = df.loc[df['distance'].ge(0.1)]
print(f"Shape después de filtrar distance < 0.1: {df.shape}")

# Discard fares that are zero or negative
df = df.loc[df['fare_amount'].gt(0)]
print(f"Shape después de filtrar fare_amount <= 0: {df.shape}")

Shape después de filtrar distancias absurdas: (195489, 26)
Shape después de filtrar distance < 0.1: (192378, 26)
Shape después de filtrar fare_amount <= 0: (192363, 26)


In [413]:
# plt.figure(figsize=(10,10))
# sns.lmplot(x='fare_amount',y='distance',data=df)
# plt.show()

In [414]:
# Drop trips that are cheap (fare_amount < 10) yet long (distance > 10 km).
# NOTE(review): the earlier comment and message said "< 6" while the code has
# always filtered with "< 10"; the text now reflects the threshold that is
# actually applied — confirm 10 is the intended value.
df = df[~((df['fare_amount'] < 10) & (df['distance'] > 10))]
print(f"Shape después de filtrar fare_amount < 10 y distance > 10: {df.shape}")

# Drop trips that are short (distance < 5 km) yet expensive (fare_amount > 50).
# The message previously said "distance < 1"; it now matches the code.
df = df[~((df['distance'] < 5) & (df['fare_amount'] > 50))]
print(f"Shape después de filtrar distance < 5 y fare_amount > 50: {df.shape}")

Shape después de filtrar fare_amount < 6 y distance > 10: (192250, 26)
Shape después de filtrar distance < 1 y fare_amount > 50: (192142, 26)


In [415]:
# plt.figure(figsize=(10,10))
# sns.lmplot(x='fare_amount',y='distance',data=df)
# plt.show()

In [416]:
# Bare expression: renders the current (filtered) DataFrame as the cell output.
df

Unnamed: 0,key,date,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,...,month_sin,month_cos,is_weekend,trips_per_hour,demand_level,demand_high,demand_low,demand_medium,is_peak_hour,distance
0,24238194,2015-05-07 19:52:06+00:00,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,19,...,8.660254e-01,-5.000000e-01,0,12605,high,True,False,False,True,1.683323
1,27835199,2009-07-17 20:04:56+00:00,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1,20,...,1.224647e-16,-1.000000e+00,0,11755,high,True,False,False,True,2.457590
2,44984355,2009-08-24 21:45:00+00:00,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1,21,...,-5.000000e-01,-8.660254e-01,0,11446,high,True,False,False,True,5.036377
3,25894730,2009-06-26 08:22:21+00:00,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,8,...,5.000000e-01,-8.660254e-01,0,9075,medium,False,False,True,False,1.661683
4,17610152,2014-08-28 17:47:00+00:00,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,17,...,-5.000000e-01,-8.660254e-01,0,9758,medium,False,False,True,False,4.475450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199994,42598914,2012-10-28 10:49:00+00:00,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1,10,...,-1.000000e+00,-1.836970e-16,1,8944,medium,False,False,True,False,0.112210
199995,16382965,2014-03-14 01:09:00+00:00,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1,1,...,8.660254e-01,5.000000e-01,0,5908,low,False,True,False,False,1.875050
199996,27804658,2009-06-29 00:42:00+00:00,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2,0,...,5.000000e-01,-8.660254e-01,0,7844,low,False,True,False,False,12.850319
199997,20259894,2015-05-20 14:56:25+00:00,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1,14,...,8.660254e-01,-5.000000e-01,0,9749,medium,False,False,True,False,3.539715


In [417]:
# Remove the columns that will not be used as model inputs. Reassigning the
# result (instead of inplace=True) leaves df with exactly the same contents.
df = df.drop(columns=[
    # identifiers and raw timestamps
    'key', 'date', 'pickup_datetime',
    'passenger_count',
    # raw calendar components (their sin/cos encodings are kept)
    'hour', 'day_of_week', 'month',
    # demand helpers; demand_low is presumably dropped as the reference
    # category of the one-hot encoding — TODO confirm
    'demand_level', 'demand_low', 'trips_per_hour', 'is_peak_hour',
])

In [418]:
# List the columns that remain after the drop (target plus features).
df.columns.tolist()

['fare_amount',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'hour_sin',
 'hour_cos',
 'day_of_week_sin',
 'day_of_week_cos',
 'month_sin',
 'month_cos',
 'is_weekend',
 'demand_high',
 'demand_medium',
 'distance']

In [419]:
# División de los datos en train y test

# Target is the fare; every remaining column is a feature.
y = df['fare_amount']
X = df.drop(columns='fare_amount')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [420]:
# Bare expression: shows the held-out target values.
y_test

31541      7.0
104916    17.7
46495     14.1
84732      6.9
129854    10.5
          ... 
167288    23.7
40694     22.0
138468    12.5
97225     19.5
136374     5.7
Name: fare_amount, Length: 38429, dtype: float64

In [421]:
# Bare expression: shows the training feature matrix before scaling.
X_train

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,hour_sin,hour_cos,day_of_week_sin,day_of_week_cos,month_sin,month_cos,is_weekend,demand_high,demand_medium,distance
97002,-74.014618,40.714020,-73.991521,40.740045,-0.965926,-2.588190e-01,-0.781831,0.623490,-1.000000e+00,-1.836970e-16,1,False,True,3.487470
176731,-73.976145,40.788177,-73.977835,40.763020,-0.258819,-9.659258e-01,0.433884,-0.900969,-8.660254e-01,-5.000000e-01,0,True,False,2.800948
149013,-73.996895,40.724592,-73.995520,40.749452,-0.707107,-7.071068e-01,0.781831,0.623490,-8.660254e-01,-5.000000e-01,0,False,True,2.766732
7108,-73.948715,40.782158,-73.945572,40.777054,0.965926,-2.588190e-01,-0.433884,-0.900969,8.660254e-01,5.000000e-01,0,False,False,0.626224
101372,-73.987473,40.765750,-73.994625,40.750508,-0.707107,7.071068e-01,0.000000,1.000000,8.660254e-01,5.000000e-01,0,True,False,1.798704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124808,-73.989563,40.733330,-73.865891,40.771141,-0.866025,-5.000000e-01,-0.781831,0.623490,1.000000e+00,6.123234e-17,1,False,True,11.233938
107943,-73.981292,40.780722,-73.961571,40.798521,-1.000000,-1.836970e-16,0.781831,0.623490,5.000000e-01,-8.660254e-01,0,True,False,2.583315
137329,-73.985855,40.767102,-73.981375,40.764977,0.866025,-5.000000e-01,-0.433884,-0.900969,-5.000000e-01,8.660254e-01,0,False,True,0.445177
152909,-73.954927,40.773488,-73.979781,40.771111,0.258819,-9.659258e-01,-0.974928,-0.222521,8.660254e-01,-5.000000e-01,1,False,True,2.109557


In [422]:
# escalamos las variables numéricas, aplicamos StandardScaler, fit_transform al train y transform al test
# Como la regularización de Ridge y Lasso operan sobre las magnitudes de los parámetros, 
# es necesario estandarizar las
# features para que tengan igual rango de escala.


# Continuous features that get standardised.
columnas_a_escalar = [
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'distance',
]
# Features that are passed through unchanged.
columnas_no_escalar = [
    'hour_sin', 'hour_cos',                 # cyclic: between -1 and 1
    'day_of_week_sin', 'day_of_week_cos',   # cyclic: between -1 and 1
    'month_sin', 'month_cos',               # cyclic: between -1 and 1
    'is_weekend',                           # dummy: 0 or 1
    'demand_high', 'demand_medium',         # dummies: 0 or 1
]

# Split each partition into the part to scale and the part to leave as is.
X_train_to_scale, X_train_no_scale = X_train[columnas_a_escalar], X_train[columnas_no_escalar]
X_test_to_scale, X_test_no_scale = X_test[columnas_a_escalar], X_test[columnas_no_escalar]

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows. The arrays it returns are wrapped back into DataFrames that
# carry the original column names and row index.
scaler = StandardScaler()
X_train_scaled_part = pd.DataFrame(
    scaler.fit_transform(X_train_to_scale),
    columns=columnas_a_escalar,
    index=X_train.index,
)
X_test_scaled_part = pd.DataFrame(
    scaler.transform(X_test_to_scale),
    columns=columnas_a_escalar,
    index=X_test.index,
)

# Re-attach the untouched columns and restore the original column order.
X_train_final = pd.concat([X_train_scaled_part, X_train_no_scale], axis=1)[X_train.columns]
X_test_final = pd.concat([X_test_scaled_part, X_test_no_scale], axis=1)[X_test.columns]

In [427]:
# Show the two training rows with the largest scaled pickup_longitude
# (rows sorted by that column in descending order).
X_train_final.sort_values(by='pickup_longitude', ascending=False).head(2)



Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,hour_sin,hour_cos,day_of_week_sin,day_of_week_cos,month_sin,month_cos,is_weekend,demand_high,demand_medium,distance
178847,8.096652,0.046481,8.190129,0.581352,-0.866025,0.5,0.433884,-0.900969,-0.5,0.866025,0,True,False,-0.418807
198877,7.730714,-2.039537,6.546089,-0.13581,-0.707107,-0.707107,0.433884,-0.900969,-0.866025,0.5,0,False,True,0.904652


In [428]:
# Esto debería confirmar que el escalado funcionó:
# Scaled columns: report mean and standard deviation of each one.
print("=== VERIFICACIÓN DEL ESCALADO ===")
for col in columnas_a_escalar:
    media = X_train_final[col].mean()
    std = X_train_final[col].std()
    # One print call, one argument per output line; the trailing "" yields the
    # blank separator line.
    print(
        f"{col}:",
        f"  Media: {media:.6f} (debería ser ≈ 0)",
        f"  Std: {std:.6f} (debería ser ≈ 1)",
        "",
        sep="\n",
    )

# Unscaled columns: report the observed [min, max] range of each one.
print("=== VARIABLES NO ESCALADAS ===")
for col in columnas_no_escalar:
    min_val, max_val = X_train_final[col].min(), X_train_final[col].max()
    print(f"{col}: rango [{min_val:.3f}, {max_val:.3f}]")

=== VERIFICACIÓN DEL ESCALADO ===
pickup_longitude:
  Media: -0.000000 (debería ser ≈ 0)
  Std: 1.000003 (debería ser ≈ 1)

pickup_latitude:
  Media: 0.000000 (debería ser ≈ 0)
  Std: 1.000003 (debería ser ≈ 1)

dropoff_longitude:
  Media: 0.000000 (debería ser ≈ 0)
  Std: 1.000003 (debería ser ≈ 1)

dropoff_latitude:
  Media: -0.000000 (debería ser ≈ 0)
  Std: 1.000003 (debería ser ≈ 1)

distance:
  Media: 0.000000 (debería ser ≈ 0)
  Std: 1.000003 (debería ser ≈ 1)

=== VARIABLES NO ESCALADAS ===
hour_sin: rango [-1.000, 1.000]
hour_cos: rango [-1.000, 1.000]
day_of_week_sin: rango [-0.975, 0.975]
day_of_week_cos: rango [-0.901, 1.000]
month_sin: rango [-1.000, 1.000]
month_cos: rango [-1.000, 1.000]
is_weekend: rango [0.000, 1.000]
demand_high: rango [0.000, 1.000]
demand_medium: rango [0.000, 1.000]


In [424]:
# Verificar la distribución de coordenadas
# Print min, max, median and the 99th / 1st percentiles of each coordinate column.
for col in ('pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'):
    print(
        f"\n{col}:",
        f"  Min: {df[col].min()}",
        f"  Max: {df[col].max()}",
        f"  Mediana: {df[col].median()}",
        f"  Percentil 99: {df[col].quantile(0.99)}",
        f"  Percentil 1: {df[col].quantile(0.01)}",
        sep="\n",
    )


pickup_longitude:
  Min: -74.299012
  Max: -73.702735
  Mediana: -73.982157
  Percentil 99: -73.78817323
  Percentil 1: -74.01418477

pickup_latitude:
  Min: 40.508697
  Max: 40.89714
  Mediana: 40.753332
  Percentil 99: 40.80570954
  Percentil 1: 40.645628

dropoff_longitude:
  Min: -74.29983299999999
  Max: -73.70065
  Mediana: -73.98059872485351
  Percentil 99: -73.81210409
  Percentil 1: -74.01502259

dropoff_latitude:
  Min: 40.511859
  Max: 40.8999137878418
  Mediana: 40.753772
  Percentil 99: 40.8264518
  Percentil 1: 40.64705823
