In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import KNNImputer
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# 1. DATASET LOADING AND INITIAL EXPLORATION
# ============================================================================
print("="*60)
print("1. CARGA Y EXPLORACIÓN INICIAL")
print("="*60)

# Load the raw data from a CSV file located in the working directory.
df = pd.read_csv('uber_fares.csv')
print(f"Shape del dataset: {df.shape}")
print("\nPrimeras 5 filas:")
print(df.head())
print("\nInformación del dataset:")
# DataFrame.info() writes its report to stdout by itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nEstadísticas descriptivas:")
print(df.describe())

# TODO: add a more detailed exploratory analysis
# - Check for null values
print("\nValores nulos por columna:")
print(df.isnull().sum())
print("\nEliminado de valores faltantes:")
# Drop, in place, every row that has at least one missing value.
df.dropna(inplace=True)
print(f"Nuevo shape del dataset: {df.shape}")
# - Detect outliers
# - Initial visualizations

# ============================================================================
# 2. PROCESSING AND FEATURE ENGINEERING
# ============================================================================
print("\n" + "="*60)
print("2. PROCESAMIENTO Y FEATURE ENGINEERING")
print("="*60)

# 2.1 Date processing: parse the pickup timestamp and truncate it to whole
# seconds.
df['date'] = pd.to_datetime(df['pickup_datetime']).dt.floor('s')

# Basic calendar components taken from the parsed timestamp.
for component, values in (
    ('hour', df['date'].dt.hour),
    ('day_of_week', df['date'].dt.dayofweek),
    ('month', df['date'].dt.month),
):
    df[component] = values

# Cyclic (sin/cos) encoding. Each entry is (column, period, offset); month is
# shifted by 1 so that it starts at 0 like the other two components.
for column, period, offset in (
    ('hour', 24, 0),
    ('day_of_week', 7, 0),
    ('month', 12, 1),
):
    angle = 2 * np.pi * (df[column] - offset) / period
    df[f'{column}_sin'] = np.sin(angle)
    df[f'{column}_cos'] = np.cos(angle)

# Context flag: 1 when day_of_week is 5 or 6, otherwise 0.
df['is_weekend'] = df['day_of_week'].ge(5).astype(int)

# Demand per hour of day: number of rows for each hour, attached to every
# trip through a left merge on `hour`.
hourly_trip_counts = (
    df.groupby('hour')
    .size()
    .rename('trips_per_hour')
    .reset_index()
)
df = df.merge(hourly_trip_counts, how='left', on='hour')

# 33rd / 66th percentile of the per-hour counts, used as demand cut-offs.
trips_by_hour = hourly_trip_counts['trips_per_hour']
q33 = trips_by_hour.quantile(0.33)
q66 = trips_by_hour.quantile(0.66)

def classify_demand(trips, low_cutoff=None, high_cutoff=None):
    """Classify a trip count as 'low', 'medium' or 'high' demand.

    Args:
        trips: Number of trips to classify.
        low_cutoff: Inclusive upper bound of the 'low' class. When omitted,
            the module-level ``q33`` is used.
        high_cutoff: Inclusive upper bound of the 'medium' class. When
            omitted, the module-level ``q66`` is used.

    Returns:
        'low' if ``trips <= low_cutoff``, 'medium' if
        ``trips <= high_cutoff``, otherwise 'high'.
    """
    # Fall back to the globals only when no explicit threshold is given, so
    # the single-argument call used with Series.apply keeps working.
    if low_cutoff is None:
        low_cutoff = q33
    if high_cutoff is None:
        high_cutoff = q66

    if trips <= low_cutoff:
        return 'low'
    if trips <= high_cutoff:
        return 'medium'
    return 'high'

# Label every trip with the demand class of its hourly trip count.
df['demand_level'] = df['trips_per_hour'].apply(classify_demand)

# One-hot encode the label into demand_* indicator columns and append them
# column-wise to the frame.
df = pd.concat(
    (df, pd.get_dummies(df['demand_level'], prefix='demand')),
    axis='columns',
)

# Boolean flag: the hourly trip count is above the 66th-percentile cut-off.
df['is_peak_hour'] = df['trips_per_hour'].gt(q66)

# 2.2 Distance computation (Haversine formula)
def haversine_distance(lat1, lon1, lat2, lon2, radius=6371):
    """Return the great-circle distance between two points on a sphere.

    Uses the Haversine formula. Inputs may be scalars or array-likes
    (NumPy arrays, pandas Series); they are combined element-wise.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.
        radius: Sphere radius; the result is in the same unit. Defaults to
            6371, the Earth's radius in km.

    Returns:
        The distance, in the unit of ``radius``.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make arcsin return NaN; clamp before the root.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return radius * c

# Pickup-to-dropoff distance for every trip, computed by haversine_distance.
df['distance'] = haversine_distance(
    lat1=df['pickup_latitude'],
    lon1=df['pickup_longitude'],
    lat2=df['dropoff_latitude'],
    lon2=df['dropoff_longitude'],
)

# 2.3 Final feature selection, assembled group by group (order preserved).
# NOTE(review): `is_peak_hour` and `demand_high` are both derived from
# `trips_per_hour > q66`, so they appear to carry the same information —
# confirm whether both are wanted as model inputs.
final_features = (
    # raw coordinates
    ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
    # trip descriptors
    + ['distance', 'passenger_count']
    # cyclic time encodings
    + ['hour_sin', 'hour_cos']
    + ['day_of_week_sin', 'day_of_week_cos']
    + ['month_sin', 'month_cos']
    # context / demand indicators
    + ['is_weekend', 'is_peak_hour']
    + ['demand_high', 'demand_medium']
)

print(f"Features seleccionadas: {len(final_features)}")
print(final_features)

# ============================================================================
# 3. DATASET SPLIT AND SCALING
# ============================================================================
print("\n" + "="*60)
print("3. DIVISIÓN DEL DATASET Y ESCALADO")
print("="*60)

# Target vector and feature matrix (copies, so `df` is not modified later).
y = df['fare_amount'].copy()
X = df[final_features].copy()

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Only these columns are standardised; the remaining features are left as is.
variables_to_scale = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'distance',
    'passenger_count',
]

# Fit the scaler on the training rows only, then apply the same transform to
# both splits.
scaler = StandardScaler().fit(X_train[variables_to_scale])

X_train_scaled = X_train.copy()
X_train_scaled[variables_to_scale] = scaler.transform(X_train[variables_to_scale])

X_test_scaled = X_test.copy()
X_test_scaled[variables_to_scale] = scaler.transform(X_test[variables_to_scale])

print(f"Train shape: {X_train_scaled.shape}")
print(f"Test shape: {X_test_scaled.shape}")

1. CARGA Y EXPLORACIÓN INICIAL
Shape del dataset: (200000, 9)

Primeras 5 filas:
        key                           date  fare_amount  \
0  24238194    2015-05-07 19:52:06.0000003          7.5   
1  27835199    2009-07-17 20:04:56.0000002          7.7   
2  44984355   2009-08-24 21:45:00.00000061         12.9   
3  25894730    2009-06-26 08:22:21.0000001          5.3   
4  17610152  2014-08-28 17:47:00.000000188         16.0   

           pickup_datetime  pickup_longitude  pickup_latitude  \
0  2015-05-07 19:52:06 UTC        -73.999817        40.738354   
1  2009-07-17 20:04:56 UTC        -73.994355        40.728225   
2  2009-08-24 21:45:00 UTC        -74.005043        40.740770   
3  2009-06-26 08:22:21 UTC        -73.976124        40.790844   
4  2014-08-28 17:47:00 UTC        -73.925023        40.744085   

   dropoff_longitude  dropoff_latitude  passenger_count  
0         -73.999512         40.723217                1  
1         -73.994710         40.750325                1  

In [6]:
df  # notebook cell: bare expression, rendered as the cell's output below

Unnamed: 0,key,date,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,...,month_sin,month_cos,is_weekend,trips_per_hour,demand_level,demand_high,demand_low,demand_medium,is_peak_hour,distance
0,24238194,2015-05-07 19:52:06+00:00,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,19,...,8.660254e-01,-5.000000e-01,0,12605,high,True,False,False,True,1.683323
1,27835199,2009-07-17 20:04:56+00:00,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1,20,...,1.224647e-16,-1.000000e+00,0,11755,high,True,False,False,True,2.457590
2,44984355,2009-08-24 21:45:00+00:00,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1,21,...,-5.000000e-01,-8.660254e-01,0,11446,high,True,False,False,True,5.036377
3,25894730,2009-06-26 08:22:21+00:00,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,8,...,5.000000e-01,-8.660254e-01,0,9075,medium,False,False,True,False,1.661683
4,17610152,2014-08-28 17:47:00+00:00,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,17,...,-5.000000e-01,-8.660254e-01,0,9758,medium,False,False,True,False,4.475450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199994,42598914,2012-10-28 10:49:00+00:00,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1,10,...,-1.000000e+00,-1.836970e-16,1,8944,medium,False,False,True,False,0.112210
199995,16382965,2014-03-14 01:09:00+00:00,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1,1,...,8.660254e-01,5.000000e-01,0,5908,low,False,True,False,False,1.875050
199996,27804658,2009-06-29 00:42:00+00:00,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2,0,...,5.000000e-01,-8.660254e-01,0,7844,low,False,True,False,False,12.850319
199997,20259894,2015-05-20 14:56:25+00:00,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1,14,...,8.660254e-01,-5.000000e-01,0,9749,medium,False,False,True,False,3.539715


Ejemplo de KNN