<a href="https://colab.research.google.com/github/mate38a/ProyectoSustitutoModelos1/blob/main/fase_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib

# **Cargar Datos**

In [2]:
# Read train.csv and test.csv (paths relative to the current working directory)
# into two separate DataFrames.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

# **Exploración inicial**

In [4]:
# Quick look at the training frame: first rows, column dtypes / non-null
# counts, and summary statistics of the numeric columns.
print(train.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
train.info()
print(train.describe())

          id  vendor_id      pickup_datetime     dropoff_datetime  \
0  id2875421          2  2016-03-14 17:24:55  2016-03-14 17:32:30   
1  id2377394          1  2016-06-12 00:43:35  2016-06-12 00:54:38   
2  id3858529          2  2016-01-19 11:35:24  2016-01-19 12:10:48   
3  id3504673          2  2016-04-06 19:32:31  2016-04-06 19:39:40   
4  id2181028          2  2016-03-26 13:30:55  2016-03-26 13:38:10   

   passenger_count  pickup_longitude  pickup_latitude  dropoff_longitude  \
0                1        -73.982155        40.767937         -73.964630   
1                1        -73.980415        40.738564         -73.999481   
2                1        -73.979027        40.763939         -74.005333   
3                1        -74.010040        40.719971         -74.012268   
4                1        -73.973053        40.793209         -73.972923   

   dropoff_latitude store_and_fwd_flag  trip_duration  
0         40.765602                  N            455  
1         40.731

# **Limpieza de datos**

In [3]:
# Keep only rows whose trip_duration lies strictly between 60 and 7200
# (presumably seconds, i.e. 1 minute to 2 hours -- confirm against the dataset
# description). Both bounds are exclusive.
train = train.loc[
    lambda frame: (frame['trip_duration'] > 60) & (frame['trip_duration'] < 7200)
]

# **Feature engineering**

In [5]:
def haversine(lon1, lat1, lon2, lat2, radius_km=6371):
    """Great-circle distance between two points given in decimal degrees.

    Works element-wise, so each coordinate may be a scalar, a NumPy array or a
    pandas Series (inputs are combined with the usual broadcasting rules).

    Args:
        lon1: Longitude of the first point(s), in degrees.
        lat1: Latitude of the first point(s), in degrees.
        lon2: Longitude of the second point(s), in degrees.
        lat2: Latitude of the second point(s), in degrees.
        radius_km: Sphere radius used to scale the central angle. Defaults to
            6371, so the result is in kilometres on an Earth-sized sphere.

    Returns:
        The distance(s) in the same unit as ``radius_km``, with the same
        container type NumPy produces for the given inputs.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    return radius_km * 2 * np.arcsin(np.sqrt(a))

# Store the haversine distance between the pickup and dropoff coordinates of
# every row as a new 'distance' column. Argument order is
# (lon1, lat1, lon2, lat2), matching the haversine signature.
train['distance'] = haversine(
    *(
        train[column]
        for column in (
            'pickup_longitude', 'pickup_latitude',
            'dropoff_longitude', 'dropoff_latitude',
        )
    )
)

# **Selección de características**

In [6]:
# Model inputs: the raw pickup/dropoff coordinates plus the engineered
# 'distance' column.
features = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'distance']
X = train[features]
# Regression target.
y = train['trip_duration']
# The train/validation split is intentionally not done here: the dedicated
# split cell that follows performs it, and repeating the identical call in
# this cell only duplicated that work.

# **División en train/validación**

In [7]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# **Modelo: Random Forest**

In [8]:
# Random forest regressor fitted on the training split.
model = RandomForestRegressor(
    random_state=42,  # fixed seed so the fitted forest is reproducible
    n_jobs=-1,        # -1 lets scikit-learn use all available processors
    max_depth=8,      # cap on the depth of each tree
    n_estimators=30,  # reduced for faster training
)
model.fit(X_train, y_train)

# **Predicciones y evaluación**

In [9]:
# Predict on the held-out validation rows and report the root-mean-squared
# error, rounded to a whole number in the printed message.
preds = model.predict(X_val)
val_rmse = np.sqrt(mean_squared_error(y_val, preds))
print(f'RMSE: {val_rmse:.0f} segundos')

RMSE: 380 segundos


# **Guardar modelo**

In [10]:
# Serialize the fitted model to 'modelo_fase1.pkl' in the current working
# directory, then print a confirmation message.
joblib.dump(value=model, filename='modelo_fase1.pkl')
print("¡Modelo guardado!")

¡Modelo guardado!
