In [4]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers

In [5]:
# Path of the CSV loaded into `df`. Set the NUM_DF_CSV environment variable to
# point at the file on another machine; the default is the original local path.
DATA_CSV = os.environ.get(
    'NUM_DF_CSV',
    r'/home/long/longdata/kaggle compe/dataset/num_df.csv',
)
df = pd.read_csv(DATA_CSV)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 16 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   vendor_id              1458644 non-null  int64  
 1   passenger_count        1458644 non-null  int64  
 2   pickup_longitude       1458644 non-null  float64
 3   pickup_latitude        1458644 non-null  float64
 4   dropoff_longitude      1458644 non-null  float64
 5   dropoff_latitude       1458644 non-null  float64
 6   store_and_fwd_flag     1458644 non-null  int64  
 7   trip_duration          1458644 non-null  int64  
 8   haversine_km           1458644 non-null  float64
 9   bearing                1458644 non-null  float64
 10  trip_duration_minutes  1458644 non-null  float64
 11  log_trip_duration      1458644 non-null  float64
 12  avg_speed_kph          1458644 non-null  float64
 13  is_zero_distance       1458644 non-null  bool   
 14  pickup_cluster    

In [6]:
# Features are every column of df except the four dropped here; the target is
# the raw trip_duration column.
# NOTE(review): trip_duration_minutes, log_trip_duration and avg_speed_kph look
# like they are computed from trip_duration (the target). If so, keeping them in
# x leaks the label into the features and makes the reported RMSLE meaningless.
# Confirm how these columns were built; dropping them changes the feature count,
# so the model's input shape must be kept in step.
x = df.drop(columns=['vendor_id', 'passenger_count', 'store_and_fwd_flag', 'trip_duration'])
y = df['trip_duration']
print(x.columns)

# Hold out 1% of the rows as a test set; the fixed random_state keeps the split
# repeatable. train_test_split is already imported in the imports cell.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.01, random_state=42)

Index(['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'haversine_km', 'bearing', 'trip_duration_minutes',
       'log_trip_duration', 'avg_speed_kph', 'is_zero_distance',
       'pickup_cluster', 'dropoff_cluster'],
      dtype='object')


In [11]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error over all elements passed in.

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))`` after casting
    both inputs to float32 and flooring them at 1e-7.

    Args:
        y_true: Target values.
        y_pred: Predicted values; must hold the same number of elements as
            ``y_true``.

    Returns:
        A scalar float32 tensor with the RMSLE of the given batch.
    """
    # Flatten both sides so a (batch,) target and a (batch, 1) prediction are
    # compared element by element instead of broadcasting to (batch, batch).
    y_true = tf.reshape(tf.cast(y_true, tf.float32), [-1])
    y_pred = tf.reshape(tf.cast(y_pred, tf.float32), [-1])

    # Floor at 1e-7 so the logarithm never sees a non-positive argument. The
    # original also clipped from above at each tensor's own maximum, which never
    # changed any value, so a plain lower bound gives the same result.
    # Predictions below the floor are replaced by a constant and therefore
    # contribute no gradient.
    y_true = tf.maximum(y_true, 1e-7)
    y_pred = tf.maximum(y_pred, 1e-7)

    # log1p(x) is log(1 + x) without the rounding loss of forming 1 + x first.
    log_diff = tf.math.log1p(y_pred) - tf.math.log1p(y_true)
    return tf.sqrt(tf.reduce_mean(tf.square(log_diff)))

In [8]:
# Learn the per-column mean and standard deviation from the training rows only,
# then apply that same transform to both the training and the test features.
scaler = StandardScaler().fit(x_train)
x_train_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_train, x_test)
)

In [14]:
# Seed Python, NumPy and the backend so weight initialisation and dropout are
# repeatable when this cell is re-run.
keras.utils.set_random_seed(42)

# Fully connected regressor: 256 -> 128 -> 64 ReLU units with batch
# normalisation and 30% dropout after the first two hidden layers, ending in a
# single linear output unit.
model = keras.Sequential([
    # Take the input width from the scaled feature matrix rather than
    # hard-coding it, so the model follows any change to the feature set.
    layers.Input(shape=(x_train_scaled.shape[1],)),

    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(64, activation='relu'),

    layers.Dense(1)
])

# Optimise the custom rmsle function with Adam (learning rate 1e-3) and report
# the same function as a metric.
model.compile(
    loss=rmsle,
    metrics=[rmsle],
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
)

# Stop once val_loss has gone 5 epochs without improving, and roll the model
# back to the weights of its best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, verbose=1
)

In [15]:
# Sanity check: show the dimensions of the training features and targets.
print("x_train_scaled shape:", np.shape(x_train_scaled))
print("y_train shape:", np.shape(y_train))


x_train_scaled shape: (1444057, 12)
y_train shape: (1444057,)


In [None]:
# Train for up to 100 epochs in mini-batches of 256, holding out 20% of the
# training arrays for validation; the early-stopping callback may end the run
# sooner.
history = model.fit(
    x=x_train_scaled,
    y=y_train,
    batch_size=256,
    epochs=100,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)

# Score the held-out set in one batch. rmsle takes a square root per batch and
# Keras averages the per-batch values, so with the default batch size the number
# reported would be a mean of per-batch RMSLEs rather than the RMSLE of the
# whole test set.
loss, rmsle_score = model.evaluate(
    x_test_scaled, y_test, batch_size=len(x_test_scaled), verbose=1
)
print("Final RMSLE on test set:", rmsle_score)

Epoch 1/100


I0000 00:00:1744999507.093724 1253708 service.cc:152] XLA service 0x7f3590002420 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1744999507.093763 1253708 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2025-04-19 01:05:07.162165: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1744999507.408553 1253708 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  27/4513[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m17s[0m 4ms/step - loss: 5.9807 - rmsle: 5.9807 

I0000 00:00:1744999509.158978 1253708 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m4513/4513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 5ms/step - loss: 1.0930 - rmsle: 1.0930 - val_loss: 0.0778 - val_rmsle: 0.0778
Epoch 2/100
[1m4501/4513[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - loss: 0.1450 - rmsle: 0.1450

In [None]:
# Plot the per-epoch training and validation loss recorded by model.fit.
fig, ax = plt.subplots(figsize=(10, 6))
curves = history.history
ax.plot(curves['loss'], label='Training RMSLE')
ax.plot(curves['val_loss'], label='Validation RMSLE')
ax.set_title('Model RMSLE Over Epochs')
ax.set_xlabel('Epoch')
ax.set_ylabel('RMSLE')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Score the held-out set in one batch. rmsle takes a square root per batch and
# Keras averages the per-batch values, so with the default batch size the number
# reported would be a mean of per-batch RMSLEs rather than the RMSLE of the
# whole test set.
loss, rmsle_score = model.evaluate(
    x_test_scaled, y_test, batch_size=len(x_test_scaled), verbose=1
)
print("Test RMSLE:", rmsle_score)