In [2]:
import pandas as pd
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
val = pd.read_csv('val.csv')
print(train.info())
print(train.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1914056 entries, 0 to 1914055
Data columns (total 45 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   Unique ID                        int64  
 1   Rider_ID                         int64  
 2   category_x                       object 
 3   Circuit_Length_km                float64
 4   Laps                             int64  
 5   Grid_Position                    int64  
 6   Avg_Speed_kmh                    float64
 7   Track_Condition                  object 
 8   Humidity_%                       int64  
 9   Tire_Compound_Front              object 
 10  Tire_Compound_Rear               object 
 11  Penalty                          object 
 12  Championship_Points              int64  
 13  Championship_Position            int64  
 14  Session                          object 
 15  year_x                           int64  
 16  sequence                         int64  
 17  rider   

In [3]:
# Check for missing values
print(train.isnull().sum())

# Get descriptive statistics
print(train.describe(include='all'))  # Include all columns for a complete overview
# Fill missing values in the 'Penalty' column
train['Penalty'] = train['Penalty'].fillna('No Penalty')  # or use 0 if it's numerical


Unique ID                               0
Rider_ID                                0
category_x                              0
Circuit_Length_km                       0
Laps                                    0
Grid_Position                           0
Avg_Speed_kmh                           0
Track_Condition                         0
Humidity_%                              0
Tire_Compound_Front                     0
Tire_Compound_Rear                      0
Penalty                            321292
Championship_Points                     0
Championship_Position                   0
Session                                 0
year_x                                  0
sequence                                0
rider                                   0
team                                    0
bike                                    0
position                                0
points                                  0
shortname                               0
circuit_name                      

In [5]:
# Define the function to create new features
import numpy as np
def create_features(df):
    df = df.copy()
    
    # Example features
    df['Lap_Density'] = df['Laps'] / df['Circuit_Length_km'].replace(0, np.nan)  # Avoid division by zero
    df['Grid_Championship_Diff'] = df['Grid_Position'] - df['Championship_Position']
    df['Total_Tire_Wear'] = df['Tire_Degradation_Factor_per_Lap'] * df['Laps']
    
    # Use the correct column names
    df['Win_Rate'] = df['wins'] / df['starts'].replace(0, np.nan)  # Corrected to 'starts'
    df['Podium_Rate'] = df['podiums'] / df['starts'].replace(0, np.nan)  # Corrected to 'starts'
    
    # Replace infinite values with NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    return df

# Apply feature creation
train = create_features(train)
val = create_features(val)
test = create_features(test)
# Display the first few rows of the modified DataFrame
print("Train DataFrame after feature creation:")
print(train.head())
print("\nValidation DataFrame after feature creation:")
print(val.head())
print("\nTest DataFrame after feature creation:")
print(test.head())

Train DataFrame after feature creation:
   Unique ID  Rider_ID category_x  Circuit_Length_km  Laps  Grid_Position  \
0    1894944      2659      Moto2              4.874    22             17   
1      23438      5205      Moto2              3.875    24              7   
2     939678      7392      Moto3              5.647    25              5   
3    1196312      7894      Moto3              4.810    19              3   
4    1033899      6163     MotoGP              5.809    25             21   

   Avg_Speed_kmh Track_Condition  Humidity_% Tire_Compound_Front  ... podiums  \
0         264.66             Wet          61                Hard  ...       4   
1         177.56             Wet          77                Soft  ...       2   
2         317.74             Dry          87                Soft  ...       0   
3         321.82             Wet          43                Soft  ...      16   
4         239.92             Wet          47                Hard  ...      29   

  wins  mi

In [8]:
# Make sure this cell is run BEFORE using train_encoded
from sklearn.preprocessing import LabelEncoder
def encode_categoricals(df):
    df = df.copy()
    
    # One-hot encoding for low cardinality categorical features
    one_hot_cols = [
        'category_x', 'Track_Condition', 'Tire_Compound_Front', 
        'Tire_Compound_Rear', 'Session', 'weather'
    ]
    df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)
    
    # Label encoding for high cardinality categorical features
    label_cols = ['circuit_name', 'rider_name', 'team_name', 'bike_name']
    le = LabelEncoder()
    for col in label_cols:
        df[col] = le.fit_transform(df[col].astype(str))
    
    return df

train_encoded = encode_categoricals(train)
val_encoded = encode_categoricals(val)
test_encoded = encode_categoricals(test)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Function to clean and encode categorical data
def clean_and_encode_data(df):
    """Clean dataframe by encoding categorical variables and converting numeric strings"""
    from sklearn.preprocessing import LabelEncoder
    
    df_cleaned = df.copy()
    label_encoders = {}
    
    for col in df_cleaned.columns:
        if df_cleaned[col].dtype == 'object':
            print(f"Processing categorical column: {col}")
            print(f"Unique values: {df_cleaned[col].unique()}")
            
            # Special handling for Penalty column (contains time penalties)
            if col == 'Penalty':
                # Create a numeric representation of penalties
                penalty_mapping = {
                    'No Penalty': 0,
                    '+3s': 3,
                    '+5s': 5,
                    '+10s': 10,
                    '+15s': 15,
                    '+20s': 20,
                    '+30s': 30,
                    'Ride Through': 25,  # Approximate time penalty
                    'DNF': 999,  # Large penalty for Did Not Finish
                    'DNS': 998,  # Large penalty for Did Not Start
                    'DSQ': 997   # Large penalty for Disqualified
                }
                
                # Apply mapping, use label encoding for unmapped values
                df_cleaned[col] = df_cleaned[col].map(penalty_mapping)
                unmapped_mask = df_cleaned[col].isna()
                
                if unmapped_mask.any():
                    print(f"Found unmapped penalty values, using label encoding for: {df[col][unmapped_mask].unique()}")
                    le = LabelEncoder()
                    # Fill unmapped values with original strings temporarily
                    df_cleaned.loc[unmapped_mask, col] = df.loc[unmapped_mask, col]
                    # Encode all values
                    df_cleaned[col] = le.fit_transform(df_cleaned[col].astype(str))
                    label_encoders[col] = le
                
                print(f"Encoded {col} with penalty values")
            
            else:
                # Use label encoding for other categorical columns
                le = LabelEncoder()
                df_cleaned[col] = le.fit_transform(df_cleaned[col].astype(str))
                label_encoders[col] = le
                print(f"Label encoded column: {col}")
    
    return df_cleaned, label_encoders

# Prepare features and target
X_train = train_encoded.drop(columns=['Lap_Time_Seconds'])
y_train = train_encoded['Lap_Time_Seconds']
X_val = val_encoded.drop(columns=['Lap_Time_Seconds'])
y_val = val_encoded['Lap_Time_Seconds']

# Check shapes to confirm
print("Original shapes:")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

# Check data types in X_train
print("\nData types in X_train:")
print(X_train.dtypes.value_counts())

# Check for non-numeric columns in features
object_cols = X_train.select_dtypes(include=['object']).columns
if len(object_cols) > 0:
    print(f"\nObject columns found in features: {list(object_cols)}")
    
    # Show sample values from object columns
    for col in object_cols:
        print(f"Sample values in {col}: {X_train[col].unique()[:10]}")

# Clean and encode the feature matrices
print("\nCleaning and encoding feature matrices...")
X_train_cleaned, train_encoders = clean_and_encode_data(X_train)
X_val_cleaned, val_encoders = clean_and_encode_data(X_val)

# Function to clean the target variable
def clean_target(y):
    """Clean target variable by removing unwanted characters and converting to float"""
    return y.astype(str).str.replace('+', '').str.replace('s', '').astype(float)

# Check for non-numeric values in y_train
print("\nChecking target variable...")
non_numeric_mask = ~pd.to_numeric(y_train, errors='coerce').notna()
non_numeric_values = y_train[non_numeric_mask]

# Print any non-numeric values found
if not non_numeric_values.empty:
    print("Non-numeric values in y_train:")
    print(non_numeric_values.head(10))
    print(f"Total non-numeric values: {len(non_numeric_values)}")
    
    # Clean the target variables
    y_train_cleaned = clean_target(y_train)
    y_val_cleaned = clean_target(y_val)
else:
    print("No non-numeric values found in target variable")
    # If no cleaning needed, use original target
    y_train_cleaned = y_train.astype(float)
    y_val_cleaned = y_val.astype(float)

# Final check - ensure all data is numeric
print("\nFinal data type check:")
print("X_train_cleaned dtypes:")
print(X_train_cleaned.dtypes.value_counts())

# Verify all columns are now numeric
non_numeric_features = X_train_cleaned.select_dtypes(include=['object']).columns
if len(non_numeric_features) > 0:
    print(f"\nError: Still have non-numeric columns: {list(non_numeric_features)}")
    # Convert any remaining object columns to numeric if possible
    for col in non_numeric_features:
        try:
            X_train_cleaned[col] = pd.to_numeric(X_train_cleaned[col], errors='coerce')
            X_val_cleaned[col] = pd.to_numeric(X_val_cleaned[col], errors='coerce')
            print(f"Converted {col} to numeric")
        except:
            print(f"Could not convert {col} to numeric - this will cause an error")
else:
    print("✓ All feature columns are now numeric!")

# Verify no NaN values were introduced
print(f"\nNaN values in X_train_cleaned: {X_train_cleaned.isnull().sum().sum()}")
print(f"NaN values in y_train_cleaned: {y_train_cleaned.isnull().sum()}")

# Handle any NaN values if they exist
if X_train_cleaned.isnull().sum().sum() > 0:
    print("Filling NaN values with median...")
    X_train_cleaned = X_train_cleaned.fillna(X_train_cleaned.median())
    X_val_cleaned = X_val_cleaned.fillna(X_train_cleaned.median())  # Use training median for validation

if y_train_cleaned.isnull().sum() > 0:
    print("Removing rows with NaN target values...")
    valid_idx = ~y_train_cleaned.isnull()
    X_train_cleaned = X_train_cleaned[valid_idx]
    y_train_cleaned = y_train_cleaned[valid_idx]

# Initialize model
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Train model using the cleaned data
print("\nTraining model...")
model.fit(X_train_cleaned, y_train_cleaned)

# Predict on validation set
y_pred = model.predict(X_val_cleaned)

# Calculate RMSE using the cleaned validation target
rmse = mean_squared_error(y_val_cleaned, y_pred, squared=False)
print(f"\nValidation RMSE: {rmse:.4f}")

# Additional metrics
mae = np.mean(np.abs(y_val_cleaned - y_pred))
print(f"Validation MAE: {mae:.4f}")

# Show some predictions vs actual
print(f"\nSample predictions vs actual:")
comparison = pd.DataFrame({
    'Actual': y_val_cleaned.iloc[:10].values,
    'Predicted': y_pred[:10],
    'Difference': (y_val_cleaned.iloc[:10].values - y_pred[:10])
})
print(comparison)

Original shapes:
X_train shape: (1914056, 60)
y_train shape: (1914056,)
X_val shape: (273437, 60)
y_val shape: (273437,)

Data types in X_train:
int64      29
bool       17
float64    11
object      3
Name: count, dtype: int64

Object columns found in features: ['Penalty', 'shortname', 'track']
Sample values in Penalty: ['+3s' '+5s' 'DNF' 'DNS' 'Ride Through' 'No Penalty']
Sample values in shortname: ['EMI' 'NAT' 'SPA' 'AUS' 'CAT' 'VAL' 'MAL' 'RSA' 'JPN' 'WGER']
Sample values in track: ['Dry' 'Wet']

Cleaning and encoding feature matrices...
Processing categorical column: Penalty
Unique values: ['+3s' '+5s' 'DNF' 'DNS' 'Ride Through' 'No Penalty']
Encoded Penalty with penalty values
Processing categorical column: shortname
Unique values: ['EMI' 'NAT' 'SPA' 'AUS' 'CAT' 'VAL' 'MAL' 'RSA' 'JPN' 'WGER' 'FRA' 'NED'
 'EGER' 'AME' 'GBR' 'RIO' 'ARG' 'ITA' 'TCH' 'CZE' 'EUR' 'AUT' 'RSM' 'SWE'
 'BEL' 'GER' 'POR' 'JUG' 'INP' 'CHN' 'ULST' 'VDU' 'FIN' 'BRA' 'QAT' 'ARA'
 'USA' 'INA' 'TT' 'VEN' 'ANC' 

  df_cleaned.loc[unmapped_mask, col] = df.loc[unmapped_mask, col]


Encoded Penalty with penalty values
Processing categorical column: shortname
Unique values: ['EGER' 'POR' 'JPN' 'NED' 'CZE' 'GBR' 'QAT' 'CAT' 'FRA' 'EUR' 'ITA' 'WGER'
 'FIN' 'SPA' 'VAL' 'RSM' 'ARG' 'BEL' 'USA' 'MAL' 'JUG' 'STY' 'SWE' 'RSA'
 'AUS' 'AUT' 'TCH' 'NAT' 'ARA' 'INP' 'GER' 'ULST' 'CAN' 'HUN' 'RIO' 'TUR'
 'PAC' 'EMI' 'TT' 'IMO' 'AME' 'CHN' 'BRA' 'VEN' 'MAD' 'SWI' 'TER' 'INA'
 'FIM' 'DOH' 'THA' 'ANC' 'VDU']
Label encoded column: shortname
Processing categorical column: track
Unique values: ['Dry' 'Wet']
Label encoded column: track

Checking target variable...
No non-numeric values found in target variable

Final data type check:
X_train_cleaned dtypes:
int64      32
bool       17
float64    11
Name: count, dtype: int64
✓ All feature columns are now numeric!

NaN values in X_train_cleaned: 0
NaN values in y_train_cleaned: 0

Training model...


In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')

print("=== MotoGP Lap Time Prediction Pipeline ===")

# Step 1: Load data
train = pd.read_csv('train.csv')
val = pd.read_csv('val.csv')

print(f"Train shape: {train.shape}")
print(f"Val shape: {val.shape}")

# Use a subset of training data for faster hyperparameter tuning
train_sample = train.sample(frac=0.1, random_state=42)

# Step 2: Feature Engineering
def create_features(df):
    df = df.copy()
    if 'Penalty' in df.columns:
        df['Penalty'] = df['Penalty'].fillna("No Penalty")
    df['Lap_Density'] = df['Laps'] / df['Circuit_Length_km'].replace(0, np.nan)
    df['Grid_Championship_Diff'] = df['Grid_Position'] - df['Championship_Position']
    df['Total_Tire_Wear'] = df['Tire_Degradation_Factor_per_Lap'] * df['Laps']
    df['Win_Rate'] = df['wins'] / df['starts'].replace(0, np.nan)
    df['Podium_Rate'] = df['podiums'] / df['starts'].replace(0, np.nan)
    df['Temp_Diff'] = df['Track_Temperature_Celsius'] - df['Ambient_Temperature_Celsius']
    df['Humidity_Temp'] = df['Humidity_%'] * df['Ambient_Temperature_Celsius']
    df['Is_Race_Session'] = df['Session'].str.lower().eq('race').astype(int)
    df['Career_Progress'] = df['year_x'] - df['min_year']
    df['Weather_TrackCond'] = df['weather'].astype(str) + '_' + df['Track_Condition'].astype(str)
    return df

train_sample = create_features(train_sample)
val = create_features(val)

# Step 3: Encode categorical features
def encode_data(train_df, val_df):
    train_df = train_df.copy()
    val_df = val_df.copy()
    one_hot_cols = ['category_x', 'Track_Condition', 'Tire_Compound_Front',
                    'Tire_Compound_Rear', 'Session', 'weather']
    label_cols = ['circuit_name', 'rider_name', 'team_name', 'bike_name', 'Penalty', 'Weather_TrackCond']

    # Remove high-cardinality object columns that weren't properly encoded
    drop_cols = ['track', 'shortname']
    for col in drop_cols:
        if col in train_df.columns:
            train_df.drop(columns=[col], inplace=True)
        if col in val_df.columns:
            val_df.drop(columns=[col], inplace=True)

    combined = pd.concat([train_df[one_hot_cols], val_df[one_hot_cols]])
    combined_encoded = pd.get_dummies(combined, drop_first=True)
    train_dummies = combined_encoded.iloc[:len(train_df)]
    val_dummies = combined_encoded.iloc[len(train_df):]
    train_df = train_df.drop(columns=one_hot_cols).reset_index(drop=True)
    val_df = val_df.drop(columns=one_hot_cols).reset_index(drop=True)
    train_df = pd.concat([train_df, train_dummies.reset_index(drop=True)], axis=1)
    val_df = pd.concat([val_df, val_dummies.reset_index(drop=True)], axis=1)

    for col in label_cols:
        if col in train_df.columns:
            le = LabelEncoder()
            all_vals = pd.concat([train_df[col].astype(str), val_df[col].astype(str)])
            le.fit(all_vals)
            train_df[col] = le.transform(train_df[col].astype(str))
            val_df[col] = le.transform(val_df[col].astype(str))
    return train_df, val_df

train_sample, val_encoded = encode_data(train_sample, val)

# Step 4: Fill missing values
def fill_missing(df, ref=None):
    df = df.copy()
    for col in df.columns:
        if np.issubdtype(df[col].dtype, np.number):
            fill_val = ref[col].median() if ref is not None and col in ref.columns else df[col].median()
            df[col] = df[col].fillna(fill_val)
        else:
            df[col] = df[col].fillna("Unknown")
    return df

train_sample = fill_missing(train_sample)
val_encoded = fill_missing(val_encoded, train_sample)

# Step 5: Prepare features and target
target = 'Lap_Time_Seconds'
X_train_sample = train_sample.drop(columns=[target])
y_train_sample = train_sample[target]
X_val = val_encoded.drop(columns=[target])
y_val = val_encoded[target]

common_features = sorted(list(set(X_train_sample.columns).intersection(set(X_val.columns))))
X_train_sample = X_train_sample[common_features]
X_val = X_val[common_features]

# Step 6: Hyperparameter Tuning
param_grid = {
    'learning_rate': [0.01, 0.03, 0.05],
    'max_iter': [100, 300, 500],
    'max_depth': [5, 7, 10],
    'min_samples_leaf': [10, 20, 50],
    'l2_regularization': [0.0, 0.1, 1.0]
}

print("\nRunning hyperparameter tuning...")
base_model = HistGradientBoostingRegressor(random_state=42)
random_search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_grid,
    n_iter=10,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train_sample, y_train_sample)
print("Best hyperparameters:", random_search.best_params_)

# Step 7: Train model with best parameters
model = random_search.best_estimator_
model.fit(X_train_sample, y_train_sample)

# Step 8: Evaluate
y_val_pred = model.predict(X_val)
rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"Validation RMSE: {rmse:.4f}")

# Step 9: Save submission
print("\nSaving predictions to submission1.csv...")
if 'Unique ID' in val.columns:
    submission = pd.DataFrame({
        'Unique ID': val['Unique ID'],
        'Lap_Time_Seconds': y_val_pred
    })
    submission.to_csv("submission1.csv", index=False)
    print("✅ submission1.csv created successfully.")
else:
    print("❌ 'Unique ID' column not found in validation set.")

print("\n=== Pipeline completed successfully! ===")


=== MotoGP Lap Time Prediction Pipeline ===
Train shape: (1914056, 45)
Val shape: (273437, 45)

Running hyperparameter tuning...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best hyperparameters: {'min_samples_leaf': 10, 'max_iter': 500, 'max_depth': 7, 'learning_rate': 0.05, 'l2_regularization': 0.1}
Validation RMSE: 10.7929

Saving predictions to submission1.csv...
✅ submission1.csv created successfully.

=== Pipeline completed successfully! ===


In [4]:
print(val.columns)


Index(['Unique ID', 'Rider_ID', 'category_x', 'Circuit_Length_km', 'Laps',
       'Grid_Position', 'Avg_Speed_kmh', 'Track_Condition', 'Humidity_%',
       'Tire_Compound_Front', 'Tire_Compound_Rear', 'Penalty',
       'Championship_Points', 'Championship_Position', 'Session', 'year_x',
       'sequence', 'rider', 'team', 'bike', 'position', 'points', 'shortname',
       'circuit_name', 'rider_name', 'team_name', 'bike_name',
       'Lap_Time_Seconds', 'Corners_per_Lap',
       'Tire_Degradation_Factor_per_Lap', 'Pit_Stop_Duration_Seconds',
       'Ambient_Temperature_Celsius', 'Track_Temperature_Celsius', 'weather',
       'track', 'air', 'ground', 'starts', 'finishes', 'with_points',
       'podiums', 'wins', 'min_year', 'max_year', 'years_active',
       'Lap_Density', 'Grid_Championship_Diff', 'Total_Tire_Wear', 'Win_Rate',
       'Podium_Rate', 'Temp_Diff', 'Humidity_Temp', 'Is_Race_Session',
       'Career_Progress', 'Weather_TrackCond'],
      dtype='object')


In [4]:
import pandas as pd

# Load the train dataset
train = pd.read_csv('train.csv')

# Print the names of all columns
print("Columns in train.csv:")
print(train.columns.tolist())


Columns in train.csv:
['Unique ID', 'Rider_ID', 'category_x', 'Circuit_Length_km', 'Laps', 'Grid_Position', 'Avg_Speed_kmh', 'Track_Condition', 'Humidity_%', 'Tire_Compound_Front', 'Tire_Compound_Rear', 'Penalty', 'Championship_Points', 'Championship_Position', 'Session', 'year_x', 'sequence', 'rider', 'team', 'bike', 'position', 'points', 'shortname', 'circuit_name', 'rider_name', 'team_name', 'bike_name', 'Lap_Time_Seconds', 'Corners_per_Lap', 'Tire_Degradation_Factor_per_Lap', 'Pit_Stop_Duration_Seconds', 'Ambient_Temperature_Celsius', 'Track_Temperature_Celsius', 'weather', 'track', 'air', 'ground', 'starts', 'finishes', 'with_points', 'podiums', 'wins', 'min_year', 'max_year', 'years_active']


In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')

print("=== MotoGP Lap Time Prediction Pipeline ===")

# Step 1: Load data
train = pd.read_csv('train.csv')
val = pd.read_csv('val.csv')

print(f"Train shape: {train.shape}")
print(f"Val shape: {val.shape}")

# Use a subset of training data for faster hyperparameter tuning
train_sample = train.sample(frac=0.1, random_state=42)

# Step 2: Feature Engineering
def create_features(df):
    df = df.copy()
    if 'Penalty' in df.columns:
        df['Penalty'] = df['Penalty'].fillna("No Penalty")
    df['Lap_Density'] = df['Laps'] / df['Circuit_Length_km'].replace(0, np.nan)
    df['Grid_Championship_Diff'] = df['Grid_Position'] - df['Championship_Position']
    df['Total_Tire_Wear'] = df['Tire_Degradation_Factor_per_Lap'] * df['Laps']
    df['Win_Rate'] = df['wins'] / df['starts'].replace(0, np.nan)
    df['Podium_Rate'] = df['podiums'] / df['starts'].replace(0, np.nan)
    df['Temp_Diff'] = df['Track_Temperature_Celsius'] - df['Ambient_Temperature_Celsius']
    df['Humidity_Temp'] = df['Humidity_%'] * df['Ambient_Temperature_Celsius']
    df['Is_Race_Session'] = df['Session'].str.lower().eq('race').astype(int)
    df['Career_Progress'] = df['year_x'] - df['min_year']
    df['Weather_TrackCond'] = df['weather'].astype(str) + '_' + df['Track_Condition'].astype(str)
    return df

train_sample = create_features(train_sample)
val = create_features(val)

# Step 3: Encode categorical features
def encode_data(train_df, val_df):
    train_df = train_df.copy()
    val_df = val_df.copy()
    one_hot_cols = ['category_x', 'Track_Condition', 'Tire_Compound_Front',
                    'Tire_Compound_Rear', 'Session', 'weather']
    label_cols = ['circuit_name', 'rider_name', 'team_name', 'bike_name', 'Penalty', 'Weather_TrackCond']

    # Remove high-cardinality object columns that weren't properly encoded
    drop_cols = ['track', 'shortname']
    for col in drop_cols:
        if col in train_df.columns:
            train_df.drop(columns=[col], inplace=True)
        if col in val_df.columns:
            val_df.drop(columns=[col], inplace=True)

    combined = pd.concat([train_df[one_hot_cols], val_df[one_hot_cols]])
    combined_encoded = pd.get_dummies(combined, drop_first=True)
    train_dummies = combined_encoded.iloc[:len(train_df)]
    val_dummies = combined_encoded.iloc[len(train_df):]
    train_df = train_df.drop(columns=one_hot_cols).reset_index(drop=True)
    val_df = val_df.drop(columns=one_hot_cols).reset_index(drop=True)
    train_df = pd.concat([train_df, train_dummies.reset_index(drop=True)], axis=1)
    val_df = pd.concat([val_df, val_dummies.reset_index(drop=True)], axis=1)

    for col in label_cols:
        if col in train_df.columns:
            le = LabelEncoder()
            all_vals = pd.concat([train_df[col].astype(str), val_df[col].astype(str)])
            le.fit(all_vals)
            train_df[col] = le.transform(train_df[col].astype(str))
            val_df[col] = le.transform(val_df[col].astype(str))
    return train_df, val_df

train_sample, val_encoded = encode_data(train_sample, val)

# Step 4: Fill missing values
def fill_missing(df, ref=None):
    df = df.copy()
    for col in df.columns:
        if np.issubdtype(df[col].dtype, np.number):
            fill_val = ref[col].median() if ref is not None and col in ref.columns else df[col].median()
            df[col] = df[col].fillna(fill_val)
        else:
            df[col] = df[col].fillna("Unknown")
    return df

train_sample = fill_missing(train_sample)
val_encoded = fill_missing(val_encoded, train_sample)

# Step 5: Prepare features and target
target = 'Lap_Time_Seconds'
X_train_sample = train_sample.drop(columns=[target])
y_train_sample = train_sample[target]
X_val = val_encoded.drop(columns=[target])
y_val = val_encoded[target]

common_features = sorted(list(set(X_train_sample.columns).intersection(set(X_val.columns))))
X_train_sample = X_train_sample[common_features]
X_val = X_val[common_features]

# Step 6: Hyperparameter Tuning with broader search
param_grid = {
    'learning_rate': [0.01, 0.02, 0.03, 0.05],
    'max_iter': [300, 500, 700],
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [5, 10, 20],
    'l2_regularization': [0.0, 0.1, 0.5, 1.0]
}

print("\nRunning hyperparameter tuning...")
base_model = HistGradientBoostingRegressor(random_state=42)
random_search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_grid,
    n_iter=25,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train_sample, y_train_sample)
print("Best hyperparameters:", random_search.best_params_)

# Step 7: Train model with best parameters
model = random_search.best_estimator_
model.fit(X_train_sample, y_train_sample)

# Step 8: Evaluate
y_val_pred = model.predict(X_val)
rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"Validation RMSE: {rmse:.4f}")

# Step 9: Save submission
print("\nSaving predictions to submission1.csv...")
if 'Unique ID' in val.columns:
    submission = pd.DataFrame({
        'Unique ID': val['Unique ID'],
        'Lap_Time_Seconds': y_val_pred
    })
    submission.to_csv("submission1.csv", index=False)
    print("✅ submission1.csv created successfully.")
else:
    print("❌ 'Unique ID' column not found in validation set.")

print("\n=== Pipeline completed successfully! ===")

=== MotoGP Lap Time Prediction Pipeline ===
Train shape: (1914056, 45)
Val shape: (273437, 45)

Running hyperparameter tuning...
Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best hyperparameters: {'min_samples_leaf': 5, 'max_iter': 500, 'max_depth': 15, 'learning_rate': 0.05, 'l2_regularization': 1.0}
Validation RMSE: 10.7354

Saving predictions to submission1.csv...
✅ submission1.csv created successfully.

=== Pipeline completed successfully! ===


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import matplotlib.pyplot as plt

# Step 1: Load your datasets
train = pd.read_csv('train.csv')  # Update path as needed
val = pd.read_csv('val.csv')

# Step 2: Feature engineering function
def create_features(df):
    df = df.copy()
    df['Lap_Density'] = df['Laps'] / df['Circuit_Length_km'].replace(0, np.nan)
    df['Grid_Championship_Diff'] = df['Grid_Position'] - df['Championship_Position']
    df['Total_Tire_Wear'] = df['Tire_Degradation_Factor_per_Lap'] * df['Laps']
    df['Win_Rate'] = df['wins'] / df['starts'].replace(0, np.nan)
    df['Podium_Rate'] = df['podiums'] / df['starts'].replace(0, np.nan)
    df['Pit_Per_Lap'] = df['Pit_Stop_Duration_Seconds'] / df['Laps'].replace(0, np.nan)
    df['Temp_Diff'] = df['Track_Temperature_Celsius'] - df['Ambient_Temperature_Celsius']
    df['Humidity_Temp'] = df['Humidity_%'] * df['Ambient_Temperature_Celsius']
    df['Is_Race_Session'] = df['Session'].str.lower().eq('race').astype(int)
    df['Career_Progress'] = df['year_x'] - df['min_year']
    df['Weather_TrackCond'] = df['weather'].astype(str) + '_' + df['Track_Condition'].astype(str)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    return df

train = create_features(train)
val = create_features(val)

# Step 3: Encoding categorical variables
def encode_categoricals(df):
    df = df.copy()
    one_hot_cols = ['category_x', 'Track_Condition', 'Tire_Compound_Front', 'Tire_Compound_Rear', 'Session', 'weather', 'Weather_TrackCond']
    df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)
    label_cols = ['circuit_name', 'rider_name', 'team_name', 'bike_name', 'Penalty', 'shortname', 'track']
    for col in label_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
    return df

train_encoded = encode_categoricals(train)
val_encoded = encode_categoricals(val)

# Step 4: Impute missing values
def fill_missing(df):
    df = df.copy()
    num_cols = df.select_dtypes(include='number').columns
    for col in num_cols:
        df[col].fillna(df[col].median(), inplace=True)
    cat_cols = df.select_dtypes(include='object').columns
    for col in cat_cols:
        df[col].fillna('Unknown', inplace=True)
    return df

train_encoded = fill_missing(train_encoded)
val_encoded = fill_missing(val_encoded)

# Step 5: Prepare features and target
X_train = train_encoded.drop(columns=['Lap_Time_Seconds'])
y_train = train_encoded['Lap_Time_Seconds']
X_val = val_encoded.drop(columns=['Lap_Time_Seconds'])
y_val = val_encoded['Lap_Time_Seconds']

# Step 6: Tune hyperparameters for Random Forest using GridSearchCV with larger folds
print("Tuning Random Forest parameters with larger folds...")
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [20, 40, None],
    'min_samples_split': [2, 5],
}
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=1)  # Increased cv to 5
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_
print("Best RF params:", grid_rf.best_params_)

# Step 7: Stacking ensemble with best models
stack = StackingRegressor(
    estimators=[('rf', best_rf)],
    final_estimator=RandomForestRegressor(n_estimators=100, random_state=42),
    cv=5,
    n_jobs=-1,
    passthrough=True
)

print("Training stacking ensemble model...")
stack.fit(X_train, y_train)

# Step 8: Evaluate on the validation dataset
y_val_pred = stack.predict(X_val)
rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"Stacking Ensemble Validation RMSE: {rmse:.4f} seconds")

# Step 9: Feature importance analysis (based on Random Forest)
importances = best_rf.feature_importances_
features = X_train.columns

# Sort features by importance
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(12, 8))
plt.title("Top 20 Feature Importances (Random Forest)")
plt.bar(range(20), importances[indices][:20], align="center")
plt.xticks(range(20), [features[i] for i in indices][:20], rotation=90)
plt.tight_layout()
plt.show()

# Optional: Drop least important features (e.g. features with importance < threshold) and retrain
importance_threshold = 0.005  # Adjust based on plot observation
important_features = features[importances > importance_threshold]

print(f"Reducing feature set from {len(features)} to {len(important_features)} features...")

X_train_reduced = X_train[important_features]
X_val_reduced = X_val[important_features]

# Retrain on reduced feature set
stack.fit(X_train_reduced, y_train)
y_val_pred_reduced = stack.predict(X_val_reduced)
rmse_reduced = mean_squared_error(y_val, y_val_pred_reduced, squared=False)
print(f"Validation RMSE after feature pruning: {rmse_reduced:.4f} seconds")


Tuning Random Forest parameters with larger folds...
Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Step 1: Load the datasets
try:
    train_df = pd.read_csv('train.csv')  # Load the training dataset
    val_df = pd.read_csv('val.csv')      # Load the validation dataset
    test_df = pd.read_csv('test.csv')    # Load the test dataset
    sample_submission = pd.read_csv('sample_submission.csv')  # Load the sample submission file
except FileNotFoundError as e:
    print(f"Error: {e}")
    raise

# Step 2: Prepare the training data
X_train = train_df.drop(columns=['Lap_Time_Seconds'], errors='ignore')  # Replace with your target column name
y_train = train_df['Lap_Time_Seconds']  # Replace with your target column name

# Step 3: Prepare the validation data
X_val = val_df.drop(columns=['Lap_Time_Seconds'], errors='ignore')  # Replace with your target column name
y_val = val_df['Lap_Time_Seconds']  # Replace with your target column name

# Step 4: Encode the training and validation data
X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_val_encoded = pd.get_dummies(X_val, drop_first=True)

# Align the columns of the validation set with the training set
X_val_encoded = X_val_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Step 5: Train the XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train_encoded, y_train)

# Step 6: Make predictions on the validation set
y_val_pred = model.predict(X_val_encoded)

# Step 7: Evaluate the model
rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print("Root Mean Squared Error (RMSE) on validation set:", rmse)

# Step 8: Prepare the test data
X_test = test_df.drop(columns=['Unique ID'], errors='ignore')  # Drop any non-feature columns
X_test_encoded = pd.get_dummies(X_test, drop_first=True)

# Align the columns of the test set with the training set
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Step 9: Make predictions on the test set
test_predictions = model.predict(X_test_encoded)

# Step 10: Create the submission DataFrame
submission_df = pd.DataFrame({
    'Unique ID': test_df['Unique ID'],  # Include the Unique ID from the test set
    'Lap_Time_Seconds': test_predictions  # Include the predictions
})

# Step 11: Save the submission file as submission3.csv
submission_df.to_csv('submission3.csv', index=False)

print("Submission file 'submission3.csv' created successfully.")


In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error


In [3]:
import pandas as pd
train_df = pd.read_csv('train.csv')
val_df = pd.read_csv('val.csv')
test_df = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')
print("✅ Files loaded successfully.")


✅ Files loaded successfully.


In [4]:
# Separate features and target
X_train = train_df.drop(columns=['Lap_Time_Seconds'], errors='ignore')
y_train = train_df['Lap_Time_Seconds']

X_val = val_df.drop(columns=['Lap_Time_Seconds'], errors='ignore')
y_val = val_df['Lap_Time_Seconds']


In [5]:
from sklearn.preprocessing import OrdinalEncoder

# Detect categorical columns (object types)
cat_cols = X_train.select_dtypes(include='object').columns

# Use OrdinalEncoder (faster, efficient for large datasets)
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train[cat_cols] = encoder.fit_tansform(X_train[cat_cols])
X_val[cat_cols] = encoder.transform(X_val[cat_cols])



In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
train_df = pd.read_csv('train.csv')
val_df = pd.read_csv('val.csv')
test_df = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')
print("✅ Files loaded successfully.")
# Step 2: Prepare the training data
X_train = train_df.drop(columns=['Lap_Time_Seconds'], errors='ignore')  # Replace with your target column name
y_train = train_df['Lap_Time_Seconds']  # Replace with your target column name

# Step 3: Prepare the validation data
X_val = val_df.drop(columns=['Lap_Time_Seconds'], errors='ignore')  # Replace with your target column name
y_val = val_df['Lap_Time_Seconds']  # Replace with your target column name

# Step 4: Encode the training and validation data
X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_val_encoded = pd.get_dummies(X_val, drop_first=True)

# Align the columns of the validation set with the training set
X_val_encoded = X_val_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Step 5: Train the XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train_encoded, y_train)

# Step 6: Make predictions on the validation set
y_val_pred = model.predict(X_val_encoded)


✅ Files loaded successfully.


In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from math import sqrt
import warnings
warnings.filterwarnings('ignore')

# Load CSVs
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")

# Target column
target = "Lap_Time_Seconds"

# --- Drop NA only from training data ---
train_df = train_df.dropna(subset=[target])

# Combine for consistent preprocessing
df = pd.concat([train_df, val_df], axis=0, ignore_index=True)

# --- Feature Engineering ---
df['is_penalized'] = (df['Penalty'].notna() & (df['Penalty'] != 'None')).astype(int)
df['speed_per_km'] = df['Avg_Speed_kmh'] / df['Circuit_Length_km']
df['lap_time_per_km'] = df[target] / df['Circuit_Length_km']
df['laps_squared'] = df['Laps'] ** 2
df['temp_diff'] = df['Track_Temperature_Celsius'] - df['Ambient_Temperature_Celsius']
df['humidity_ratio'] = df['Humidity_%'] / 100
df['combined_temp'] = df['Track_Temperature_Celsius'] + df['Ambient_Temperature_Celsius']

# Drop unnecessary columns
drop_cols = ['Unique ID', 'rider_name', 'shortname', 'team_name', 'bike_name', 'rider', 'team', 'bike', 'Penalty']
df = df.drop(columns=drop_cols, errors='ignore')

# Encode categorical columns
cat_cols = df.select_dtypes(include="object").columns.tolist()
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
df[cat_cols] = encoder.fit_transform(df[cat_cols])

# Log-transform the target column
df[target] = np.log1p(df[target])

# Split the combined DataFrame back into train and validation sets
train_rows = train_df.shape[0]
train_data = df.iloc[:train_rows]
val_data = df.iloc[train_rows:]

X_train = train_data.drop(columns=[target])
y_train = train_data[target]
X_val = val_data.drop(columns=[target])
y_val = val_data[target]  # may contain NaN — won't be used for training

# Train LightGBM model
model = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=8,
    num_leaves=64,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_alpha=0.5,
    reg_lambda=0.5,
    min_gain_to_split=0.001,
    random_state=42,
    n_jobs=-1
)

print("Training LightGBM...")
model.fit(
    X_train, y_train,
    eval_set=[(X_val[~y_val.isna()], y_val[~y_val.isna()])],  # only use non-NaN rows for validation
    eval_metric="rmse",
    callbacks=[early_stopping(100), log_evaluation(100)]
)

# Predict on all val data
y_pred_log = model.predict(X_val)
y_pred = np.expm1(y_pred_log)

# Evaluate RMSE only on known validation targets
y_val_true = np.expm1(y_val[~y_val.isna()])
y_val_pred_known = y_pred[~y_val.isna()]

rmse = sqrt(mean_squared_error(y_val_true, y_val_pred_known))
print(f"\n✅ Final Validation RMSE (on known targets): {rmse:.4f}")

# --- Prepare submission with exactly 546874 rows ---
assert val_df.shape[0] == 546874, f"Expected 546874 rows, but got {val_df.shape[0]}"

submission = val_df[['Unique ID']].copy()
submission['Lap_Time_Seconds'] = y_pred

# Save submission file
submission.to_csv("submission2.csv", index=False)
print(f"\n📄 Submission file 'submission2.csv' has been created with {submission.shape[0]} rows.")


Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.405529 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4046
[LightGBM] [Info] Number of data points in the train set: 1914056, number of used features: 42
[LightGBM] [Info] Start training from score 4.502737
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.0562225	valid_0's l2: 0.00316097
[200]	valid_0's rmse: 0.0266165	valid_0's l2: 0.000708436
[300]	valid_0's rmse: 0.013014	valid_0's l2: 0.000169365
