In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score

# Read the training set, the test set and the submission template
training_data = pd.read_csv('/content/training_data.csv')
test_data = pd.read_csv('/content/testing_data.csv')
sample_submission = pd.read_csv('/content/sample_submission.csv')

# Apply identical cleaning and feature extraction to both frames.
# Column assignment mutates each DataFrame in place.
for df in (training_data, test_data):
    # Unparseable timestamps are coerced to NaT instead of raising
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

    # Strip the 'M' suffix so 'ram_limit' can be cast to float
    ram_without_unit = df['ram_limit'].str.replace('M', '')
    df['ram_limit'] = ram_without_unit.astype(float)

    # Calendar features derived from the parsed timestamp
    timestamp_parts = df['timestamp'].dt
    df['hour'] = timestamp_parts.hour
    df['day_of_week'] = timestamp_parts.dayofweek
    df['month'] = timestamp_parts.month

# Columns routed to each preprocessing branch
numeric_features = [
    'cpu_limit', 'ram_limit', 'cpu_usage', 'ram_usage', 'num_req',
    'conc_lvl', 'hour', 'day_of_week', 'month',
]
categorical_features = ['cpu_type']

# Numeric branch: mean-impute missing values, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode with handle_unknown='ignore'
categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Send each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full estimator: preprocessing followed by a LightGBM regressor.
# The step name 'model' is what the 'model__*' grid keys refer to.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(random_state=42)),
])

# 'latency' is the regression target; the raw timestamp is dropped from
# the feature frame along with the target column
y = training_data['latency']
X = training_data.drop(columns=['latency', 'timestamp'])

# Hold out 20% of the rows for validation, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate hyperparameters for the pipeline's 'model' step
param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__max_depth=[10, 20, 30],
    model__learning_rate=[0.01, 0.05, 0.1],
    model__num_leaves=[31, 62, 127],
)

# 5-fold cross-validated grid search scored by R², using all cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search
best_model = grid_search.best_estimator_

# Score the selected model on the held-out validation rows
y_val_pred = best_model.predict(X_val)
r2 = r2_score(y_val, y_val_pred)
print('Validation R²:', r2)

# Predict on test data (the raw timestamp is dropped, as it was for training)
X_test = test_data.drop(columns=['timestamp'])
predictions = best_model.predict(X_test)

# Reuse the submission template when its row count matches the predictions
if len(predictions) == len(sample_submission):
    sample_submission['Prediction'] = predictions
else:
    print("Warning: The number of predictions does not match the number of rows in sample_submission.")
    # Rebuild the submission from the test rows instead of the template.
    # Use the test set's 'ID' column when it exists; otherwise fall back to
    # the row index so the predictions are still written out rather than
    # being lost to a KeyError after the grid search has already finished.
    if 'ID' in test_data.columns:
        submission_ids = test_data['ID']
    else:
        print("Warning: 'ID' column not found in test data; using the row index as ID.")
        submission_ids = test_data.index
    sample_submission = pd.DataFrame({
        'ID': submission_ids,
        'Prediction': predictions
    })

# Save the submission file
sample_submission.to_csv('/content/submission.csv', index=False)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000743 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 848
[LightGBM] [Info] Number of data points in the train set: 32907, number of used features: 116
[LightGBM] [Info] Start training from score 6489539.216337
Validation R²: 0.9631131048196663
