# Mainly for model training

Depending on the size of your training set, you will need an [inference notebook](https://www.kaggle.com/code/regisvargas/inference-jane-street-a-beginner-s-notebook).

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport

# Sampling configuration: how many partition files to read and how many rows
# to draw from each of them.
N_PARTITIONS = 10
SAMPLE_ROWS_PER_PARTITION = 50_000

# Initialize a list to hold samples from each file
samples = []
# Load a sample from each file
for i in range(N_PARTITIONS):
    file_path = f"./database/train.parquet/partition_id={i}/part-0.parquet"
    chunk = pd.read_parquet(file_path)

    # DataFrame.sample (without replacement) raises when asked for more rows
    # than the frame holds, so cap the request at the partition's size.
    n_rows = min(SAMPLE_ROWS_PER_PARTITION, len(chunk))
    sample_chunk = chunk.sample(n=n_rows, random_state=42)
    samples.append(sample_chunk)
# Concatenate all samples into one DataFrame with a fresh 0..n-1 index
sample_df = pd.concat(samples, ignore_index=True)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Write the combined sample to disk, leaving the DataFrame index out of the file.
SAMPLE_CSV_PATH = 'sample.csv'
sample_df.to_csv(SAMPLE_CSV_PATH, index=False)

In [None]:
# Build the ydata-profiling report for the sampled rows; leaving it as the
# cell's last expression lets the notebook render it.
ProfileReport(sample_df)

# Prepare data

In [2]:
# Split the sampled frame by column-name prefix into model inputs and
# responders, and keep the per-row weights alongside them.
features = sample_df.filter(regex='^feature_')
responders = sample_df.filter(regex='^responder_')
weights = sample_df['weight']

# Inputs: every feature column as a numpy array, with NaN and +/-inf
# replaced by 0.0.
X = np.nan_to_num(features.to_numpy(), nan=0.0, posinf=0.0, neginf=0.0)

# Target: responder_6 only. The double brackets keep it two-dimensional
# (one column); non-finite values are replaced by 0.0 as for the inputs.
y = np.nan_to_num(
    responders[['responder_6']].to_numpy(), nan=0.0, posinf=0.0, neginf=0.0
)

# XGBoost

See [Feature engineering, xgboost](https://www.kaggle.com/code/dlarionov/feature-engineering-xgboost#Part-2,-xgboost) for details.

In [3]:
# Hold out 20% of the rows for validation. Features, target and weights are
# split together so they stay row-aligned; the seed makes the split repeatable.
split_parts = train_test_split(X, y, weights, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val, weights_train, weights_val = split_parts

In [4]:
# Define a learning rate schedule
def learning_rate_scheduler_xgb(epoch):
    """Return the learning rate to use for a given boosting round.

    The rate starts at 0.3 and is multiplied by 0.999 raised to the natural
    log of the round number, so it decays very slowly as training proceeds.

    Parameters
    ----------
    epoch : int or array-like of int
        Boosting round number. Rounds below 1 (including a 0-based first
        round) are treated as round 1.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        The learning rate; always finite and at most 0.3.
    """
    initial_rate = 0.3
    decay_rate = 0.999
    # log(0) is -inf, which would make decay_rate ** log(epoch) blow up to
    # +inf on round 0. Clamping to 1 gives log(1) == 0, i.e. the initial rate.
    effective_epoch = np.maximum(epoch, 1)
    return initial_rate * (decay_rate ** (np.log(effective_epoch)))

In [10]:
from xgboost import XGBRegressor
from xgboost.callback import LearningRateScheduler

# `learning_rate` must be a number, so the per-round schedule is attached as a
# training callback instead of being passed as the `learning_rate` argument.
# The callback is called with 0-based round numbers while the schedule takes
# log(round), so the round is clamped to 1 to keep the first rate finite.
lr_schedule = LearningRateScheduler(
    lambda boosting_round: float(learning_rate_scheduler_xgb(max(boosting_round, 1)))
)

# Create an XGBoost model
model_xgb = XGBRegressor(
    n_estimators=1000,
    max_depth=8,
    random_state=42,
    eval_metric='rmse',
    early_stopping_rounds=10,
    callbacks=[lr_schedule],
)

# Fit the model with sample weights and validation dataset.
# Early stopping watches the last entry of eval_set, i.e. the validation split.
model_xgb.fit(
    X_train,
    y_train,
    sample_weight=weights_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    sample_weight_eval_set=[weights_train, weights_val],
    verbose=True
)

# The schedule is only needed while training. Dropping it keeps the fitted
# estimator picklable (a lambda cannot be pickled) for the later save step.
model_xgb.set_params(callbacks=None)


[0]	validation_0-rmse:0.85303	validation_1-rmse:0.85323
[1]	validation_0-rmse:0.85130	validation_1-rmse:0.85214
[2]	validation_0-rmse:0.85001	validation_1-rmse:0.85153
[3]	validation_0-rmse:0.84881	validation_1-rmse:0.85095
[4]	validation_0-rmse:0.84785	validation_1-rmse:0.85057
[5]	validation_0-rmse:0.84705	validation_1-rmse:0.85031
[6]	validation_0-rmse:0.84619	validation_1-rmse:0.84999
[7]	validation_0-rmse:0.84546	validation_1-rmse:0.84976
[8]	validation_0-rmse:0.84476	validation_1-rmse:0.84958
[9]	validation_0-rmse:0.84414	validation_1-rmse:0.84937
[10]	validation_0-rmse:0.84334	validation_1-rmse:0.84910
[11]	validation_0-rmse:0.84270	validation_1-rmse:0.84894
[12]	validation_0-rmse:0.84206	validation_1-rmse:0.84879
[13]	validation_0-rmse:0.84154	validation_1-rmse:0.84870
[14]	validation_0-rmse:0.84088	validation_1-rmse:0.84846
[15]	validation_0-rmse:0.84008	validation_1-rmse:0.84812
[16]	validation_0-rmse:0.83961	validation_1-rmse:0.84801
[17]	validation_0-rmse:0.83902	validation

In [17]:
# Score the held-out validation features with the fitted XGBoost model.
y_pred = model_xgb.predict(X_val)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# The `squared=False` flag of mean_squared_error is deprecated and removed in
# newer scikit-learn releases, so take the square root explicitly; this form
# works on every version.
# NOTE: both metrics are unweighted here, whereas the rmse values in the
# training log are computed with the sample weights passed to `fit`.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

RMSE: 0.8782881498336792
R²: 0.04110771417617798




: 

In [None]:
import joblib
# Persist the fitted estimator with joblib so it can be reloaded later.
MODEL_PATH = "xgboost_sklearn.pkl"
joblib.dump(model_xgb, MODEL_PATH)

['xgboost_sklearn.pkl']

# Submission

See [Jane Street RMF Demo Submission](https://www.kaggle.com/code/ryanholbrook/jane-street-rmf-demo-submission) for details.

Depending on the size of your training set, you will need an [inference notebook](https://www.kaggle.com/code/regisvargas/inference-jane-street-a-beginner-s-notebook).

In [15]:
import os
import polars as pl
import kaggle_evaluation.jane_street_inference_server

In [None]:
import polars as pl
import numpy as np
# Inference callback passed to the Kaggle inference server below. It uses the
# fitted `model_xgb` from the training section and every column of the
# incoming batch whose name starts with "feature_".
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Predict responder_6 for one batch of test rows.

    Parameters
    ----------
    test : pl.DataFrame
        Batch to score. Must contain a ``row_id`` column and the
        ``feature_*`` columns.
    lags : pl.DataFrame | None
        Lagged data for the batch. When it is not None it is stored in the
        module-level ``lags_``; it is not used for the prediction itself.

    Returns
    -------
    pl.DataFrame
        Two columns, ``row_id`` and ``responder_6``, one row per row of
        ``test``. The frame is also printed.

    Raises
    ------
    AssertionError
        If the result's columns or length do not match the required format.
    TypeError
        If the result is neither a polars nor a pandas DataFrame.
    """
    global lags_
    if lags is not None:
        lags_ = lags

    # Build the model input: feature columns as a numpy array, with NaN and
    # +/-inf replaced by 0.0 (the same cleaning applied to the training data).
    feature_names = [name for name in test.columns if name.startswith("feature_")]
    model_input = np.nan_to_num(
        test.select(feature_names).to_numpy(), nan=0.0, posinf=0.0, neginf=0.0
    )
    responder_6_predictions = model_xgb.predict(model_input)

    # Pair each row_id with its prediction.
    predictions = test.select("row_id").with_columns(
        pl.Series("responder_6", responder_6_predictions)
    )
    print(predictions)

    # Enforce the submission format: exact column names and one row per input row.
    expected_columns = ['row_id', 'responder_6']
    if isinstance(predictions, pl.DataFrame):
        assert predictions.columns == expected_columns
    elif isinstance(predictions, pd.DataFrame):
        assert (predictions.columns == expected_columns).all()
    else:
        raise TypeError('The predict function must return a DataFrame')

    assert len(predictions) == len(test)
    return predictions

In [17]:
# Wrap `predict` in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # The rerun flag is unset or empty: drive the server through the local
    # gateway using the test and lags parquet inputs.
    local_inputs = (
        '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
        '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
    )
    inference_server.run_local_gateway(local_inputs)
else:
    # The rerun flag is set: start serving.
    inference_server.serve()

shape: (39, 2)
┌────────┬─────────────┐
│ row_id ┆ responder_6 │
│ ---    ┆ ---         │
│ i64    ┆ f32         │
╞════════╪═════════════╡
│ 0      ┆ 0.083174    │
│ 1      ┆ 0.083174    │
│ 2      ┆ 0.083174    │
│ 3      ┆ 0.083174    │
│ 4      ┆ 0.083174    │
│ …      ┆ …           │
│ 34     ┆ 0.083174    │
│ 35     ┆ 0.083174    │
│ 36     ┆ 0.083174    │
│ 37     ┆ 0.083174    │
│ 38     ┆ 0.083174    │
└────────┴─────────────┘
