In [1]:
import gc

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# enable_iterative_imputer must be imported before IterativeImputer is used
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute
from sklearn.base import clone
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Raw competition training set, read straight from the Kaggle input directory.
train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/optiver-trading-at-the-close/train.csv',
)
# The bundled example test file is intentionally not loaded here:
# test = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv')

In [3]:
# Helper used to build missing-value indicator columns
def binary(x):
    """Return ``np.int8(1)`` when the scalar ``x`` is NaN, else ``np.int8(0)``."""
    if np.isnan(x):
        return np.int8(1)
    return np.int8(0)

# Preprocessing applied to whichever frame is passed in (training data or a test batch)
def preprocessing_data(data):
    """Clean, scale, impute and one-hot encode an order-book frame.

    Steps, in order:

    1. drop the ``row_id`` column;
    2. drop rows that miss any of the core book columns;
    3. add 0/1 ``far_price_missing`` / ``near_price_missing`` indicators;
    4. min-max scale the continuous columns to ``[0, 1]``;
    5. fill the remaining gaps with ``IterativeImputer``;
    6. one-hot encode the categorical columns.

    The frame passed in is NOT modified; all work happens on a new frame, so
    the function can safely be called again on the same input.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``row_id`` plus every column referenced below. All
        columns have to be numeric because the whole frame is handed to the
        imputer.

    Returns
    -------
    pandas.DataFrame
        The transformed frame. Rows with missing core columns are absent and
        the original index labels of the surviving rows are kept.

    Notes
    -----
    NOTE(review): the scaler and the imputer are fitted on the frame passed
    in, so the training data and each test batch are transformed with their
    own statistics. Consider fitting once on the training data and reusing
    the fitted objects.
    NOTE(review): the imputer sees every column of the frame, which includes
    ``target`` when it is present - confirm that this is intended.
    """
    # Rows lacking any of these cannot be used at all.
    core_cols = ['imbalance_size', 'reference_price', 'matched_size',
                 'bid_price', 'ask_price', 'wap']

    # Method chain instead of inplace=True so the caller's frame stays intact.
    data = (
        data
        .drop(columns=['row_id'])       # identifier only, not a feature
        .dropna(subset=core_cols)
        .assign(
            # (0/1) flags marking the data points that will be imputed
            far_price_missing=lambda df: df['far_price'].isna().astype(np.int8),
            near_price_missing=lambda df: df['near_price'].isna().astype(np.int8),
        )
    )

    # Data scaling. The two *_missing flags are deliberately left out: they
    # are already 0/1, and MinMaxScaler maps a constant column to the lower
    # bound of feature_range, which would turn an all-1 flag column (a batch
    # where every far/near price is missing) into all 0.
    col_to_scale = [
        'imbalance_size',
        'reference_price',
        'matched_size',
        'far_price',
        'near_price',
        'bid_price',
        'bid_size',
        'ask_price',
        'ask_size',
        'wap',
    ]
    scaler = MinMaxScaler(feature_range=(0, 1)).set_output(transform='pandas')
    data[col_to_scale] = scaler.fit_transform(data[col_to_scale])

    # Infer the missing price data; fixed seed keeps the imputation repeatable.
    imputer = impute.IterativeImputer(random_state=21)
    imputer.set_output(transform='pandas')
    data = imputer.fit_transform(data)

    # One-hot encode the categorical columns. The dummy columns produced
    # depend on the values present in this particular frame.
    cols_to_dummies = [
        'stock_id',
        'date_id',
        'seconds_in_bucket',
        'imbalance_buy_sell_flag',
    ]
    data = pd.get_dummies(data,
                          columns=cols_to_dummies,
                          sparse=True,
                          drop_first=True,
                          dtype=np.int8)

    return data

In [4]:
# Helper functions

def generate_batch(X, y, chunksize):
    """Yield consecutive ``(X_chunk, y_chunk)`` slices of at most ``chunksize`` items.

    Parameters
    ----------
    X, y : sliceable sequences
        Sliced with ``obj[start:end]``; the number of chunks is driven by
        ``len(X)``.
    chunksize : int
        Maximum number of items per chunk; must be positive.

    Yields
    ------
    tuple
        ``(X[start:end], y[start:end])`` for each consecutive window.

    Raises
    ------
    ValueError
        If ``chunksize`` is not positive. Because this is a generator the
        error surfaces when iteration starts, not when the function is called.
    """
    # A non-positive step would never advance through the data.
    if chunksize <= 0:
        raise ValueError(f"chunksize must be a positive integer, got {chunksize!r}")

    total = len(X)
    for start in range(0, total, chunksize):
        end = min(start + chunksize, total)
        yield X[start:end], y[start:end]
        
def batch_generator_test(X, chunk_size):
    """Yield consecutive slices of ``X`` holding at most ``chunk_size`` items.

    Parameters
    ----------
    X : sliceable sequence
        Sliced with ``X[start:end]``.
    chunk_size : int
        Maximum number of items per chunk; must be positive.

    Yields
    ------
    object
        ``X[start:end]`` for each consecutive window.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive. Because this is a generator the
        error surfaces when iteration starts, not when the function is called.
    """
    # A non-positive step would never advance through the data.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    total = len(X)
    for start in range(0, total, chunk_size):
        yield X[start:min(start + chunk_size, total)]
        
def train_model(model, X, y, chunksize=10000):
    """Fit ``model`` incrementally by feeding it ``(X, y)`` chunk by chunk.

    Parameters
    ----------
    model : estimator
        Any object exposing ``partial_fit(X_chunk, y_chunk)``. It is updated
        in place.
    X, y : sliceable sequences
        Training features and targets, chunked by ``generate_batch``.
    chunksize : int, default 10000
        Number of rows handed to each ``partial_fit`` call.

    Returns
    -------
    estimator
        The same ``model`` object that was passed in.
    """
    for X_chunk, y_chunk in generate_batch(X, y, chunksize=chunksize):
        model.partial_fit(X_chunk, y_chunk)

    return model

def scoring(train_model, X, y, chunksize=10000):
    """Print and collect the mean absolute error of each chunk of ``(X, y)``.

    Parameters
    ----------
    train_model : estimator
        Fitted object exposing ``predict``. Inside this function the name
        shadows the module-level ``train_model`` function.
    X, y : sliceable sequences
        Features and true targets, chunked by ``generate_batch``.
    chunksize : int, default 10000
        Number of rows scored per chunk.

    Returns
    -------
    list
        One mean-absolute-error value per chunk, in chunk order.
    """
    scores = []
    for X_chunk, y_chunk in generate_batch(X, y, chunksize=chunksize):
        # Predict once per chunk and reuse the value for both the list and the log line.
        chunk_mae = mean_absolute_error(y_chunk, train_model.predict(X_chunk))
        scores.append(chunk_mae)
        print(chunk_mae)

    return scores

def save_predictions(train_model, X, chunk_size=10000):
    """Predict ``X`` chunk by chunk and return all predictions as a DataFrame.

    Despite the name, nothing is written to disk; the predictions are only
    collected and returned.

    Parameters
    ----------
    train_model : estimator
        Fitted object whose ``predict`` returns something with ``.tolist()``.
    X : sliceable sequence
        Features to predict, chunked by ``batch_generator_test``.
    chunk_size : int, default 10000
        Number of rows passed to each ``predict`` call.

    Returns
    -------
    pandas.DataFrame
        The concatenated predictions, in input order.
    """
    predictions = []
    # Honour the caller's chunk_size instead of a hard-coded value.
    for X_chunk in batch_generator_test(X, chunk_size=chunk_size):
        predictions.extend(train_model.predict(X_chunk).tolist())
    return pd.DataFrame(predictions)

In [5]:
from sklearn.linear_model import SGDRegressor

# Linear regressor trained by SGD with a Huber loss and an L1 penalty.
# random_state is fixed (same seed as the imputer) so that the shuffling done
# during fitting, and therefore the fitted weights, are reproducible.
# NOTE(review): per the scikit-learn docs validation_fraction only matters when
# early_stopping=True, and partial_fit runs a single pass, so tol is unlikely
# to have any effect with the chunked training used in this notebook - confirm.
model = SGDRegressor(
    loss='huber',
    epsilon=0.1,
    penalty='l1',
    fit_intercept=True,
    validation_fraction=.3,
    tol=1e-2,
    random_state=21,
)

In [None]:
# baseline_submission
import optiver2023
# optiver2023.make_env.func_dict['__called__'] = False

env = optiver2023.make_env()
iter_test = env.iter_test()

# The training data does not change between test batches, so it is loaded and
# preprocessed exactly once instead of on every iteration.
train = preprocessing_data(pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv'))
y = train['target']
train_columns = set(train.columns)

# The columns shared by train and a test batch may differ from batch to batch
# (the dummy columns depend on the values present in the batch). One model is
# fitted per distinct feature set and reused afterwards; a fresh clone is used
# each time because a model already fitted on one feature set cannot be
# partial_fit on another.
fitted_models = {}

counter = 0
for (test, revealed_targets, sample_prediction) in iter_test:
    # Remember the incoming row labels: preprocessing drops incomplete rows.
    test_index = test.index.copy()
    test_ = preprocessing_data(test)

    # Sorted so that the feature order is deterministic.
    common_features = sorted(train_columns & set(test_.columns))
    feature_key = tuple(common_features)
    if feature_key not in fitted_models:
        fitted_models[feature_key] = train_model(clone(model), train[common_features], y)
    baseline_model = fitted_models[feature_key]

    # Align predictions with the original rows. Rows removed during
    # preprocessing get 0.0 so the vector length matches the batch.
    # NOTE(review): assumes `test` and `sample_prediction` arrive in the same
    # row order with unique index labels - confirm against the API.
    predictions = pd.Series(baseline_model.predict(test_[common_features]),
                            index=test_.index)
    sample_prediction['target'] = predictions.reindex(test_index, fill_value=0.0).to_numpy()
    env.predict(sample_prediction)
    counter += 1