In [1]:
# !pip install textstat textblob pandarallel unidecode polars==0.19.13

In [2]:
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
import lightgbm as lgb
from tqdm.auto import tqdm
import re
import pandas as pd
import numpy as np
import spacy
import string
from sklearn.metrics import mean_squared_error
from utils import TextProcessor, RawProcessor, revealing_text
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold

import polars as pl

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

import warnings
warnings.filterwarnings('ignore')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
train_scores = pd.read_csv('../raw_data/train_scores.csv')
raw_df = pd.read_csv("../raw_data/train_logs.csv")

In [4]:
train_raw_feats = RawProcessor(raw_df)

Creating kpps features...
Creating event-count features...
Creating numerical-categorical aggregation features...
Creating pause features...
Creating PR-Burst features...


In [5]:
reveal_results = raw_df.groupby("id").parallel_apply(revealing_text)
df = pd.DataFrame(reveal_results.tolist(), columns=["id", "revealed_text"])
df = df.merge(train_scores, how="left", on="id")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=309), Label(value='0 / 309'))), HB…

In [6]:
df = TextProcessor(df)

creating complete features


In [7]:
df = df.merge(train_raw_feats, how="left", on="id")

In [8]:
df["text_length_timeratio"] = df["text_length"] / (df["up_time_max"] / 1000)

## Train

In [9]:
feature_cols = df.drop(["id", "revealed_text", "score", "word", "sent", "paragraph"], axis=1).columns
label = "score"

In [10]:
from collections import defaultdict
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

rmse = lambda y_true, y_pred: mean_squared_error(y_true, y_pred, squared=False)

drop_cols = ["id", "score", "fold"]
oof_df = pd.DataFrame()
models = defaultdict(list)

models_to_ensemble = [
    "lgbm",
    "xgboost",
    "catboost"
]

model_params = {
    "catboost":{
        'learning_rate': 0.024906985231770738, 'depth': 5, 'l2_leaf_reg': 3.7139894959529283, 'subsample': 0.18527466886647015, 'colsample_bylevel': 0.6552973951000719, 'min_data_in_leaf': 93,
                "silent": True,
                "iterations": 1000,
                "random_state": 1,
               "use_best_model":False},

    "lgbm":  {
        'reg_alpha': 1.0894488472899402, 'reg_lambda': 6.290929934336985, 'colsample_bytree': 0.6218522907548012, 'subsample': 0.9579924238280629, 'learning_rate': 0.0027076430412427566, 'max_depth': 8, 'num_leaves': 947, 'min_child_samples': 57,
        'n_estimators': 2500,
        'metric': 'rmse',
         'random_state': 42,
        'verbosity': -1,
        'force_col_wise': True
             },
    
    "xgboost": {'max_depth': 2, 'learning_rate': 0.009998236038809146, 'n_estimators': 1000, 'min_child_weight': 17, 'gamma': 0.1288249858838246, 'subsample': 0.5078057280148618, 'colsample_bytree': 0.7355762136239921, 'reg_alpha': 0.670956206987811, 'reg_lambda': 0.06818351284100388, 'random_state': 1,
                "tree_method": "gpu_hist",
               }

}

skf = StratifiedKFold(n_splits=10,random_state=1111111, shuffle=True)
splits = list(skf.split(df, df["score"].astype(str)))


for idx, model_name in enumerate(models_to_ensemble):
    params = model_params[model_name]
    oof_folds = pd.DataFrame()
    print(f"Started the {model_name} model...")
    for i, (train_index, test_index) in tqdm(enumerate(splits)):
        if model_name == "lgbm":
            model = LGBMRegressor(**params)
        elif model_name == "xgboost":
            model = xgb.XGBRegressor(**params)
        elif model_name == "catboost":
            model = CatBoostRegressor(**params)
        else:
            raise ValueError("Unknown base model name.")

        x_train, y_train = df.loc[train_index, feature_cols].reset_index(drop=True), df.loc[train_index, label].reset_index(drop=True)
        x_valid, y_valid = df.loc[test_index, feature_cols].reset_index(drop=True), df.loc[test_index, label].reset_index(drop=True)
        x_valid, y_valid = df.loc[test_index, feature_cols].reset_index(drop=True), df.loc[test_index, label].reset_index(drop=True)
        ids = df.loc[test_index, "id"].reset_index(drop=True)
        
        if model_name != "lgbm":
            model.fit(x_train,
                      y_train)
        else:
            model.fit(x_train,
                      y_train,
                      callbacks = [lgb.log_evaluation(period=0)]
                     )

        val_preds = model.predict(x_valid)
        
        oof_fold = pd.concat(
            [ids, y_valid, pd.Series(val_preds)], 
            axis=1).rename({0: f"{model_name}_preds"}, axis="columns")
        oof_folds = pd.concat([oof_folds, oof_fold])
        models[model_name].append(model)
        print(f"Fold: {i} - Score: {rmse(oof_fold['score'], oof_fold[f'{model_name}_preds']):.5f}")
    
    if idx == 0:
        oof_df = pd.concat([oof_df, oof_folds])
    else:
        oof_df[f"{model_name}_preds"] = oof_folds[f"{model_name}_preds"]
    cv_score = rmse(oof_df["score"], oof_df[f"{model_name}_preds"])
    print(f"{model_name} cv_score: ", round(cv_score, 5))

Started the lgbm model...


0it [00:00, ?it/s]

Fold: 0 - Score: 0.57585
Fold: 1 - Score: 0.61747
Fold: 2 - Score: 0.53880
Fold: 3 - Score: 0.59232
Fold: 4 - Score: 0.63189
Fold: 5 - Score: 0.58217
Fold: 6 - Score: 0.60365
Fold: 7 - Score: 0.60724
Fold: 8 - Score: 0.67683
Fold: 9 - Score: 0.60274
lgbm cv_score:  0.60387
Started the xgboost model...


0it [00:00, ?it/s]

Fold: 0 - Score: 0.56781
Fold: 1 - Score: 0.60417
Fold: 2 - Score: 0.54103
Fold: 3 - Score: 0.59197
Fold: 4 - Score: 0.62732
Fold: 5 - Score: 0.57899
Fold: 6 - Score: 0.59706
Fold: 7 - Score: 0.61034
Fold: 8 - Score: 0.66691
Fold: 9 - Score: 0.59887
xgboost cv_score:  0.5993
Started the catboost model...


0it [00:00, ?it/s]

Fold: 0 - Score: 0.56605
Fold: 1 - Score: 0.59448
Fold: 2 - Score: 0.52762
Fold: 3 - Score: 0.58715
Fold: 4 - Score: 0.61978
Fold: 5 - Score: 0.58302
Fold: 6 - Score: 0.59529
Fold: 7 - Score: 0.60183
Fold: 8 - Score: 0.66257
Fold: 9 - Score: 0.60102
catboost cv_score:  0.59478


In [11]:
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.optimize import minimize

pred_cols = [f"{model_name}_preds" for model_name in models_to_ensemble]
true_targets = oof_df["score"]

def objective_function(weights):
    ensemble_preds = (oof_df[pred_cols] * weights).sum(axis=1)
    score = rmse(oof_df["score"], ensemble_preds)
    return score

def find_weights(oof_df):
    len_models = len(models_to_ensemble)
    initial_weights = np.ones(len_models) / len_models
    bounds = [(0, 1)] * len_models
    result = minimize(objective_function, initial_weights, bounds=bounds, method='SLSQP') # L-BFGS-B
    optimized_weights = result.x
    optimized_weights /= np.sum(optimized_weights)
    return optimized_weights

optimized_weights = find_weights(oof_df)
print("Optimized Weights:", optimized_weights)

Optimized Weights: [0.12952723 0.05684494 0.81362782]


In [12]:
oof_df["ensemble_optimized_preds"] = (oof_df[pred_cols] * optimized_weights).sum(axis=1)
cv_optimized = rmse(oof_df["score"], oof_df["ensemble_optimized_preds"])
print("cv_score with optimized weights: ", round(cv_optimized, 5))

cv_score with optimized weights:  0.5945


In [13]:
optimized_weights, models_to_ensemble

(array([0.12952723, 0.05684494, 0.81362782]), ['lgbm', 'xgboost', 'catboost'])

In [14]:
import pickle
with open('allmodels.mdls', 'wb') as fp:
    pickle.dump(models, fp)