In [207]:
from torch import nn
import torch
import pandas as pd

from torch.optim import Adam

from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    PolynomialFeatures,
    SplineTransformer,
    OneHotEncoder,
)
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, r2_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.base import BaseEstimator
from sklearn.model_selection import KFold, cross_validate
from sklearn.cluster import KMeans
from sklearn.gaussian_process import kernels, GaussianProcessRegressor

from bayes_opt import BayesianOptimization

from plotnine import (
    ggplot,
    aes,
    geom_histogram,
    facet_wrap,
    geom_point,
    geom_smooth,
    geom_bar,
)

# Prefer the Metal (MPS) backend when this machine exposes it; otherwise fall
# back to the CPU so tensor creation does not fail on hosts without MPS.
torch.set_default_device("mps" if torch.backends.mps.is_available() else "cpu")

In [175]:
# Read both CSV files, using the SEQN column as the row index of each frame.
train, test = (
    pd.read_csv(csv_path, index_col="SEQN")
    for csv_path in ("../data/train.csv", "../data/test.csv")
)
# Detach the "y" column so that `train` no longer contains it.
y = train.pop("y")
train.head()

Unnamed: 0_level_0,self_eval,teacher_eval,extracurricular,district,SRP_1,SRP_2,SRP_3,SRP_4,SRP_5,SRP_6,...,SRP_41,SRP_42,SRP_43,SRP_44,SRP_45,SRP_46,SRP_47,SRP_48,SRP_49,SRP_50
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
969167,4,5,9,3,-0.181,-0.379,-0.164,0.08,0.378,1.581,...,2.07,-1.156,-0.73,-0.508,-0.497,0.224,0.412,-0.517,0.099,0.114
188942,4,3,5,4,-0.126,1.603,1.021,0.489,-1.404,-0.955,...,1.478,-0.318,1.24,-1.993,2.021,-1.078,-0.277,0.802,0.253,-0.72
134058,1,2,8,5,0.724,-0.702,2.249,0.91,0.33,0.411,...,0.119,0.449,1.98,-0.401,-0.544,-0.944,1.592,0.875,-0.734,-2.336
124022,3,3,10,6,0.706,-0.302,1.023,-0.895,0.625,1.283,...,1.249,2.025,-2.289,-0.407,0.025,-0.515,0.408,1.38,-1.075,-2.451
685285,5,5,1,5,-0.35,-1.001,0.931,0.192,0.491,0.292,...,0.341,-0.118,-0.288,0.457,-0.566,0.822,-0.317,0.661,2.096,0.004


In [211]:
class RowSum(BaseEstimator):
    """Stateless transformer that collapses every row into a single sum.

    Nothing is learned during ``fit``; ``transform`` adds up the values of each
    row and returns them as one column.
    """

    def __init__(self) -> None:
        super().__init__()

    def fit(self, *args, **kwargs):
        """Do nothing and return ``self``.

        Returning the estimator (rather than ``None``) keeps the usual
        ``RowSum().fit(X).transform(X)`` chaining working.
        """
        return self

    def transform(self, X, y=None, **kwargs):
        """Return the per-row sums of ``X`` as an array of shape ``(n_rows, 1)``.

        ``X`` may be a pandas DataFrame or any object whose ``sum(axis=1)``
        result supports ``reshape``; ``y`` and extra keyword arguments are
        accepted but ignored.
        """
        # DataFrames are unwrapped to their underlying array first so that the
        # result of ``sum`` supports ``reshape``.
        rows = X.values if isinstance(X, pd.DataFrame) else X
        return rows.sum(axis=1).reshape(-1, 1)

    def fit_transform(self, X, y=None, **kwargs):
        """Equivalent to ``transform``, since there is no state to fit."""
        return self.transform(X, y, **kwargs)

    def get_feature_names_out(self, X=None, y=None, **kwargs):
        """Return the name of the single output column.

        All arguments are ignored; ``X`` is optional so the method can also be
        called without input feature names.
        """
        return ["rowSums"]


# Collapse the selected columns into one row-sum feature, then standardise it.
sumRows = Pipeline(steps=[("sums", RowSum()), ("scale", StandardScaler())])

# Column routing: "district" is one-hot encoded (first level dropped), the
# SRP_1 .. SRP_50 columns go through `sumRows`, and every remaining column is
# passed through unchanged.
srp_columns = [f"SRP_{idx}" for idx in range(1, 51)]
district_encoder = OneHotEncoder(sparse_output=False, drop="first")
colTrans = ColumnTransformer(
    transformers=[
        ("one_hot", district_encoder, ["district"]),
        ("collapse", sumRows, srp_columns),
    ],
    remainder="passthrough",
)

# Full feature pipeline: column routing, then interaction-only polynomial
# features, then a spline expansion with quantile-based knots.
preprocessor = Pipeline(
    steps=[
        ("setup", colTrans),
        ("polys", PolynomialFeatures(interaction_only=True)),
        ("splines", SplineTransformer(knots="quantile")),
    ]
)

# pd.DataFrame(out, columns=preprocessor.get_feature_names_out())

In [212]:
# Cross-validate each candidate regressor behind the shared preprocessing
# pipeline, printing and collecting the result dictionary of every run.
r2_scorer = make_scorer(r2_score)
candidate_models = [
    LinearRegression(),
    # RandomForestRegressor(n_estimators=256, n_jobs=8)
    # HistGradientBoostingRegressor(max_iter=500),
]

model_results = []
for model in candidate_models:
    model_pipeline = Pipeline(steps=[("data", preprocessor), ("model", model)])
    res = cross_validate(
        estimator=model_pipeline,
        X=train,
        y=y,
        scoring=r2_scorer,
        return_estimator=False,
    )
    print(res)
    model_results.append(res)

{'fit_time': array([0.15550017, 0.20931816, 0.18200397, 0.14759707, 0.15532684]), 'score_time': array([0.00817394, 0.02113986, 0.02015209, 0.00836205, 0.00839496]), 'test_score': array([0.88683104, 0.88802109, 0.88618439, 0.87380475, 0.89838823])}


KeyboardInterrupt: 

In [None]:
# Fit preprocessing + model on the complete training frame.
# NOTE(review): `model` is whatever the loop variable of the cross-validation
# cell last held — confirm that this is the estimator intended for the final fit.
final_model = Pipeline([("preprocess", preprocessor), ("model", model)]).fit(train, y=y)
# Predict from `test` without any existing "y" column: this cell stores its
# predictions in test["y"], so on a re-run the frame would otherwise carry a
# column that was not present when the pipeline was fitted.
test["y"] = final_model.predict(test.drop(columns="y", errors="ignore"))


In [234]:
# Gaussian-process variant: the column routing matches the earlier
# preprocessing cell, but the polynomial and spline steps are left disabled.
district_encoder = OneHotEncoder(sparse_output=False, drop="first")
colTrans = ColumnTransformer(
    transformers=[
        ("one_hot", district_encoder, ["district"]),
        ("collapse", sumRows, [f"SRP_{idx}" for idx in range(1, 51)]),
    ],
    remainder="passthrough",
)

preprocessor = Pipeline(
    steps=[
        ("setup", colTrans),
        # ("polys", PolynomialFeatures(interaction_only=True)),
        # ("splines", SplineTransformer(knots="quantile")),
    ]
)

# Kernel = scaled RBF term plus scaled white-noise term.
signal_kernel = 1.5**2 * kernels.RBF()
noise_kernel = 0.5**2 * kernels.WhiteKernel()
gp_kern = signal_kernel + noise_kernel
gp_regressor = GaussianProcessRegressor(kernel=gp_kern)
mod = Pipeline(steps=[("prep", preprocessor), ("mod", gp_regressor)])

cv_res = cross_validate(estimator=mod, X=train, y=y, scoring=make_scorer(r2_score))
print(cv_res)

{'fit_time': array([61.86457491, 12.23607993, 12.69011211, 12.46443987, 65.54506397]), 'score_time': array([0.34809518, 0.34036803, 0.4498477 , 0.38145304, 0.34547806]), 'test_score': array([0.82867981, 0.81971713, 0.82209612, 0.81563986, 0.82498952])}
