In [227]:
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from torchmetrics.regression import R2Score
import torch
import pandas as pd

from torch.optim import Adam

from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    PolynomialFeatures,
    SplineTransformer,
    OneHotEncoder,
)
from sklearn.pipeline import Pipeline, FeatureUnion, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, r2_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.base import BaseEstimator
from sklearn.model_selection import KFold, cross_validate
from sklearn.cluster import KMeans
from sklearn.gaussian_process import kernels, GaussianProcessRegressor

from bayes_opt import BayesianOptimization

from plotnine import (
    ggplot,
    aes,
    geom_histogram,
    facet_wrap,
    geom_point,
    geom_smooth,
    geom_bar,
)

torch.set_default_device("mps")

In [228]:
train = pd.read_csv("../data/train.csv", index_col="SEQN")
test = pd.read_csv("../data/test.csv", index_col="SEQN")
y = train.pop("y")
train.head()

Unnamed: 0_level_0,self_eval,teacher_eval,extracurricular,district,SRP_1,SRP_2,SRP_3,SRP_4,SRP_5,SRP_6,...,SRP_41,SRP_42,SRP_43,SRP_44,SRP_45,SRP_46,SRP_47,SRP_48,SRP_49,SRP_50
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
969167,4,5,9,3,-0.181,-0.379,-0.164,0.08,0.378,1.581,...,2.07,-1.156,-0.73,-0.508,-0.497,0.224,0.412,-0.517,0.099,0.114
188942,4,3,5,4,-0.126,1.603,1.021,0.489,-1.404,-0.955,...,1.478,-0.318,1.24,-1.993,2.021,-1.078,-0.277,0.802,0.253,-0.72
134058,1,2,8,5,0.724,-0.702,2.249,0.91,0.33,0.411,...,0.119,0.449,1.98,-0.401,-0.544,-0.944,1.592,0.875,-0.734,-2.336
124022,3,3,10,6,0.706,-0.302,1.023,-0.895,0.625,1.283,...,1.249,2.025,-2.289,-0.407,0.025,-0.515,0.408,1.38,-1.075,-2.451
685285,5,5,1,5,-0.35,-1.001,0.931,0.192,0.491,0.292,...,0.341,-0.118,-0.288,0.457,-0.566,0.822,-0.317,0.661,2.096,0.004


In [229]:
y

SEQN
969167   -1.315
188942    1.997
134058    3.709
124022    1.155
685285   -1.960
          ...  
970998   -0.139
971286    0.394
852862    0.597
138992    1.408
24075    -2.339
Name: y, Length: 8000, dtype: float64

In [235]:
class RowSum(BaseEstimator):

    def __init__(self) -> None:
        super().__init__()

    def fit(self, *args, **kwargs):
        pass

    def transform(self, X, y=None, **kwargs):
        if isinstance(X, pd.DataFrame):
            return X.values.sum(axis=1).reshape(-1, 1)
        else:
            return X.sum(axis=1).reshape(-1, 1)

    def fit_transform(self, X, y=None, **kwargs):
        return self.transform(X, y, **kwargs)

    def get_feature_names_out(self, X, y=None, **kwargs):
        return ["rowSums"]


sumRows = Pipeline(
    [
        ("sums", RowSum()),
        ("scale", StandardScaler()),
    ]
)

colTrans = ColumnTransformer(
    [
        ("one_hot", OneHotEncoder(sparse_output=False), ["district"]),
        ("collapse", sumRows, [f"SRP_{i}" for i in range(1, 51)]),
    ],
    remainder="passthrough",
)

preprocessor = Pipeline(
    [
        ("setup", colTrans),
        ("polys", PolynomialFeatures(interaction_only=True)),
        ("splines", SplineTransformer(knots="quantile", n_knots=10, extrapolation="continue")),
    ]
)

# pd.DataFrame(out, columns=preprocessor.get_feature_names_out())

In [237]:
model_results = []
for model in [
    LinearRegression(),
    # RandomForestRegressor(n_estimators=256, n_jobs=8)
    # HistGradientBoostingRegressor(max_iter=500),
]:
    model_pipeline = Pipeline([("data", preprocessor), ("model", model)])
    res = cross_validate(
        model_pipeline,
        X=train,
        y=y,
        scoring=make_scorer(r2_score),
        return_estimator=False,
    )
    print(res)
    print(res["test_score"].mean())
    model_results.append(res)

{'fit_time': array([0.74278879, 0.54474378, 0.87005687, 0.65807509, 1.05137706]), 'score_time': array([0.0091722 , 0.01896024, 0.0129559 , 0.01181793, 0.00823784]), 'test_score': array([ 0.87334896,  0.84110823,  0.8119575 , -0.16400286,  0.88635388])}
0.6497531431021331


In [111]:
final_model = Pipeline([("preprocess", preprocessor), ("model", model)]).fit(train, y=y)
test["y"] = final_model.predict(test)

In [None]:
colTrans = ColumnTransformer(
    [
        ("one_hot", OneHotEncoder(sparse_output=False), ["district"]),
        # ("collapse", sumRows, [f"SRP_{i}" for i in range(1, 51)]),
    ],
    remainder="passthrough",
)

nnet_prep = Pipeline(
    [
        ("setup", colTrans),
        ("scale", StandardScaler()),
        # ("polys", PolynomialFeatures(interaction_only=True)),
        # ("splines", SplineTransformer(knots="quantile", n_knots=5, extrapolation="linear")),
        ("tensor", FunctionTransformer(lambda x: torch.tensor(x, dtype=torch.float32))),
    ]
)

nnet_out = nnet_prep.fit_transform(train)

In [212]:
# deepnet = nn.Sequential(
#             nn.Linear(nnet_out.shape[1], 16),
#             nn.Sigmoid(),
#             nn.Linear(8, 4),
#             nn.Sigmoid(),
#             nn.Linear(4, 1),
#         )
# loss = nn.MSELoss()

# mse_loss = R2Score()
y_tensor = torch.tensor(y.values, dtype=torch.float32)
# print(y_tensor)

# tt_split = int(nnet_out.shape[0] * 0.9)

class NNetSklearn(BaseEstimator):

    def __init__(self, epochs=20, lr=0.01, loss_fn=nn.MSELoss, optimizer=Adam, **optim_args) -> None:
        super().__init__()
        self.model =  nn.Sequential(
            nn.Linear(nnet_out.shape[1], 40),
            nn.ReLU(),
            nn.Linear(40, 1),
        )
        self.loss = loss_fn()
        self.epochs = epochs
        self.lr = lr
        self.optimizer = optimizer(lr=lr, **optim_args)

    def predict(self, X):
        return self.model.forward(X)

    def fit(self, train_x, train_y, val_x, val_y):
        for epoch in self.epochs:
            preds = self.model(train_x).squeeze()
            train_loss = self.loss(train_y, preds)
            train_r2 = r2_score(train_y.detach().cpu().numpy(), train_y.detach().cpu().numpy())
            train_loss.backward()

            self.optimizer.step()
            self.optimizer.zero_grad()

            with torch.no_grad():
                val_hat = deepnet(val_x).squeeze()
                val_loss = mse_loss(val_y, val_hat)
                val_r2 = r2_score(val_y.detach().cpu().numpy(), val_hat.detach().cpu().numpy())

            if epoch % 10 == 0:
                print(f"Train MSE: {train_loss} -- Val MSE: {val_loss} | Train/Val R2: {val_r2} ")



# splits

# val_x, val_y = nnet_out[tt_split:], y_tensor[tt_split:]
# train_x, train_y = nnet_out[:tt_split], y_tensor[:tt_split]


In [226]:
# train_mse = 0.0
folder = KFold(n_splits=5, shuffle=True)
splits = folder.split(nnet_out, y_tensor)
for i, (train_id, val_id) in enumerate(splits):
    print(f"---------------------------------------- Split {i+1} ---------------------------------------- ")
    train_df, val_df = train.iloc[train_id], train.iloc[val_id]
    train_y, val_y = torch.tensor(y.iloc[train_id].values, dtype=torch.float32), torch.tensor(y.iloc[val_id].values, dtype=torch.float32)

    train_x = nnet_prep.fit_transform(train_df)
    val_x = nnet_prep.transform(val_df)
    
    deepnet = nn.Sequential(
            nn.Linear(nnet_out.shape[1], 25),
            nn.ReLU(),
            nn.Linear(25, 10),
            nn.ReLU(),
            nn.Linear(10, 1),
        )
    optimizer = torch.optim.Adam(deepnet.parameters(), lr=0.1, maximize=False)

    # train_x, train_y = nnet_out[train_id], y_tensor[train_id]
    # val_x, val_y = nnet_out[val_id], y_tensor[val_id]
    
    # mse_loss = nn.MSELoss()
    loss = nn.MSELoss()
    deepnet.train()
    for epoch in range(5000):
        # for x_val, y_val in train_dl:
        y_hat = deepnet(train_x).squeeze()
        train_loss = loss(train_y, y_hat)
        train_loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        with torch.no_grad():
            val_hat = deepnet(val_x).squeeze()
            # val_loss = mse_loss(val_y, val_hat)
            # train_mse = mse_loss(train_y, y_hat)
            r2_loss = R2Score()
            # train_r2 = r2_loss(train_y, y_hat)
            val_r2 = r2_loss(val_y, val_hat)
        if (epoch + 1) % 100 == 0:
            # print(f"Epoch {epoch+1} -- Train MSE: {train_loss.item()/train_x.shape[0]} -- Val MSE: {val_loss.item()/val_x.shape[0]} -- Train R2: {train_r2.item():0.3f} -- Val R2: {val_r2.item():0.3f}")
            print(f"Epoch {epoch+1} -- Train MSE: {train_loss.item():0.3f} -- Val R2: {val_r2.item():0.3f}")
            # print(mse / train_x.shape[0], val_loss / val_x.shape[0], val_r2)

---------------------------------------- Split 1 ---------------------------------------- 
Epoch 100 -- Train MSE: 0.343 -- Val R2: 0.818
Epoch 200 -- Train MSE: 0.204 -- Val R2: 0.827
Epoch 300 -- Train MSE: 0.198 -- Val R2: 0.817
Epoch 400 -- Train MSE: 0.195 -- Val R2: 0.831
Epoch 500 -- Train MSE: 0.186 -- Val R2: 0.811
Epoch 600 -- Train MSE: 0.188 -- Val R2: 0.808
Epoch 700 -- Train MSE: 0.173 -- Val R2: 0.826
Epoch 800 -- Train MSE: 0.175 -- Val R2: 0.813
Epoch 900 -- Train MSE: 0.173 -- Val R2: 0.821
Epoch 1000 -- Train MSE: 0.179 -- Val R2: 0.808
Epoch 1100 -- Train MSE: 0.164 -- Val R2: 0.826
Epoch 1200 -- Train MSE: 0.183 -- Val R2: 0.823
Epoch 1300 -- Train MSE: 0.161 -- Val R2: 0.822
Epoch 1400 -- Train MSE: 0.172 -- Val R2: 0.810
Epoch 1500 -- Train MSE: 0.180 -- Val R2: 0.810
Epoch 1600 -- Train MSE: 0.160 -- Val R2: 0.825
Epoch 1700 -- Train MSE: 0.164 -- Val R2: 0.825
Epoch 1800 -- Train MSE: 0.159 -- Val R2: 0.826
Epoch 1900 -- Train MSE: 0.169 -- Val R2: 0.824
Epoch 

KeyboardInterrupt: 

In [171]:
# train_mse = 0.0

# Full Training
train_x = nnet_prep.fit_transform(train)
train_y = y_tensor
    
deepnet = nn.Sequential(
        nn.Linear(nnet_out.shape[1], 24),
        nn.ReLU(),
        nn.Linear(24, 12),
        nn.ReLU(),
        nn.Linear(12, 6),
        nn.ReLU(),
        nn.Linear(6, 3),
        nn.ReLU(),
        nn.Linear(3, 1),
    )
optimizer = torch.optim.Adam(deepnet.parameters(), lr=0.01, maximize=True)

# deepnet.train()
loss = R2Score()
for epoch in range(40000):
    # for x_val, y_val in train_dl:
    y_hat = deepnet(train_x).squeeze()
    train_loss = loss(train_y, y_hat)
    train_loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    if (epoch+1) % 500 == 0:
        print(f"Epoch {epoch+1} -- Train R2: {train_loss.item():0.3f}")

Epoch 500 -- Train R2: -4.476
Epoch 1000 -- Train R2: -2.167
Epoch 1500 -- Train R2: -1.426
Epoch 2000 -- Train R2: -1.053
Epoch 2500 -- Train R2: -0.825
Epoch 3000 -- Train R2: -0.664
Epoch 3500 -- Train R2: -0.546
Epoch 4000 -- Train R2: -0.445
Epoch 4500 -- Train R2: -0.353
Epoch 5000 -- Train R2: -0.257
Epoch 5500 -- Train R2: -0.108
Epoch 6000 -- Train R2: 0.143
Epoch 6500 -- Train R2: 0.296
Epoch 7000 -- Train R2: 0.388
Epoch 7500 -- Train R2: 0.528
Epoch 8000 -- Train R2: 0.651
Epoch 8500 -- Train R2: 0.741
Epoch 9000 -- Train R2: 0.794
Epoch 9500 -- Train R2: 0.825
Epoch 10000 -- Train R2: 0.845
Epoch 10500 -- Train R2: 0.858
Epoch 11000 -- Train R2: 0.868
Epoch 11500 -- Train R2: 0.875
Epoch 12000 -- Train R2: 0.880
Epoch 12500 -- Train R2: 0.883
Epoch 13000 -- Train R2: 0.886
Epoch 13500 -- Train R2: 0.888
Epoch 14000 -- Train R2: 0.890
Epoch 14500 -- Train R2: 0.891
Epoch 15000 -- Train R2: 0.892
Epoch 15500 -- Train R2: 0.893
Epoch 16000 -- Train R2: 0.893
Epoch 16500 -- Tr

In [172]:
test_out = nnet_prep.transform(test)
test["y"] = deepnet.forward(test_out).squeeze().detach().cpu().numpy()
test["y"].to_csv("./preds.csv")

test["y"]

SEQN
492834    0.091143
309349    1.797333
468308    2.242740
838812   -0.085628
947936    2.875431
            ...   
971604    2.097824
2790      1.671832
159210   -0.081907
366040   -1.044782
901742   -0.484564
Name: y, Length: 4000, dtype: float32