In [114]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import cross_validate, StratifiedGroupKFold
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.preprocessing import SplineTransformer, StandardScaler, OneHotEncoder
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


from bayes_opt import BayesianOptimization

from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor, kernels
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor

from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
import numpy as np

In [115]:
# Load the training table keyed by SEQN, then separate the target column
# "y" from the remaining feature columns.
train = pd.read_csv("../data/train.csv", index_col="SEQN")
train_y = train["y"]
train_x = train.drop(columns="y")

# encoder = ColumnTransformer(
#     [
#         ("one_hot", OneHotEncoder(), ["district"]),
#     ],
#     remainder="passthrough",
# )

# encoded_x = encoder.fit_transform(train_x)

In [116]:
def plot_dendrogram(model, **kwargs):
    """Draw the merge tree of a fitted hierarchical clustering model.

    A SciPy-style linkage matrix is assembled from the model's merge
    history and handed to ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``children_``, ``distances_`` and
        ``labels_`` (e.g. ``AgglomerativeClustering`` fitted with
        ``compute_distances=True`` or with a ``distance_threshold``).
    **kwargs
        Forwarded unchanged to ``dendrogram`` (``truncate_mode``,
        ``no_plot``, ``ax``, ...).

    Returns
    -------
    dict
        Whatever ``dendrogram`` returns, so the layout stays available to
        callers that pass ``no_plot=True``.

    Raises
    ------
    AttributeError
        If ``model`` has no ``distances_`` attribute.
    """
    if not hasattr(model, "distances_"):
        # Same exception type the bare attribute access would raise, but
        # with a message that says how to fix the model.
        raise AttributeError(
            "model has no `distances_`; fit AgglomerativeClustering with "
            "compute_distances=True or a distance_threshold before plotting"
        )

    n_samples = len(model.labels_)

    # counts[i] = number of original samples underneath the i-th merge.
    # Child ids below n_samples are leaves; larger ids refer to the earlier
    # merge number (id - n_samples), whose count is already filled in.
    counts = np.zeros(model.children_.shape[0])
    for i, merge in enumerate(model.children_):
        counts[i] = sum(
            1 if child_idx < n_samples else counts[child_idx - n_samples]
            for child_idx in merge
        )

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot (or, with no_plot=True, just compute) the dendrogram.
    return dendrogram(linkage_matrix, **kwargs)


# Display the training frame (features plus the target column "y").
train

Unnamed: 0_level_0,self_eval,teacher_eval,extracurricular,district,SRP_1,SRP_2,SRP_3,SRP_4,SRP_5,SRP_6,...,SRP_42,SRP_43,SRP_44,SRP_45,SRP_46,SRP_47,SRP_48,SRP_49,SRP_50,y
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
969167,4,5,9,3,-0.181,-0.379,-0.164,0.080,0.378,1.581,...,-1.156,-0.730,-0.508,-0.497,0.224,0.412,-0.517,0.099,0.114,-1.315
188942,4,3,5,4,-0.126,1.603,1.021,0.489,-1.404,-0.955,...,-0.318,1.240,-1.993,2.021,-1.078,-0.277,0.802,0.253,-0.720,1.997
134058,1,2,8,5,0.724,-0.702,2.249,0.910,0.330,0.411,...,0.449,1.980,-0.401,-0.544,-0.944,1.592,0.875,-0.734,-2.336,3.709
124022,3,3,10,6,0.706,-0.302,1.023,-0.895,0.625,1.283,...,2.025,-2.289,-0.407,0.025,-0.515,0.408,1.380,-1.075,-2.451,1.155
685285,5,5,1,5,-0.350,-1.001,0.931,0.192,0.491,0.292,...,-0.118,-0.288,0.457,-0.566,0.822,-0.317,0.661,2.096,0.004,-1.960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970998,2,1,2,1,1.035,1.359,1.558,-1.530,-0.220,0.075,...,2.717,-1.184,-0.300,0.444,0.024,-0.745,-0.890,-1.192,-1.083,-0.139
971286,5,3,1,2,-0.236,-0.723,-1.624,-1.306,0.783,-0.105,...,1.417,-0.061,-0.123,-1.063,-0.128,2.908,-0.057,-0.303,-0.150,0.394
852862,4,3,2,5,0.233,-1.349,-0.876,-0.544,0.400,0.381,...,0.183,-0.480,-2.044,0.084,0.504,1.913,-0.471,0.454,0.691,0.597
138992,5,5,5,7,-1.041,2.261,1.334,0.562,0.767,1.965,...,0.346,-0.420,0.777,-0.739,0.999,0.876,-0.744,2.210,-0.421,1.408


In [117]:
# Scorer handed to cross_validate: R^2 between true and predicted target.
scorer = make_scorer(r2_score)

# Branch applied to the SRP_1..SRP_50 columns. Despite the name, only
# standardisation is active; the PCA step is left commented out so it can
# be switched back on for experiments.
pca_pipeline = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        # ("scrp_pca", PCA(n_components=10)),
    ]
)

# Standardise the three evaluation/activity columns and the SRP block.
# remainder="drop" discards every other column from this branch.
numeric_preprocessor = ColumnTransformer(
    [
        ("scale", StandardScaler(), ["self_eval", "teacher_eval", "extracurricular"]),
        ("pca", pca_pipeline, [f"SRP_{j}" for j in range(1, 51)]),
    ],
    remainder="drop",
)

# Numeric branch of the feature union. The spline expansion that gives the
# pipeline its name is currently disabled.
splines = Pipeline(
    [
        ("num_preprocess", numeric_preprocessor),
        # ("spline", SplineTransformer()),
    ]
)

# One-hot encode `district`. handle_unknown="ignore" encodes a district
# that was not seen during fit as an all-zero row instead of raising at
# transform time, which would otherwise break any CV fold whose training
# split happens to miss a district.
one_hot_encoder = ColumnTransformer(
    [
        (
            "one_hot",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            ["district"],
        ),
    ]
)

# Final feature matrix: numeric branch and district indicators side by side.
preprocessor = FeatureUnion(
    [
        ("splines", splines),
        ("one_hot", one_hot_encoder),
    ]
)

# Smoke-test the feature pipeline on the full frame and check that it keeps
# one output row per input row. This still fits `preprocessor` in place;
# cross_validate below fits clones of the pipeline, so this fit does not
# influence the reported scores.
assert preprocessor.fit_transform(train).shape[0] == len(train)

# Seed passed to both models' random_state so that re-running this cell
# reproduces the same scores.
SEED = 42

# One cross_validate result dict per model, in the order listed here.
cv_results = []
for model in [
    # LinearRegression(),
    RandomForestRegressor(n_estimators=512, n_jobs=8, random_state=SEED),
    HistGradientBoostingRegressor(max_iter=500, random_state=SEED),
]:
    model_pipeline = Pipeline(
        [("preprocess", preprocessor), ("model", model)]
    )
    cv_results.append(
        cross_validate(model_pipeline, X=train_x, y=train_y, scoring=scorer)
    )

In [118]:
# Show the raw cross_validate result dicts (fit_time, score_time,
# test_score), one per model in the order they were appended to the list.
cv_results

[{'fit_time': array([18.93142796, 19.00621486, 18.89324188, 19.01053214, 18.90272021]),
  'score_time': array([0.03334212, 0.03504109, 0.03177905, 0.03200364, 0.0309329 ]),
  'test_score': array([0.59160972, 0.6130543 , 0.60183161, 0.580613  , 0.62486523])},
 {'fit_time': array([1.93806505, 1.80181813, 1.85482788, 1.82628989, 1.82039881]),
  'score_time': array([0.01448822, 0.01268768, 0.01363897, 0.01294112, 0.01388693]),
  'test_score': array([0.77295202, 0.7945964 , 0.79216024, 0.76630222, 0.79167281])}]