In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, ExpSineSquared
from sklearn.model_selection import GridSearchCV
from supporting import remove_outliers_y, assign_val_data_to_clusters_with_labels,assign_test_data_to_clusters_with_labels, tune_ridge_regression, plot_clusters_features_vs_target, divide_data_by_kmeans_distance, remove_outliers, standardize_data, create_polynomial_features, apply_pca, rmse, tune_kernel_ridge_regression, tune_gaussian_process_regression

In [32]:
# importing the data
training_data = np.genfromtxt("training_data.csv", delimiter=',')
X = training_data[:,:-1]
y = training_data[:,-1]

X, y = remove_outliers_y(X, y, 50)

1


In [41]:
#####################################################
# USE THIS CELL TO TRAIN AND CREATE YOUR BEST MODEL # 
#####################################################

# Data Preprocessing

# based on our previous analysis and the random_state we set for KMeans clustering we know that:
# cluster 0 should be solved using Averaging (mean of features 2 > 61)
# cluster 1 should be solved using Bayesian Ridge (mean of features 2 <= 61)

def BayesianTrain(X_train, y_train):
    model = BayesianRidge()
    return model.fit(X_train, y_train)

def AveragingTrain(X_train, y_train):
    # Create the KNN Regressor model
    model_knr =  KNeighborsRegressor(n_neighbors=10, metric = 'manhattan')
    model_g = GaussianProcessRegressor(alpha=0.1, kernel=RationalQuadratic(alpha=0.1, length_scale=50), normalize_y= True)

    scaler = StandardScaler()

    # Fit the scaler on the training data and transform both the training and testing data
    X_train_knr = scaler.fit_transform(X_train)

    pca_transformer = PCA(n_components=0.97)
    # Fit the PCA transformer on the training data and transform both the training and testing data
    X_train_knr = pca_transformer.fit_transform(X_train_knr)

    poly_transformer = PolynomialFeatures(degree=3, include_bias=False)

    # Fit the transformer on the training data and transform both the training and testing data
    X_train_g = poly_transformer.fit_transform(X_train)

    # Fit the model to the training data
    return model_knr.fit(X_train_knr, y_train), model_g.fit(X_train_g, y_train), scaler, pca_transformer, poly_transformer

def training_model(X, y, random_state=40):
    # Divide the data using KMeans
    (X_train_clusters, y_train_clusters), clusters, labels = divide_data_by_kmeans_distance(
        X, y, n_clusters=2, random_state=random_state
    )

    # Calculate the mean of the second feature for each cluster
    cluster_0_mean = clusters[0][1]
    cluster_1_mean = clusters[1][1]

    # This condition was only inserted to make sure that there was no issue in labels' assignment
    # Ensure that cluster 1 has a mean of the second feature below 61 and cluster 2 above 61
    if cluster_0_mean < 61 and cluster_1_mean > 61:
        print('Correct')
        X1_train, y1_train = X_train_clusters[0], y_train_clusters[0]
        X2_train, y2_train = X_train_clusters[1], y_train_clusters[1]
    elif cluster_1_mean < 61 and cluster_0_mean > 61:
        print('Had to switch')
        X1_train, y1_train = X_train_clusters[1], y_train_clusters[1]
        X2_train, y2_train = X_train_clusters[0], y_train_clusters[0]
        labels = [1, 0]
    else:
        raise ValueError("The clusters do not meet the expected conditions for the second feature.")

    # Train the models
    best_model_knr1, best_model_g1, scaler, pca_transformer, poly_transformer = AveragingTrain(X1_train, y1_train)
    best_model2 = BayesianTrain(X2_train, y2_train)

    def predict(X_test):
        
        # Get the clustered test data and their original indices
        X1_test, X2_test, idx1, idx2 = assign_test_data_to_clusters_with_labels(X_test, clusters, labels)

        X1_test_knr = scaler.transform(X1_test)

        X1_test_knr = pca_transformer.transform(X1_test_knr)

        # Fit the transformer on the training data and transform both the training and testing data
        X1_test_g = poly_transformer.transform(X1_test)

        # Handle case where no data is assigned to one of the clusters
        if X2_test.shape[0] == 0:
            # If no data is assigned to X2_test, treat the entire set as belonging to cluster 1 (KNR and GPR)
            X2_test = X1_test
            idx2 = idx1  # Update indices to ensure consistency
        elif X1_test.shape[0] == 0:
            # If no data is assigned to X1_test, treat the entire set as belonging to cluster 2 (Bayesian Ridge)
            X1_test = X2_test
            idx1 = idx2  # Update indices to ensure consistency

        # Make predictions for each cluster
        y2_pred = best_model2.predict(X2_test)  # Predictions for Bayesian Ridge
        y1_pred_g = best_model_g1.predict(X1_test_g)  # Predictions for Gaussian Process
        y1_pred_knr = best_model_knr1.predict(X1_test_knr)  # Predictions for KNN Regressor

        # Calculate inverse variances and handle case where variance is zero
        inv_var_knr = 1 / np.var(y1_pred_knr) if np.var(y1_pred_knr) != 0 else 0
        inv_var_g = 1 / np.var(y1_pred_g) if np.var(y1_pred_g) != 0 else 0

        total_variance = inv_var_knr + inv_var_g if inv_var_knr + inv_var_g != 0 else 1  # Avoid division by zero

        w_knr = inv_var_knr / total_variance
        w_g = inv_var_g / total_variance

        # Initialize a full array for y predictions
        y_pred_full = np.empty(X_test.shape[0])

        # Place predictions back in their original order
        if X1_test.shape[0] > 0:
            y_pred_full[idx1] = y1_pred_knr * w_knr + y1_pred_g * w_g
        if X2_test.shape[0] > 0:
            y_pred_full[idx2] = y2_pred

        return y_pred_full, y1_pred_knr, y2_pred, idx1, idx2


    return predict

In [42]:
# Loop to evaluate RMSE over multiple splits

from sklearn.metrics import r2_score

sum_test = 0
sum_train = 0

sum_r2 = 0
sum_r2_train = 0

for i in range(0, 41):
    # Split the data
    x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=i)
    filtered_indices = y_test < 61  # Create a boolean mask for the condition

    # Apply the mask to both x_test and y_test
    x_test_filtered = x_test[filtered_indices]  # Filter x_test
    y_test_filtered = y_test[filtered_indices]

    # Train and predict
    best_model = training_model(x_train, y_train)
    y_pred_full, y1_pred_knr, y2_pred, idx1, idx2 = best_model(x_test)

    y_pred_train, y1_pred_train, y2_pred_train, idx1_train, idx2_train = best_model(x_train)

    # Calculate RMSE for full test set, cluster 1, and cluster 2
    rmse_full = rmse(y_test, y_pred_full)
    rmse_cluster1 = rmse(y_test[idx1], y1_pred_knr)
    rmse_cluster2 = rmse(y_test[idx2], y2_pred)

    rmse_full_train = rmse(y_train, y_pred_train)
    rmse_cluster1_train = rmse(y_train[idx1_train], y1_pred_train)
    rmse_cluster2_train = rmse(y_train[idx2_train], y2_pred_train)

    # Print RMSE for each iteration
    print(f"Iteration {i+1} - Full RMSE: {rmse_full}, Cluster 1 RMSE: {rmse_cluster1}, Cluster 2 RMSE: {rmse_cluster2}")
    print(f"Iteration {i+1} - Train RMSE: {rmse_full_train}, Cluster 1 RMSE: {rmse_cluster1_train}, Cluster 2 RMSE: {rmse_cluster2_train}")
    print(r2_score(y_test, y_pred_full))
    print(r2_score(y_train, y_pred_train))

    # Accumulate RMSE for average
    sum_test += rmse_full
    sum_train += rmse_full_train
    sum_r2 += r2_score(y_test, y_pred_full)
    sum_r2_train += r2_score(y_train, y_pred_train)

# Print average RMSE
print(f"Average RMSE over 40 splits: {sum_test / 40}")
print(f"Average RMSE over 40 splits: {sum_train / 40}")
print(f"Average RMSE over 40 splits: {sum_r2 / 40}")
print(f"Average RMSE over 40 splits: {sum_r2_train / 40}")


Correct




Iteration 1 - Full RMSE: 2.824229643448717, Cluster 1 RMSE: 3.9495945858016763, Cluster 2 RMSE: 7.241879337284717e-10
Iteration 1 - Train RMSE: 2.5825123124811165, Cluster 1 RMSE: 4.096381198291937, Cluster 2 RMSE: 7.516078894413401e-10
0.6829116150518442
0.7413194341251914
Correct




Iteration 2 - Full RMSE: 1.46135696332039, Cluster 1 RMSE: 2.189875372981512, Cluster 2 RMSE: 1.1373972548065722e-09
Iteration 2 - Train RMSE: 2.665667469981313, Cluster 1 RMSE: 4.2305309156975355, Cluster 2 RMSE: 8.710311690444489e-10
0.8155017102441486
0.765350490992157
Correct




Iteration 3 - Full RMSE: 2.9853980285291337, Cluster 1 RMSE: 3.9958735630744693, Cluster 2 RMSE: 1.063500167242289e-09
Iteration 3 - Train RMSE: 2.3827513368577637, Cluster 1 RMSE: 3.905139559132285, Cluster 2 RMSE: 7.868663293761112e-10
0.5657610384975836
0.7922623588713554
Had to switch




Iteration 4 - Full RMSE: 1.9934127689417112, Cluster 1 RMSE: 2.9931950450281546, Cluster 2 RMSE: 1.102433106809873e-09
Iteration 4 - Train RMSE: 2.7837663341901453, Cluster 1 RMSE: 4.32147574301706, Cluster 2 RMSE: 8.748078026134234e-10
0.7560342941738248
0.7301700194533417
Had to switch




Iteration 5 - Full RMSE: 4.051027736429689, Cluster 1 RMSE: 5.391459993708171, Cluster 2 RMSE: 8.651808799913307e-10
Iteration 5 - Train RMSE: 2.266056367717923, Cluster 1 RMSE: 3.630872360554677, Cluster 2 RMSE: 7.996087978542498e-10
0.5346409338862965
0.7708351355016583
Correct




Iteration 6 - Full RMSE: 2.295742170605521, Cluster 1 RMSE: 3.05864163992301, Cluster 2 RMSE: 9.218311601950641e-10
Iteration 6 - Train RMSE: 2.474206758920042, Cluster 1 RMSE: 4.087689538378987, Cluster 2 RMSE: 7.566101351059524e-10
0.6747466388146176
0.786771578243767
Correct




Iteration 7 - Full RMSE: 3.645701333621, Cluster 1 RMSE: 5.175998483141025, Cluster 2 RMSE: 8.15898800561053e-10
Iteration 7 - Train RMSE: 2.5239779610804907, Cluster 1 RMSE: 3.9639691642030868, Cluster 2 RMSE: 8.361598101296928e-10
0.584832914700134
0.7283899846730945
Had to switch




Iteration 8 - Full RMSE: 1.8975612702146305, Cluster 1 RMSE: 2.67127524264127, Cluster 2 RMSE: 7.600403661967822e-10
Iteration 8 - Train RMSE: 2.5838821494374935, Cluster 1 RMSE: 4.183039390863106, Cluster 2 RMSE: 7.698148556763022e-10
0.7896560335625553
0.7654256865458222
Had to switch




Iteration 9 - Full RMSE: 3.0562187019636564, Cluster 1 RMSE: 4.157560986916991, Cluster 2 RMSE: 8.299114578958035e-10
Iteration 9 - Train RMSE: 2.4389975039823284, Cluster 1 RMSE: 4.007955432572979, Cluster 2 RMSE: 7.506279667270186e-10
0.6181119118561001
0.7711978733383182
Correct




Iteration 10 - Full RMSE: 1.6350948111366386, Cluster 1 RMSE: 2.3568929481222787, Cluster 2 RMSE: 1.0015721924794647e-09
Iteration 10 - Train RMSE: 2.6252883047068956, Cluster 1 RMSE: 4.167406399763741, Cluster 2 RMSE: 8.562810166938125e-10
0.7967906592171373
0.7680632153734942
Had to switch




Iteration 11 - Full RMSE: 3.8119975094318046, Cluster 1 RMSE: 5.2778694813108125, Cluster 2 RMSE: 1.2387928511999124e-09
Iteration 11 - Train RMSE: 2.269124448469503, Cluster 1 RMSE: 3.669644609121687, Cluster 2 RMSE: 9.192343784220265e-10
0.5460387219550145
0.7805981436028255
Had to switch




Iteration 12 - Full RMSE: 2.879677797770808, Cluster 1 RMSE: 3.8693928360657344, Cluster 2 RMSE: 8.130422730295024e-10
Iteration 12 - Train RMSE: 2.3576404305164655, Cluster 1 RMSE: 3.9188398660146673, Cluster 2 RMSE: 7.518304764059747e-10
0.5211107520337773
0.804114399110279
Correct




Iteration 13 - Full RMSE: 2.4776600991116573, Cluster 1 RMSE: 3.5570946839395057, Cluster 2 RMSE: 1.0952038573591032e-09
Iteration 13 - Train RMSE: 2.51236108900905, Cluster 1 RMSE: 3.9944084709209813, Cluster 2 RMSE: 8.580087553303854e-10
0.6688469819016447
0.7744945598604149
Correct




Iteration 14 - Full RMSE: 3.7737193722655, Cluster 1 RMSE: 5.4254536320577165, Cluster 2 RMSE: 8.758821086209963e-10
Iteration 14 - Train RMSE: 2.3861486276576085, Cluster 1 RMSE: 3.757656793742059, Cluster 2 RMSE: 8.513454155206636e-10
0.5640775905032157
0.7544438815070645
Correct




Iteration 15 - Full RMSE: 2.4948043484445304, Cluster 1 RMSE: 3.461478571096337, Cluster 2 RMSE: 9.348717785763981e-10
Iteration 15 - Train RMSE: 2.4785716831150317, Cluster 1 RMSE: 4.071022879683643, Cluster 2 RMSE: 7.782013925056108e-10
0.6196316170260241
0.7860531851343713
Correct




Iteration 16 - Full RMSE: 1.3237283279932426, Cluster 1 RMSE: 1.7958979987425323, Cluster 2 RMSE: 7.373707154056069e-10
Iteration 16 - Train RMSE: 2.4720632094362447, Cluster 1 RMSE: 4.0924112246813715, Cluster 2 RMSE: 7.83318332791218e-10
0.7158232962951232
0.8079292822401358
Had to switch




Iteration 17 - Full RMSE: 5.20179351975563, Cluster 1 RMSE: 6.934838553488489, Cluster 2 RMSE: 1.112587081955494e-09
Iteration 17 - Train RMSE: 2.1719639215510207, Cluster 1 RMSE: 3.412664477153698, Cluster 2 RMSE: 8.238762329046133e-10
0.4537950994951391
0.7312698600506901
Correct




Iteration 18 - Full RMSE: 4.247873070951267, Cluster 1 RMSE: 5.954007258825671, Cluster 2 RMSE: 8.663508054497434e-10
Iteration 18 - Train RMSE: 2.0932246187858006, Cluster 1 RMSE: 3.4223328884518724, Cluster 2 RMSE: 8.34044487950501e-10
0.46330367595860267
0.8085238469105192
Had to switch




Iteration 19 - Full RMSE: 4.112056190582578, Cluster 1 RMSE: 5.593816478606965, Cluster 2 RMSE: 8.732963729491521e-10
Iteration 19 - Train RMSE: 2.2866769142485346, Cluster 1 RMSE: 3.668976614509924, Cluster 2 RMSE: 7.935550084549498e-10
0.5440827528545732
0.7564329897900981
Correct




Iteration 20 - Full RMSE: 3.167272650414763, Cluster 1 RMSE: 4.3871392622522976, Cluster 2 RMSE: 9.908266907762564e-10
Iteration 20 - Train RMSE: 2.265059104713941, Cluster 1 RMSE: 3.7337654320415505, Cluster 2 RMSE: 8.502845620962383e-10
0.46473377884622646
0.8161755381761214
Correct




Iteration 21 - Full RMSE: 2.503081502611301, Cluster 1 RMSE: 3.686984828325743, Cluster 2 RMSE: 9.274477107783919e-10
Iteration 21 - Train RMSE: 2.6107035455597827, Cluster 1 RMSE: 4.144732205999569, Cluster 2 RMSE: 8.540715100968373e-10
0.6727259604234146
0.7545156429545573
Correct




Iteration 22 - Full RMSE: 3.4435287693541987, Cluster 1 RMSE: 4.903681014322905, Cluster 2 RMSE: 9.439018167035406e-10
Iteration 22 - Train RMSE: 2.4785026805167902, Cluster 1 RMSE: 3.8900732094109283, Cluster 2 RMSE: 8.384007516644955e-10
0.6067677321305258
0.744742748506199
Had to switch




Iteration 23 - Full RMSE: 4.28662659042166, Cluster 1 RMSE: 6.062015042340485, Cluster 2 RMSE: 9.168859786755287e-10
Iteration 23 - Train RMSE: 2.1966286383106794, Cluster 1 RMSE: 3.5201227842547396, Cluster 2 RMSE: 8.429074912631884e-10
0.5253909247489336
0.772990694357915
Correct




Iteration 24 - Full RMSE: 3.2221308831120483, Cluster 1 RMSE: 4.640856282574711, Cluster 2 RMSE: 1.0997650907201115e-09
Iteration 24 - Train RMSE: 2.319039048771455, Cluster 1 RMSE: 3.752748970264821, Cluster 2 RMSE: 8.97260354552926e-10
0.5360980012939531
0.7978247465007975
Had to switch




Iteration 25 - Full RMSE: 3.808376162645529, Cluster 1 RMSE: 5.161142555755718, Cluster 2 RMSE: 8.232746398928787e-10
Iteration 25 - Train RMSE: 2.238396960926423, Cluster 1 RMSE: 3.689891157316476, Cluster 2 RMSE: 7.867772318517001e-10
0.4960910364894914
0.7961091221414638
Had to switch




Iteration 26 - Full RMSE: 3.4245005224611864, Cluster 1 RMSE: 4.7552425009182535, Cluster 2 RMSE: 9.83236453499659e-10
Iteration 26 - Train RMSE: 2.3994735433739844, Cluster 1 RMSE: 3.8997404827275033, Cluster 2 RMSE: 8.056169114433015e-10
0.5790564724560181
0.7684822356285452
Had to switch




Iteration 27 - Full RMSE: 3.905578464997617, Cluster 1 RMSE: 5.256139404076409, Cluster 2 RMSE: 7.976921077267061e-10
Iteration 27 - Train RMSE: 2.3187903639784104, Cluster 1 RMSE: 3.7756370858657626, Cluster 2 RMSE: 7.664954999522977e-10
0.5628820422956669
0.7613839382813674
Correct




Iteration 28 - Full RMSE: 3.928497705448101, Cluster 1 RMSE: 5.515442669287546, Cluster 2 RMSE: 9.350565020288246e-10
Iteration 28 - Train RMSE: 2.3278905906005396, Cluster 1 RMSE: 3.687681828882231, Cluster 2 RMSE: 8.12005289981797e-10
0.5767283611771863
0.7528258460077765
Had to switch




Iteration 29 - Full RMSE: 3.8559095321899695, Cluster 1 RMSE: 5.079828207575114, Cluster 2 RMSE: 9.385152479294936e-10
Iteration 29 - Train RMSE: 2.3982688428919237, Cluster 1 RMSE: 3.8396414522103215, Cluster 2 RMSE: 7.873168211310871e-10
0.5248230945229646
0.7576365398098742
Correct




Iteration 30 - Full RMSE: 3.729684333133711, Cluster 1 RMSE: 5.25350704823378, Cluster 2 RMSE: 7.367272308772319e-10
Iteration 30 - Train RMSE: 2.470557553754942, Cluster 1 RMSE: 3.914487988045909, Cluster 2 RMSE: 7.639545099968753e-10
0.5546520348938353
0.7419748542143834
Correct




Iteration 31 - Full RMSE: 2.4352487469102138, Cluster 1 RMSE: 3.416022475288058, Cluster 2 RMSE: 9.014173557118143e-10
Iteration 31 - Train RMSE: 2.4600579992466787, Cluster 1 RMSE: 4.010239690382738, Cluster 2 RMSE: 7.926247764194478e-10
0.6754588493411812
0.7845050326684706
Correct




Iteration 32 - Full RMSE: 2.904125950011683, Cluster 1 RMSE: 3.9975954812531134, Cluster 2 RMSE: 8.135712732220303e-10
Iteration 32 - Train RMSE: 2.4605023973262887, Cluster 1 RMSE: 4.0066343595158616, Cluster 2 RMSE: 7.58199000047143e-10
0.6524740019493582
0.7668931779186202
Had to switch




Iteration 33 - Full RMSE: 2.0493816752211083, Cluster 1 RMSE: 2.7136293221226464, Cluster 2 RMSE: 8.733670549711792e-10
Iteration 33 - Train RMSE: 2.588229677790987, Cluster 1 RMSE: 4.2123623218165775, Cluster 2 RMSE: 7.822081838158218e-10
0.7210311795460587
0.7702220358199937
Had to switch




Iteration 34 - Full RMSE: 2.1037088679407643, Cluster 1 RMSE: 3.04371753807367, Cluster 2 RMSE: 1.026749611651137e-09
Iteration 34 - Train RMSE: 2.588350410758209, Cluster 1 RMSE: 4.110333143455295, Cluster 2 RMSE: 8.598489991423973e-10
0.7218398234994692
0.7671373798120478
Had to switch




Iteration 35 - Full RMSE: 2.0983853843715607, Cluster 1 RMSE: 2.925191177845589, Cluster 2 RMSE: 9.655035210990236e-10
Iteration 35 - Train RMSE: 2.5727912123899896, Cluster 1 RMSE: 4.14447271111342, Cluster 2 RMSE: 8.038884508679768e-10
0.7127235914103309
0.7717041943264831
Had to switch




Iteration 36 - Full RMSE: 3.525916131871515, Cluster 1 RMSE: 4.955660532605911, Cluster 2 RMSE: 1.0371813034076786e-09
Iteration 36 - Train RMSE: 2.6786437615371823, Cluster 1 RMSE: 4.126721598419606, Cluster 2 RMSE: 8.265577875466667e-10
0.6065970056006249
0.6951196486356358
Correct




Iteration 37 - Full RMSE: 3.4104877968831766, Cluster 1 RMSE: 4.738364655552984, Cluster 2 RMSE: 9.26998517724913e-10
Iteration 37 - Train RMSE: 2.250901879936712, Cluster 1 RMSE: 3.712959756891759, Cluster 2 RMSE: 8.383815475926224e-10
0.4616228515947556
0.8120890790492605
Correct




Iteration 38 - Full RMSE: 2.2257960576292106, Cluster 1 RMSE: 2.9772063351100675, Cluster 2 RMSE: 7.821245191759174e-10
Iteration 38 - Train RMSE: 2.4436253474077154, Cluster 1 RMSE: 4.023122403144886, Cluster 2 RMSE: 7.916746614221961e-10
0.6254379905190073
0.796874385775268
Had to switch




Iteration 39 - Full RMSE: 1.958862049635708, Cluster 1 RMSE: 2.9803826667569497, Cluster 2 RMSE: 9.556360358470847e-10
Iteration 39 - Train RMSE: 2.743920474468158, Cluster 1 RMSE: 4.282075126782758, Cluster 2 RMSE: 8.359849617057117e-10
0.7723753139889526
0.7361157996051029
Had to switch




Iteration 40 - Full RMSE: 2.227943739755706, Cluster 1 RMSE: 3.180628192249839, Cluster 2 RMSE: 8.946283111640911e-10
Iteration 40 - Train RMSE: 2.522337869353307, Cluster 1 RMSE: 4.074982117856816, Cluster 2 RMSE: 8.212321859015394e-10
0.6559075519835764
0.7831210100112265
Correct




Iteration 41 - Full RMSE: 4.243651743160595, Cluster 1 RMSE: 5.647272784000874, Cluster 2 RMSE: 8.817374044852929e-10
Iteration 41 - Train RMSE: 2.317263775517244, Cluster 1 RMSE: 3.719003536813376, Cluster 2 RMSE: 7.559999331972368e-10
0.5377270840274121
0.7456168288700262
Average RMSE over 40 splits: 3.1156937231174857
Average RMSE over 40 splits: 2.500120428031903
Average RMSE over 40 splits: 0.6289710730191576
Average RMSE over 40 splits: 0.7879426601098933


In [51]:
#####################################################
# USE THIS CELL TO TRAIN AND CREATE YOUR BEST MODEL # 
#####################################################

# Data Preprocessing

# based on our previous analysis and the random_state we set for KMeans clustering we know that:
# cluster 0 should be solved using Averaging (mean of features 2 > 61)
# cluster 1 should be solved using Bayesian Ridge (mean of features 2 <= 61)

def BayesianTrain(X_train, y_train):
    model = BayesianRidge()
    return model.fit(X_train, y_train)

def AveragingTrain(X_train, y_train):
    # Create the KNN Regressor model
    model_knr = KNeighborsRegressor(n_neighbors=10, metric = 'manhattan')
    model_g = GaussianProcessRegressor(alpha=0.1, kernel=RationalQuadratic(alpha=0.1, length_scale=50))

    scaler = StandardScaler()

    # Fit the scaler on the training data and transform both the training and testing data
    X_train = scaler.fit_transform(X_train)

    pca_transformer = PCA(n_components=0.97)
    # Fit the PCA transformer on the training data and transform both the training and testing data
    X_train = pca_transformer.fit_transform(X_train)

    # Fit the model to the training data
    knr = model_knr.fit(X_train, y_train)
    g = model_g.fit(X_train, y_train)

    y_pred_g = g.predict(X_train)  # Predictions for Gaussian Process
    y_pred_knr = knr.predict(X_train)  # Predictions for KNN Regressor

    # Calculate inverse variances and handle case where variance is zero
    inv_var_knr = 1 / np.var(y_pred_knr) if np.var(y_pred_knr) != 0 else 0
    inv_var_g = 1 / np.var(y_pred_g) if np.var(y_pred_g) != 0 else 0

    total_variance = inv_var_knr + inv_var_g if inv_var_knr + inv_var_g != 0 else 1  # Avoid division by zero

    w_knr = inv_var_knr / total_variance
    w_g = inv_var_g / total_variance

    print(w_g, w_knr)

    return knr, g, scaler, pca_transformer, w_knr, w_g

def training_model(X, y, random_state=40):
    # Divide the data using KMeans
    (X_train_clusters, y_train_clusters), clusters, labels = divide_data_by_kmeans_distance(
        X, y, n_clusters=2, random_state=random_state
    )

    # Calculate the mean of the second feature for each cluster
    cluster_0_mean = clusters[0][1]
    cluster_1_mean = clusters[1][1]

    # This condition was only inserted to make sure that there was no issue in labels' assignment
    # Ensure that cluster 1 has a mean of the second feature below 61 and cluster 2 above 61
    if cluster_0_mean < 61 and cluster_1_mean > 61:
        print('Correct')
        X1_train, y1_train = X_train_clusters[0], y_train_clusters[0]
        X2_train, y2_train = X_train_clusters[1], y_train_clusters[1]
    elif cluster_1_mean < 61 and cluster_0_mean > 61:
        print('Had to switch')
        X1_train, y1_train = X_train_clusters[1], y_train_clusters[1]
        X2_train, y2_train = X_train_clusters[0], y_train_clusters[0]
        labels = [1, 0]
    else:
        raise ValueError("The clusters do not meet the expected conditions for the second feature.")

    # Train the models
    best_model_knr1, best_model_g1, scaler, pca_transformer, w_knr, w_g = AveragingTrain(X1_train, y1_train)
    best_model2 = BayesianTrain(X2_train, y2_train)

    def predict(X_test):
        
        # Get the clustered test data and their original indices
        X1_test, X2_test, idx1, idx2 = assign_test_data_to_clusters_with_labels(X_test, clusters, labels)

        X1_test = scaler.transform(X1_test)

        X1_test = pca_transformer.transform(X1_test)

        # Handle case where no data is assigned to one of the clusters
        if X2_test.shape[0] == 0:
            # If no data is assigned to X2_test, treat the entire set as belonging to cluster 1 (KNR and GPR)
            X2_test = X1_test
            idx2 = idx1  # Update indices to ensure consistency
        elif X1_test.shape[0] == 0:
            # If no data is assigned to X1_test, treat the entire set as belonging to cluster 2 (Bayesian Ridge)
            X1_test = X2_test
            idx1 = idx2  # Update indices to ensure consistency

        # Make predictions for each cluster
        y2_pred = best_model2.predict(X2_test)  # Predictions for Bayesian Ridge
        y1_pred_g = best_model_g1.predict(X1_test)  # Predictions for Gaussian Process
        y1_pred_knr = best_model_knr1.predict(X1_test)  # Predictions for KNN Regressor

        # Initialize a full array for y predictions
        y_pred_full = np.empty(X_test.shape[0])

        # Place predictions back in their original order
        if X1_test.shape[0] > 0:
            y_pred_full[idx1] = y1_pred_knr * w_knr + y1_pred_g * w_g
        if X2_test.shape[0] > 0:
            y_pred_full[idx2] = y2_pred

        return y_pred_full, y1_pred_knr, y2_pred, idx1, idx2


    return predict

In [52]:
# Loop to evaluate RMSE over multiple splits

from sklearn.metrics import r2_score

sum_test = 0
sum_train = 0

sum_r2 = 0
sum_r2_train = 0

for i in range(0, 41):
    # Split the data
    x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=i)
    filtered_indices = x_test[:, 1] < 61  # Create a boolean mask for the condition

    # Apply the mask to both x_test and y_test
    x_test_filtered = x_test[filtered_indices]  # Filter x_test
    y_test_filtered = y_test[filtered_indices]

    # Train and predict
    best_model = training_model(x_train, y_train)
    y_pred_full, y1_pred_knr, y2_pred, idx1, idx2 = best_model(x_test)

    y_pred_train, y1_pred_train, y2_pred_train, idx1_train, idx2_train = best_model(x_train)

    # Calculate RMSE for full test set, cluster 1, and cluster 2
    rmse_full = rmse(y_test, y_pred_full)
    rmse_cluster1 = rmse(y_test[idx1], y1_pred_knr)
    rmse_cluster2 = rmse(y_test[idx2], y2_pred)

    rmse_full_train = rmse(y_train, y_pred_train)
    rmse_cluster1_train = rmse(y_train[idx1_train], y1_pred_train)
    rmse_cluster2_train = rmse(y_train[idx2_train], y2_pred_train)

    # Print RMSE for each iteration
    print(f"Iteration {i+1} - Full RMSE: {rmse_full}, Cluster 1 RMSE: {rmse_cluster1}, Cluster 2 RMSE: {rmse_cluster2}")
    print(f"Iteration {i+1} - Train RMSE: {rmse_full_train}, Cluster 1 RMSE: {rmse_cluster1_train}, Cluster 2 RMSE: {rmse_cluster2_train}")
    print(r2_score(y_test, y_pred_full))
    print(r2_score(y_train, y_pred_train))

    # Accumulate RMSE for average
    sum_test += rmse_full
    sum_train += rmse_full_train
    sum_r2 += r2_score(y_test, y_pred_full)
    sum_r2_train += r2_score(y_train, y_pred_train)

# Print average RMSE
print(f"Average RMSE over 40 splits: {sum_test / 40}")
print(f"Average RMSE over 40 splits: {sum_train / 40}")
print(f"Average RMSE over 40 splits: {sum_r2 / 40}")
print(f"Average RMSE over 40 splits: {sum_r2_train / 40}")


Correct
0.19630388555068415 0.8036961144493158
Iteration 1 - Full RMSE: 2.773535884238292, Cluster 1 RMSE: 3.9495945858016763, Cluster 2 RMSE: 7.241879337284717e-10
Iteration 1 - Train RMSE: 2.5792429151853296, Cluster 1 RMSE: 4.096381198291937, Cluster 2 RMSE: 7.516078894413401e-10
0.694192664963088
0.7419739860528025
Correct
0.20891410476674285 0.7910858952332572
Iteration 2 - Full RMSE: 1.527191153238027, Cluster 1 RMSE: 2.189875372981512, Cluster 2 RMSE: 1.1373972548065722e-09
Iteration 2 - Train RMSE: 2.658090893061991, Cluster 1 RMSE: 4.2305309156975355, Cluster 2 RMSE: 8.710311690444489e-10
0.7985039591019139
0.7666824752049383
Correct
0.22145006491205954 0.7785499350879405
Iteration 3 - Full RMSE: 3.0814595786368653, Cluster 1 RMSE: 3.9958735630744693, Cluster 2 RMSE: 1.063500167242289e-09
Iteration 3 - Train RMSE: 2.372705055909891, Cluster 1 RMSE: 3.905139559132285, Cluster 2 RMSE: 7.868663293761112e-10
0.5373663115790639
0.7940104146036231
Had to switch
0.18667180348846402 0