In [74]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, RBF, Matern, RationalQuadratic, WhiteKernel
from sklearn.gaussian_process.kernels import ExpSineSquared
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer

In [None]:
def _dense_one_hot_encoder():
    """Build a OneHotEncoder that returns dense arrays on old and new scikit-learn."""
    try:
        # scikit-learn >= 1.2 names the option `sparse_output`.
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older releases only accept the original `sparse` keyword.
        return OneHotEncoder(sparse=False)


def data_loading_enc(train_path="train.csv", test_path="test.csv", expected_test_rows=100):
    """Load, encode, scale and impute the train/test data.

    Steps, in order:
      1. Read both CSV files and split ``price_CHF`` off the training frame.
      2. One-hot encode ``season`` (encoder fitted on the training data only).
      3. Standardise the remaining columns (scaler fitted on the training data only).
      4. Impute missing feature values with an ``IterativeImputer`` fitted on the
         training features and reused for the test features.
      5. Impute missing ``price_CHF`` targets with a 2-nearest-neighbour
         ``KNNImputer`` run on the imputed features plus the target column.

    Parameters
    ----------
    train_path : str, default "train.csv"
        CSV file with the training data; must contain ``season`` and ``price_CHF``.
    test_path : str, default "test.csv"
        CSV file with the test data; must contain ``season``.
    expected_test_rows : int or None, default 100
        Number of rows the test set must have. Pass ``None`` to skip this check.

    Returns
    -------
    X_train : numpy.ndarray
        Imputed training features.
    y_train : numpy.ndarray
        Imputed training targets.
    X_test : pandas.DataFrame
        Imputed test features, with string column names (one-hot season columns
        followed by the original feature names).

    Raises
    ------
    AssertionError
        If the feature counts of train and test differ, if the numbers of
        training rows and targets differ, or if the test row count does not
        match ``expected_test_rows``.
    """
    # Load the raw data
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Separate the target from the training features
    y_train = train_df['price_CHF']
    train_df = train_df.drop(['price_CHF'], axis=1)

    # One-hot encode the season column; the encoder only ever sees training data in fit
    encoder = _dense_one_hot_encoder()
    season_train = encoder.fit_transform(train_df['season'].to_numpy().reshape(-1, 1))
    season_test = encoder.transform(test_df['season'].to_numpy().reshape(-1, 1))
    season_cols = encoder.get_feature_names_out(['season'])
    season_train_df = pd.DataFrame(season_train, columns=season_cols)
    season_test_df = pd.DataFrame(season_test, columns=season_cols)

    # Scale train and test with the same scaler. The original column names are kept
    # so that every column label is a string: scikit-learn rejects frames whose
    # labels mix strings and integers.
    numeric_train = train_df.drop('season', axis=1)
    numeric_test = test_df.drop('season', axis=1)
    scaler = StandardScaler()
    scaled_train_df = pd.DataFrame(scaler.fit_transform(numeric_train), columns=numeric_train.columns)
    scaled_test_df = pd.DataFrame(scaler.transform(numeric_test), columns=numeric_test.columns)

    train_features = pd.concat([season_train_df, scaled_train_df], axis=1)
    test_features = pd.concat([season_test_df, scaled_test_df], axis=1)

    # Impute the training features with the iterative imputer
    imp = IterativeImputer(max_iter=1000, random_state=0)
    imp.fit(train_features)
    imp_df = pd.DataFrame(imp.transform(train_features), columns=train_features.columns)

    # Impute missing price_CHF values with KNN over the imputed features + target
    imp_df_y = pd.concat([imp_df, y_train], axis=1)
    knn_imputer = KNNImputer(n_neighbors=2)
    imp_df_y = pd.DataFrame(knn_imputer.fit_transform(imp_df_y), columns=imp_df_y.columns)

    # Impute the test features with the imputer that was fitted on the training features
    imp_test_df = pd.DataFrame(imp.transform(test_features), columns=test_features.columns)

    # Extract X_train, y_train and X_test
    X_train = imp_df_y.drop(['price_CHF'], axis=1).to_numpy()
    y_train = imp_df_y['price_CHF'].to_numpy()
    X_test = imp_test_df

    test_rows_ok = expected_test_rows is None or X_test.shape[0] == expected_test_rows
    assert (X_train.shape[1] == X_test.shape[1]) and (X_train.shape[0] == y_train.shape[0]) and test_rows_ok, "Invalid data shape"
    return X_train, y_train, X_test

In [76]:
def average_LR_RMSE(X, y, kernels, n_folds, alpha):
    """Return the cross-validated RMSE of a Gaussian process for each kernel.

    For every split produced by ``KFold(n_splits=n_folds)`` and every kernel in
    ``kernels``, a ``GaussianProcessRegressor(kernel=kernel, alpha=alpha)`` is
    fitted on the training part and scored by RMSE on the held-out part.

    Parameters
    ----------
    X, y : array-like supporting integer-array indexing (e.g. numpy arrays)
        Features and targets.
    kernels : sequence of kernels
        Kernels to compare.
    n_folds : int
        Number of cross-validation folds.
    alpha : float
        Value forwarded to ``GaussianProcessRegressor(alpha=...)``.

    Returns
    -------
    numpy.ndarray of shape (len(kernels),)
        RMSE of each kernel averaged over the folds.
    """
    n_kernels = len(kernels)
    # One row per fold, one column per kernel
    fold_scores = np.zeros((n_folds, n_kernels))

    splitter = KFold(n_splits=n_folds)
    for fold_idx, (fit_idx, val_idx) in enumerate(splitter.split(X)):
        X_fit, y_fit = X[fit_idx], y[fit_idx]
        X_val, y_val = X[val_idx], y[val_idx]
        for kernel_idx, kernel in enumerate(kernels):
            model = GaussianProcessRegressor(kernel=kernel, alpha=alpha)
            model.fit(X_fit, y_fit)
            fold_mse = mean_squared_error(y_val, model.predict(X_val))
            fold_scores[fold_idx, kernel_idx] = np.sqrt(fold_mse)

    # Average over folds (rows) to get one score per kernel
    avg_RMSE = fold_scores.mean(axis=0)
    assert avg_RMSE.shape == (n_kernels,)
    return avg_RMSE

In [106]:
def modeling_and_prediction(X_train, y_train, X_test, expected_test_rows=100):
    """Fit a Gaussian process on the training data and predict the test targets.

    The model is a ``GaussianProcessRegressor`` with kernel
    ``RationalQuadratic() + WhiteKernel(0.05)`` and ``alpha=1e-9``.

    Parameters
    ----------
    X_train : array-like of shape (n_train, n_features)
        Training features.
    y_train : array-like of shape (n_train,)
        Training targets.
    X_test : array-like of shape (n_test, n_features)
        Test features; a DataFrame is accepted and converted to an array.
    expected_test_rows : int or None, default 100
        Required number of predictions. Pass ``None`` to skip this check.

    Returns
    -------
    numpy.ndarray of shape (n_test,)
        Predicted targets for ``X_test``.

    Raises
    ------
    AssertionError
        If the prediction vector does not have shape ``(expected_test_rows,)``.
    """
    # Work on plain arrays throughout. Fitting on an ndarray and predicting on a
    # DataFrame makes scikit-learn's feature-name validation complain (a warning,
    # or a TypeError when the column labels mix strings and integers).
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)

    gpr = GaussianProcessRegressor(kernel=RationalQuadratic() + WhiteKernel(0.05), alpha=1e-9)
    gpr.fit(X_train, y_train)
    y_pred = gpr.predict(X_test)

    if expected_test_rows is not None:
        assert y_pred.shape == (expected_test_rows,), "Invalid data shape"
    return y_pred

In [None]:
if __name__ == "__main__":
    # Build the feature matrices and the target vector from the CSV files
    X_train, y_train, X_test = data_loading_enc()
    # Fit the Gaussian process regressor and predict price_CHF for the test set
    y_pred = modeling_and_prediction(X_train, y_train, X_test)
    # Write the predictions as a single 'price_CHF' column, without the index
    dt = pd.DataFrame(y_pred, columns=['price_CHF'])
    dt.to_csv('results.csv', index=False)
    print("\nResults file successfully generated!")

In [None]:
# K-fold cross-validation for finding suitable kernels
X_train, y_train, X_test = data_loading_enc()

# Candidate kernel families (kept for experimentation; not used in the run below)
kernels = [
    DotProduct(),
    RBF(),
    Matern(),
    RationalQuadratic(),
    RBF() * ExpSineSquared(length_scale=1.0, periodicity=4),
]
kernels_2 = [
    RationalQuadratic() + WhiteKernel(noise_level=0.05),
    RBF() * ExpSineSquared(length_scale=1, periodicity=4) + WhiteKernel(noise_level=0.05),
]

# Sweep the RationalQuadratic alpha over 0.5, 1.5, ..., 9.5 with a fixed white-noise term
scales = range(5, 100, 10)
Rational_Q_kernels = [
    RationalQuadratic(alpha=scale / 10) + WhiteKernel(0.05)
    for scale in scales
]

n_folds = 10
avg_RMSE = average_LR_RMSE(X_train, y_train, Rational_Q_kernels, n_folds, alpha=1e-9)

In [119]:
# Reload the raw CSV files and print the first five rows of each
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
print(train_df.head(), test_df.head(), sep="\n")

   season  price_AUS  price_CHF  price_CZE  price_GER  price_ESP  price_FRA  \
0  spring        NaN   9.644028  -1.686248  -1.748076  -3.666005        NaN   
1  summer        NaN   7.246061  -2.132377  -2.054363  -3.295697  -4.104759   
2  autumn  -2.101937   7.620085  -1.910282        NaN  -3.388777        NaN   
3  winter  -2.098475   8.411894  -1.903834        NaN  -3.588235        NaN   
4  spring  -1.969687   8.926884  -1.697257  -1.331049        NaN  -3.911096   

   price_UK  price_ITA  price_POL  price_SVK  
0 -1.822720  -3.931031        NaN  -3.238197  
1 -1.826021        NaN        NaN  -3.212894  
2 -2.034409  -4.073850        NaN  -3.114061  
3 -2.214720  -4.018620  -2.330803        NaN  
4 -2.388092  -4.093946        NaN        NaN  
   season  price_AUS  price_CZE  price_GER  price_ESP  price_FRA  price_UK  \
0  spring        NaN   0.472985   0.707957        NaN  -1.136441 -0.596703   
1  summer  -1.184837   0.358019        NaN  -3.199028  -1.069695       NaN   
2  autumn