### Oben Özgür-150190719
### Ömer Faruk Topal-150170029
### Muzaffer Aydın-150170086
### İhsan Mert Şahin-150170108
### Hakan Toker-150170726

In [1]:
import numpy as np
import pandas as pd
import random as r
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats.stats import pearsonr
from sklearn.model_selection import KFold
from sklearn.neighbors import LocalOutlierFactor
from sklearn.kernel_ridge import KernelRidge

r.seed(1)


def load_data():
    """
    load_data function reads the low resolution and high resolution csv files for testing,
    reads the low resolution csv file to predict and returns them 3 different numpy arrays.
    As file names are predetermined, no input is needed.
    """
    lr = pd.read_csv("train_LR.csv")
    hr = pd.read_csv("train_HR.csv")
    lr_test = pd.read_csv("test_LR.csv")
    lr_np = lr.values
    hr_np = hr.values
    lr_test_np = lr_test.values
    return lr_np, hr_np, lr_test_np
    
    
def write_output(predicted):
    """
    Melt the predicted array and write it to 'output.csv'.

    `predicted` is flattened into a single one-dimensional sequence, wrapped
    in a one-column DataFrame and saved to 'output.csv' in the current working
    directory. The DataFrame index is written as well (index=True).
    """
    pd.DataFrame(predicted.flatten()).to_csv('output.csv', index=True)
    

def preprocess(X,y,X_test,variance_threshold=0):
    """
    Run the preprocessing steps of the pipeline on a training set and its test set.

    Steps:
      1. A VarianceThreshold selector is fitted on X only and then applied to
         both X and X_test, so both keep the same feature columns. With the
         default threshold of 0 only constant (zero variance) features are
         dropped, since they are unlearnable.
      2. LocalOutlierFactor (n_neighbors=2) labels every training sample.
         Each sample labelled -1 (outlier) has its row in X and in y replaced
         by the column-wise mean of X and y respectively.

    Parameters:
        X: training features, one sample per row.
        y: training targets, row-aligned with X.
        X_test: test features with the same columns as X.
        variance_threshold: threshold handed to VarianceThreshold.

    Returns:
        (X, y, X_test) after preprocessing. The returned y is a copy; the
        caller's y array is left untouched.
    """
    # Fit the feature selector on the training data only, then reuse it for the test data
    selector = VarianceThreshold(threshold=variance_threshold)
    X = selector.fit_transform(X)
    X_test = selector.transform(X_test)

    # fit_predict fits the detector itself, so no separate fit() call is needed.
    # Labels: 1 for inliers, -1 for outliers.
    labels = LocalOutlierFactor(n_neighbors=2).fit_predict(X)
    outliers = np.flatnonzero(labels == -1)  # row indexes of the outliers
    num_of_outliers = outliers.size

    # Work on a copy so the replacement below does not modify the caller's array
    y = y.copy()

    # The means are taken before the replacement, i.e. they still include the outlier rows
    X[outliers] = X.mean(axis=0)
    y[outliers] = y.mean(axis=0)

    print(str(num_of_outliers) + " outlier(s) detected and replaced by means of the set.") #prompt

    return X, y, X_test


def train_model(X_train,y_train,X_test,lambda_term):
    """
    Fit a KernelRidge regressor on the training data and predict the test data.

    KernelRidge is used because it handles multi-output targets directly.
    `lambda_term` is passed as its `alpha`, the regularization strength; all
    other settings of the regressor are left at their defaults.

    Returns the prediction for X_test.
    """
    model = KernelRidge(alpha=lambda_term)
    # fit() returns the fitted estimator, so the prediction can be chained onto it
    return model.fit(X_train, y_train).predict(X_test)


def CV_5(X,y,lambda_term):
    """
    Run 5-fold cross validation on the training data and report the metrics.

    For every fold the training part is preprocessed (the held-out part only
    goes through the feature selection fitted on the training part), a model
    is trained with `lambda_term` as regularization term, and MSE, MAE and the
    Pearson correlation between the flattened prediction and the flattened
    ground truth are printed. The averages over the folds are printed last.

    Parameters:
        X: training features, one sample per row.
        y: training targets, row-aligned with X.
        lambda_term: regularization term forwarded to train_model.

    Returns:
        (average_mse, average_mae, average_pearson) over the 5 folds.
    """
    kf = KFold(n_splits=5) #5 Folds
    # Reseeds Python's `random` module as the original did; KFold is used
    # without shuffling here, so the split itself does not depend on the seed.
    r.seed(1)
    ms_errors = [] #MSE per fold
    ma_errors = [] #MAE per fold
    pearsons = [] #Pearson coefficient per fold

    for fold, (train, test) in enumerate(kf.split(X), start=1):
        print("\n-----FOLD" + str(fold) + "-----")
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test] #Split data

        X_train, y_train, X_test = preprocess(X_train,y_train,X_test) #Preprocess train data

        y_pred = train_model(X_train,y_train,X_test,lambda_term) #Predict

        # Compute every metric once and reuse it for both the list and the print.
        # The sklearn metrics take (y_true, y_pred) in that order.
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        pearson = pearsonr(y_pred.flatten(), y_test.flatten())[0]

        ms_errors.append(mse)
        ma_errors.append(mae)
        pearsons.append(pearson)

        print("MSE: " + str(mse))
        print("MAE: " + str(mae))
        print("Pearson: " + str(pearson))

    #Average metrics across all folds
    average_mse = sum(ms_errors)/len(ms_errors)
    print("\nAverage MSE: "+str(average_mse))

    average_mae = sum(ma_errors)/len(ma_errors)
    print("Average MAE: "+str(average_mae))

    average_pearson = sum(pearsons)/len(pearsons)
    print("Average Pearson: "+str(average_pearson))

    return average_mse, average_mae, average_pearson

    
#----------MAIN PROGRAM----------#
# Script entry: load the data, report cross-validated metrics, then fit on the
# whole training set and write the prediction for the test set.
lr_train, hr_train, lr_test = load_data()

# NOTE(review): cross validation runs with 250 while the final fit below uses 230 - confirm which value is intended
CV_5(lr_train,hr_train,250) #5-fold cross validation with 250 as regularization term

#Training on the full training data
print("\n\n--------------------")
print("Making prediction...")

lr_train, hr_train, lr_test = preprocess(lr_train, hr_train, lr_test) #Preprocess the whole training data (and the test data with it)
predicted = train_model(lr_train ,hr_train, lr_test, 230) #Train with lambda=230 as regularization term and predict the test set
write_output(predicted) #Write the predicted output to 'output.csv'

print("Learning procedure is done and written to 'output.csv' in melted form.")


-----FOLD1-----
1 outlier(s) detected and replaced by means of the set.
MSE: 0.02321859403454745
MAE: 0.12199322371207395
Pearson: 0.7052614741226307

-----FOLD2-----
0 outlier(s) detected and replaced by means of the set.
MSE: 0.025570723630232932
MAE: 0.12721453342542471
Pearson: 0.7294299483146065

-----FOLD3-----
1 outlier(s) detected and replaced by means of the set.
MSE: 0.02272445560907092
MAE: 0.12086838419625227
Pearson: 0.7200137166487364

-----FOLD4-----
1 outlier(s) detected and replaced by means of the set.
MSE: 0.023907339587979453
MAE: 0.12391449318088549
Pearson: 0.7360036734738696

-----FOLD5-----
1 outlier(s) detected and replaced by means of the set.
MSE: 0.023822069230219917
MAE: 0.12313871256646622
Pearson: 0.7197635122879267

Average MSE: 0.023848636418410136
Average MAE: 0.12342586941622054
Average Pearson: 0.722094464969554


--------------------
Making prediction...
1 outlier(s) detected and replaced by means of the set.
Learning procedure is done and written 