# Import Library

This section imports required libraries. 

In [1]:
import copy
import csv
import os
import random as r
import warnings

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import HuberRegressor, Ridge, TheilSenRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

# Read Files From Path

This section reads the CSV files into pandas DataFrames and converts them to NumPy arrays.

## Inputs: No Input

In [2]:
def read_data_from_file(path="Data/"):
    """Load the three competition CSV files as NumPy arrays.

    Parameters
    ----------
    path : str, optional
        Directory containing ``train_t0.csv``, ``train_t1.csv`` and
        ``test_t0.csv``. Defaults to ``"Data/"``. A trailing separator is
        optional because the file names are joined with ``os.path.join``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_t0, train_t1, test_t0)``, each with the first CSV column
        removed.
    """
    def _load(filename):
        frame = pd.read_csv(os.path.join(path, filename))
        # The first column is dropped from every file
        # (presumably a sample ID column -- TODO confirm against the data).
        return frame.to_numpy()[:, 1:]

    train_t0 = _load("train_t0.csv")
    train_t1 = _load("train_t1.csv")
    test_t0 = _load("test_t0.csv")

    return train_t0, train_t1, test_t0

# Preprocessing

## Outlier Eliminations

In [3]:
def eliminate_outliers(train_t0,train_t1):
    """Drop the samples that LocalOutlierFactor flags as outliers in ``train_t0``.

    The detector is fitted on ``train_t0`` only; the same row selection is
    then applied to ``train_t1`` so the two arrays stay aligned.

    Returns
    -------
    tuple
        ``(train_t0_inliers, train_t1_inliers)``.
    """
    detector = LocalOutlierFactor(n_neighbors=30)
    labels = detector.fit_predict(train_t0)
    # Keep only the rows labelled 1 by fit_predict.
    inlier_rows = np.where(labels == 1)
    return train_t0[inlier_rows], train_t1[inlier_rows]

## Dimension Reduction

In [4]:
def dim_reduction(train_t0,test_t0,n_components=21):
    """Project the train and test features onto principal components.

    The PCA transformer is fitted on ``train_t0`` only and then applied to
    ``test_t0``, so no information from the test set is used for fitting.

    Parameters
    ----------
    train_t0 : array-like
        Training features used to fit the PCA.
    test_t0 : array-like
        Test features transformed with the fitted PCA.
    n_components : int, optional
        Number of components passed to ``PCA``. Defaults to 21, the value
        that was previously hard-coded.

    Returns
    -------
    tuple
        ``(X_train, X_test)``, the transformed train and test arrays.
    """
    transformer = PCA(n_components=n_components)
    X_train = transformer.fit_transform(train_t0)
    X_test = transformer.transform(test_t0)

    return X_train, X_test

In [5]:
def scale_data(train_t0,train_t1,test_t0):
    """Standardize all three arrays with a scaler fitted on ``train_t0``.

    The ``StandardScaler`` is fitted on ``train_t0`` only. The same fitted
    scaler is then applied to ``train_t1`` and ``test_t0``; it is returned so
    that it can later be passed to ``inverse_transform``.

    Returns
    -------
    tuple
        ``(train_t0_scaled, train_t1_scaled, test_t0_scaled, scaler)``.
    """
    scaler = StandardScaler()
    train_t0_transformed = scaler.fit_transform(train_t0)
    # train_t1 and test_t0 reuse the statistics learned from train_t0.
    train_t1_transformed = scaler.transform(train_t1)
    test_t0_transformed = scaler.transform(test_t0)
    return train_t0_transformed, train_t1_transformed, test_t0_transformed, scaler

In [6]:
def backscale_data(test_t0,scaler):
    """Undo a previous scaling by calling ``scaler.inverse_transform``.

    Parameters
    ----------
    test_t0 : array-like
        Data in the scaled space.
    scaler : object
        Fitted scaler exposing ``inverse_transform``.

    Returns
    -------
    The value returned by ``scaler.inverse_transform(test_t0)``.
    """
    restored = scaler.inverse_transform(test_t0)
    return restored

In [7]:
def preprocess_data(train_t0,train_t1,test_t0):
    """Run the preprocessing pipeline: outlier elimination only.

    Outliers are removed from the training pair with ``eliminate_outliers``.
    Neither dimensionality reduction nor scaling is applied, so ``test_t0``
    is returned unchanged.

    Returns
    -------
    tuple
        ``(train_t0_el, train_t1_el, test_t0, scaler)``. ``scaler`` is always
        ``None`` because no scaler is fitted here. The slot is kept so the
        return value still has four elements; the previous code returned an
        undefined name and raised ``NameError`` on every call.
    """
    train_t0_el, train_t1_el = eliminate_outliers(train_t0,train_t1)
    # No scaling step is performed, so there is no fitted scaler to return.
    scaler = None
    return train_t0_el, train_t1_el, test_t0, scaler

# Training Models

In [8]:
def train_model(t0_train,t1_train,model):
    """Fit one independent copy of ``model`` per feature column.

    For every column index ``i`` of ``t0_train``, a deep copy of ``model`` is
    fitted to predict column ``i`` of ``t1_train`` from column ``i`` of
    ``t0_train``. Both columns are passed to ``fit`` as ``(n_samples, 1)``
    arrays. The ``model`` argument itself is never fitted.

    Returns
    -------
    list
        The fitted copies, in column order.
    """
    n_columns = t0_train.shape[1]
    fitted = []
    for col in range(n_columns):
        estimator = copy.deepcopy(model)
        feature_col = t0_train[:, col].reshape(-1, 1)
        target_col = t1_train[:, col].reshape(-1, 1)
        estimator.fit(feature_col, target_col)
        fitted.append(estimator)
    return fitted

In [9]:
def train_models(train_t0,train_t1):
    """Train the three per-column regressor families used by the ensemble.

    The families are, in order: ``Ridge``, ``TheilSenRegressor`` and
    ``HuberRegressor``. Only the first family (Ridge) is trained on data
    passed through ``eliminate_outliers``; the other two are trained on the
    full training data.

    Returns
    -------
    list of list
        One list of fitted per-column models for each family, in the order
        given above.
    """
    base_estimators = [
        Ridge(alpha = 5*1e-2),
        TheilSenRegressor(random_state=0,n_subsamples = 20),
        HuberRegressor(alpha = 5*1e-2),
    ]

    fitted_families = []
    for position, estimator in enumerate(base_estimators):
        if position == 0:
            # Only the first estimator is trained on outlier-filtered data.
            inputs, targets = eliminate_outliers(train_t0, train_t1)
        else:
            inputs, targets = train_t0, train_t1
        fitted_families.append(train_model(inputs, targets, estimator))

    return fitted_families

# Prediction

In [10]:
def predict_model(t0_test,clfs):
    """Predict every column of ``t0_test`` with its own fitted model.

    ``clfs[i]`` receives column ``i`` of ``t0_test`` as an ``(n_samples, 1)``
    array. The per-column predictions are stacked, transposed, and reshaped
    to ``(-1, len(t0_test[0]))``.

    Returns
    -------
    numpy.ndarray
        Prediction matrix with one column per column of ``t0_test``.
    """
    per_column = [
        clfs[col].predict(t0_test[:, col].reshape(-1, 1)).T
        for col in range(len(clfs))
    ]

    stacked = np.array(per_column)
    n_columns = len(t0_test[0])
    return np.transpose(stacked).reshape(-1, n_columns)

In [11]:
def test_models(test_t0,models):
    """Combine the predictions of the first three model families.

    Each entry of ``models`` is a list of per-column fitted models and is
    evaluated with ``predict_model``. The result is a weighted average of the
    first three families with weights 1, 3 and 1 (divided by 5).

    Returns
    -------
    The weighted-average prediction matrix.
    """
    family_predictions = [predict_model(test_t0, fitted) for fitted in models]

    first = family_predictions[0]
    second = family_predictions[1]
    third = family_predictions[2]

    # The second family counts three times as much as each of the others.
    weighted_sum = first + 3 * second + third
    return weighted_sum / 5

# Melt Results

This step is needed because Kaggle does not accept a matrix as a submission, so the prediction matrix is flattened into a single vector.

In [12]:
def melt_results(y_test):
    """Return the result of ``y_test.flatten()``: the matrix as a 1-D vector."""
    flattened = y_test.flatten()
    return flattened

# Write Melted Predictions to CSV

This part writes the flattened predictions to a CSV file for uploading to Kaggle.

In [13]:
def write_to_file(predictions,filename):
    """Write predictions to ``filename`` as a two-column CSV.

    The file has the header ``ID,predicted`` followed by one row per
    prediction, where ``ID`` is the zero-based position of the value in
    ``predictions``. An existing file is overwritten.
    """
    columns = ['ID', 'predicted']
    with open(filename, 'w', newline='') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=columns)
        dict_writer.writeheader()
        dict_writer.writerows(
            {'ID': position, 'predicted': predictions[position]}
            for position in range(len(predictions))
        )

# Function Calls

In [17]:
def q1():
    """Train on the full training set and write the Kaggle submission.

    Reads the data, trains the model families, predicts on the test set,
    flattens the predictions and writes them to ``Kaggle_Submit.csv``.
    """
    train_t0, train_t1, test_t0 = read_data_from_file()
    fitted_families = train_models(train_t0, train_t1)
    flat_predictions = melt_results(test_models(test_t0, fitted_families))
    write_to_file(flat_predictions, "Kaggle_Submit.csv")

In [18]:
# Run the full train/predict pipeline and write Kaggle_Submit.csv.
q1()

# Code With 5-fold CV


In [19]:
def calculate_mse(actual, predicted):
    """Return the mean squared error between two matrices.

    Both inputs are flattened with ``melt_results`` before being passed to
    the ``mse`` metric.
    """
    flat_predicted = melt_results(predicted)
    flat_actual = melt_results(actual)
    return mse(flat_predicted, flat_actual)

In [20]:
def k_fold(train_t0, train_t1, n_splits=5, random_state=1):
    """Produce out-of-fold predictions with shuffled K-fold cross-validation.

    For each fold the model families are trained on the training part and
    evaluated on the held-out part; the fold's MSE is printed and the
    held-out predictions are stored at the corresponding rows of the result.

    Parameters
    ----------
    train_t0 : numpy.ndarray
        Input features.
    train_t1 : numpy.ndarray
        Targets, row-aligned with ``train_t0``.
    n_splits : int, optional
        Number of folds. Defaults to 5.
    random_state : int, optional
        Seed for the fold shuffling. Defaults to 1. The previous code passed
        ``r.seed(1)``, which evaluates to ``None`` and therefore left the
        shuffle unseeded.

    Returns
    -------
    numpy.ndarray
        Out-of-fold predictions with one row per row of ``train_t0``.
    """
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    # The predictions have one column per input column (see predict_model),
    # so the width is derived from the data instead of being hard-coded.
    final_prediction = np.zeros((train_t0.shape[0], train_t0.shape[1]))
    for train_index, test_index in kf.split(train_t0):
        X_train, X_test = train_t0[train_index], train_t0[test_index]
        y_train, y_test = train_t1[train_index], train_t1[test_index]
        ret_models = train_models(X_train, y_train)
        prediction = test_models(X_test, ret_models)

        # Place each held-out prediction back at its original row.
        final_prediction[test_index] = prediction

        # Named fold_mse so the imported `mse` metric is not shadowed.
        fold_mse = calculate_mse(y_test, prediction)
        print("MSE: ", fold_mse)
    return final_prediction

In [21]:
def q2_5fold():
    """Run the cross-validation experiment and save its predictions.

    Reads the data, computes out-of-fold predictions with ``k_fold``, prints
    the overall MSE against ``train_t1`` and writes the flattened predictions
    to ``predictions.csv``.
    """
    # The test split is loaded by the reader but not needed here.
    train_t0, train_t1, _ = read_data_from_file()
    oof_predictions = k_fold(train_t0, train_t1)
    overall_error = calculate_mse(train_t1, oof_predictions)
    print("MSE of Overall: ", overall_error)
    write_to_file(melt_results(oof_predictions), "predictions.csv")

In [22]:
# Run the cross-validation experiment; prints per-fold and overall MSE.
q2_5fold()

MSE:  0.002199392610051998
MSE:  0.002393543799911158
MSE:  0.0023079263995283376
MSE:  0.003624242847320846
MSE:  0.009352010501099461
MSE of Overall:  0.0039754232315823606
