In [2]:
import pandas as pd
import numpy as np

In [3]:
def get_artif():
    train = pd.read_csv('./artificial/artificial_2x_test.tsv', names=['x', 'target'], index_col=None, header=None, sep='\t')
    test = pd.read_csv('./artificial/artificial_2x_train.tsv', names=['x', 'target'], index_col=None, header=None, sep='\t')

    return (train, test)


def get_prague():
    names = ['area', 'construction', 'ownership',   'status', 'floor', 'equip', 'cellar',     'balcony', 'target', 'nth']
    train = pd.read_csv('./pragueestateprices/pragueestateprices_train.tsv', index_col=None,     names=names, header=None, sep='\t')
    test = pd.read_csv('./pragueestateprices/pragueestateprices_test.tsv', index_col=None,  names=names, header=None, sep='\t')


    train_size = len(train)
    tog = train.append(test)

    for col in tog.columns[np.where(tog.dtypes ==   'object')]:
        tog[col] = pd.Categorical(tog[col])

    tog = tog.drop('nth', axis=1)

    train, test = (tog[:train_size], tog[train_size:])

    return (train, test)

def get_X(df):
    return pd.get_dummies(df[df.columns[:-1]])

def get_Y(df):
    return df[df.columns[-1]]


In [6]:
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [69]:
def create_models():
    return [
        (ARDRegression(n_iter = 10000), "ARD"),
        (HuberRegressor(), "Huber"),
        (LinearRegression(normalize=False), "LR"),
        (KNeighborsRegressor(n_neighbors=5), "KNN"),
        (DecisionTreeRegressor(max_depth=10, min_samples_split=5,min_samples_leaf=3), "Tree"),
        #SVR()
    ]

models = create_models()

In [70]:
def R2ToMSE(r2, df_test):
    score = (1-r2)
    score *= ((get_Y(df_test) - get_Y(df_test).mean()) ** 2).sum()
    
    return score/len(df_test)

def report(name, mse, r2):
    print(name + "\t" + "MSE: " + str(mse) + "\t sqrt(MSE): " + str(mse ** (1/2)) + "\t R2: " + str(r2))


def eval_dataset(data):
    df_train, df_test = data
    
    for (m, name) in create_models():
        m = m.fit(get_X(df_train), get_Y(df_train))
        r2 = m.score(get_X(df_test), get_Y(df_test))
    
        report(name,R2ToMSE(r2, df_test), r2)
        
print("Prague:")
eval_dataset(get_prague())
print("Artif:")
eval_dataset(get_artif())



Prague:
ARD	MSE: 3265917221772.445	 sqrt(MSE): 1807184.888652084	 R2: -0.001732387976441041
Huber	MSE: 1620268027037.4487	 sqrt(MSE): 1272897.4927453697	 R2: 0.5030263017489336
LR	MSE: 2537172101439.544	 sqrt(MSE): 1592850.307291788	 R2: 0.2217906042018708
KNN	MSE: 1312607748183.019	 sqrt(MSE): 1145690.9479362308	 R2: 0.597392828790022
Tree	MSE: 1523452479824.5696	 sqrt(MSE): 1234282.171881523	 R2: 0.532721870471943
Artif:
ARD	MSE: 0.973830503588607	 sqrt(MSE): 0.9868285076894602	 R2: 0.9997023842872007
Huber	MSE: 1.0085414933912258	 sqrt(MSE): 1.0042616657979262	 R2: 0.999691776141395
LR	MSE: 0.973752806885902	 sqrt(MSE): 0.9867891400324094	 R2: 0.9997024080323591
KNN	MSE: 18.89289600000001	 sqrt(MSE): 4.346595909444541	 R2: 0.9942260766230236
Tree	MSE: 29.554829333333192	 sqrt(MSE): 5.436435351711008	 R2: 0.990967646252312
