In [2]:
import pandas as pd
import numpy as np

In [119]:
names = ['area', 'construction', 'ownership', 'status', 'floor', 'equip', 'cellar', 'balcony', 'target', 'nth']
train = pd.read_csv('./pragueestateprices/pragueestateprices_train.tsv', index_col=None, names=names, header=None, sep='\t')
test = pd.read_csv('./pragueestateprices/pragueestateprices_test.tsv', index_col=None, names=names, header=None, sep='\t')


train_size = len(train)
tog = train.append(test)

for col in tog.columns[np.where(tog.dtypes == 'object')]:
    tog[col] = pd.Categorical(tog[col])
    
tog = tog.drop('nth', axis=1)

train, test = (tog[:train_size], tog[train_size:])
train.head(5)

Unnamed: 0,area,construction,ownership,status,floor,equip,cellar,balcony,target
0,105,Budova:Smíšená,Vlastnictví:Osobní,Stavobjektu:Novostavba,1,Zařízeno:Ne,Sklep:Ne,Balkón:Ano,8646851
1,61,Budova:Cihlová,Vlastnictví:Družstevní,Stavobjektu:Novostavba,5,Zařízeno:Ne,Sklep:Ne,Balkón:Ano,3326367
2,37,Budova:Cihlová,Vlastnictví:Osobní,Stavobjektu:Velmidobrý,2,Zařízeno:Částečně,Sklep:Ano,Balkón:Ne,1990000
3,71,Budova:Cihlová,Vlastnictví:Osobní,Stavobjektu:Novostavba,1,Zařízeno:Ne,Sklep:Ano,Balkón:Ano,7028000
4,42,Budova:Panelová,Vlastnictví:Družstevní,Stavobjektu:Velmidobrý,1,Zařízeno:Ne,Sklep:Ano,Balkón:Ne,1550000


In [132]:
import numpy as np
import pandas as pd

class Model():
    def Build(self, inputs, targets):
        raise NotImplementedError()
    
    def Predict(self, input):
        raise NotImplementedError()


class LinRegression(Model):
    '''
    Linear regression model.
    '''
    def __init__(self, lr = 0.0001, lr_decay = 0.99, epochs = 2):
        self.lr = lr
        self.lr_decay = lr_decay
        self.epochs = epochs

    def Build(self, inputs, targets):       

        # prepare weights + inputs array, prepend column of 1 for bias
        dim = inputs.shape[1]
        w = np.random.normal(loc = 0.0, scale = 1/2, size = dim+1)
        b = np.insert(inputs, 0, values=1, axis=1) 

        lr = self.lr

        # calculate weights iteratively and save them, decay lr
        for _ in range(self.epochs):
            w = self.__update_weigts(b, targets, w, lr)
            lr *= self.lr_decay
        
        self.w = w


    def __update_weigts(self, b, targets, w, lr):

        # update weights for each input data
        for i in range(0, len(targets)):
            result = np.dot(b[i], w)
            w = w + lr * b[i] * (targets[i] - result)
        return w


    def Predict(self, input):
        b = np.insert(input, 0, values=1, axis=0) 

        result = np.dot(b, self.w)
        return result



In [90]:
train = pd.read_csv('./artificial/artificial_2x_test.tsv', index_col=None, header=None, sep='\t')
test = pd.read_csv('./artificial/artificial_2x_train.tsv', index_col=None, header=None, sep='\t')

train.head(5)


Unnamed: 0,0,1
0,12,24.0
1,40,82.3
2,94,187.6
3,16,33.5
4,67,134.9


In [122]:
def get_X(df):
    return pd.get_dummies(df[df.columns[:-1]])

def get_Y(df):
    return df[df.columns[-1]]



In [133]:
model = LinRegression(epochs = 100)
get_X(train)

model.Build(get_X(train).values, get_Y(train).values)

In [135]:
x_test = get_X(test).values
y_test = get_Y(test).values

err = 0
for i in range(0, len(test)):
    print(i, model.Predict(x_test[i]), y_test[i], model.Predict(x_test[i]) - y_test[i])
    err += (model.Predict(x_test[i]) - y_test[i]) ** 2
    
err /= len(test)
err ** (1/2)

0 1763782.9270657557 1850000 -86217.0729342443
1 4150408.572403304 2790000 1360408.5724033038
2 2924425.133828581 2390000 534425.1338285808
3 4914509.772305039 5350000 -435490.22769496124
4 3826609.5062818453 6346000 -2519390.4937181547
5 3406930.415110314 2080000 1326930.4151103138
6 4528274.066762603 7600000 -3071725.933237397
7 3542768.542000571 3500000 42768.5420005708
8 2959228.434225566 2200000 759228.4342255662
9 5316692.881262469 7950000 -2633307.118737531
10 3221564.7718386617 4600000 -1378435.2281613383
11 2540714.9557994423 2950000 -409285.04420055775
12 2269635.622601782 2390000 -120364.37739821803
13 2037919.8051825021 1980000 57919.80518250214
14 2486076.0955404295 3344485 -858408.9044595705
15 3192357.3413445367 3299000 -106642.65865546325
16 3433821.843228618 4020965 -587143.1567713819
17 3801359.451165668 5426634 -1625274.5488343318
18 1705894.9446605742 1353520 352374.9446605742
19 3118177.2076772437 3690000 -571822.7923227563
20 2288284.919947129 2726213 -437928.0800

1355042.2065020797

In [136]:
model.w


array([ 1.66410563e+04,  4.94477939e+04, -5.21936818e+04,  3.65859043e+04,
        4.65914462e+01, -3.63551631e+04, -5.27877402e+03,  2.16424333e+04,
       -2.72505742e+04,  4.38917987e+04,  2.85851944e+03,  6.36213220e+04,
        2.27426838e+04, -8.39310544e-01, -7.25807259e+04, -3.74055285e-01,
        8.28627447e-01,  2.17045887e+04, -5.06398712e+03, -2.71296261e+04,
        4.37700187e+04,  4.84401208e+04, -3.17983042e+04])