In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# loading the training and testing data
boston_train = np.genfromtxt('boston_x_y_train.csv', delimiter = ',')
x_train = boston_train[:, 0:boston_train.shape[1]-1]
y_train = boston_train[:, boston_train.shape[1]-1]
x_test = np.genfromtxt("boston_x_test.csv", delimiter = ",")

In [3]:
boston_train

array([[-0.40784991, -0.48772236, -1.2660231 , ...,  0.41057102,
        -1.09799011, 37.9       ],
       [-0.40737368, -0.48772236,  0.24705682, ...,  0.29116915,
        -0.52047412, 21.4       ],
       [ 0.1251786 , -0.48772236,  1.01599907, ..., -3.79579542,
         0.89107588, 12.7       ],
       ...,
       [-0.40831101, -0.48772236,  0.24705682, ...,  0.33206621,
        -0.33404299, 20.8       ],
       [-0.41061997, -0.48772236, -1.15221381, ...,  0.203235  ,
        -0.74475218, 22.6       ],
       [ 0.34290895, -0.48772236,  1.01599907, ...,  0.38787479,
        -1.35871335, 50.        ]])

In [4]:
# printing the data
print(boston_train)

[[-0.40784991 -0.48772236 -1.2660231  ...  0.41057102 -1.09799011
  37.9       ]
 [-0.40737368 -0.48772236  0.24705682 ...  0.29116915 -0.52047412
  21.4       ]
 [ 0.1251786  -0.48772236  1.01599907 ... -3.79579542  0.89107588
  12.7       ]
 ...
 [-0.40831101 -0.48772236  0.24705682 ...  0.33206621 -0.33404299
  20.8       ]
 [-0.41061997 -0.48772236 -1.15221381 ...  0.203235   -0.74475218
  22.6       ]
 [ 0.34290895 -0.48772236  1.01599907 ...  0.38787479 -1.35871335
  50.        ]]


In [5]:
# adding dummy data to training data
df = pd.DataFrame(x_train)
N = x_train.shape[1]
count =1
for i in range(N):
    for j in range(i, N):
        df[count] = df[df.columns[i]] * df[df.columns[j]]
        count += 1

for i in range(N):
    for j in range(i, N):
        for k in range(j, N):
            df[count] = df[df.columns[i]]*df[df.columns[j]]*df[df.columns[k]]
            count += 1

x_train = df.values

# adding dummy data to testing data
df = pd.DataFrame(x_test)
N = x_test.shape[1]
count =1
for i in range(N):
    for j in range(i, N):
        df[count] = df[df.columns[i]] * df[df.columns[j]]
        count += 1

for i in range(N):
    for j in range(i, N):
        for k in range(j, N):
            df[count] = df[df.columns[i]]*df[df.columns[j]]*df[df.columns[k]]
            count += 1

x_test = df.values

In [6]:
# feature Scaling
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [7]:
# step_gradient - find the new regression coefficients
def step_gradient(x, y, learning_rate, m):
    M = x.shape[0]
    m_slope = (-2/M)*(((y - (x*m).sum(axis=1)).reshape(-1, 1))*x).sum(axis=0)
    m = m - learning_rate * m_slope
    return m

In [8]:
# generic_gd - calculate regression coefficient with specified learning_rate and num_iterations
def generic_gd(x, y, learning_rate, num_iterations):
    m = np.zeros(x.shape[1])
    for i in range(num_iterations):
        m = step_gradient(x, y, learning_rate, m)
        print(i, "Cost :", cost(x, y, m))
    return m

In [9]:
# cost - finding the error on trainig_data
def cost(x, y, m):
    return ((y - (x*m).sum(axis=1)) ** 2).mean()

In [10]:
# run - run gradient descent on training data
def run(x, y):
    df = pd.DataFrame(x)
    df[x.shape[1]] = 1
    x = df.values
    learning_rate = 0.0015
    num_iterations = 1000
    m = generic_gd(x, y, learning_rate, num_iterations)
    print(m)
    return m
m = run(x_train, y_train)

0 Cost : 595.6709607235008
1 Cost : 592.4135271116637
2 Cost : 589.2442698509406
3 Cost : 586.1216997560581
4 Cost : 583.029473279951
5 Cost : 579.961095144854
6 Cost : 576.9139278561229
7 Cost : 573.8868457991686
8 Cost : 570.8793165585589
9 Cost : 567.8910411390185
10 Cost : 564.921813053411
11 Cost : 561.9714631197571
12 Cost : 559.0398378225194
13 Cost : 556.1267908182022
14 Cost : 553.2321795887898
15 Cost : 550.3558641115793
16 Cost : 547.4977063191202
17 Cost : 544.6575698690319
18 Cost : 541.8353200356307
19 Cost : 539.030823649703
20 Cost : 536.2439490575698
21 Cost : 533.4745660881282
22 Cost : 530.7225460234317
23 Cost : 527.9877615710553
24 Cost : 525.2700868375533
25 Cost : 522.5693973027255
26 Cost : 519.8855697945695
27 Cost : 517.2184824648647
28 Cost : 514.5680147653543
29 Cost : 511.93404742450366
30 Cost : 509.31646242481895
31 Cost : 506.71514298070815
32 Cost : 504.12997351687187
33 Cost : 501.560839647209
34 Cost : 499.0076281542233
35 Cost : 496.4702269689183
36 

301 Cost : 161.66221227016908
302 Cost : 161.14944117586262
303 Cost : 160.63969426463737
304 Cost : 160.13295352405058
305 Cost : 159.6292010495468
306 Cost : 159.12841904380403
307 Cost : 158.63058981608518
308 Cost : 158.13569578159255
309 Cost : 157.6437194608273
310 Cost : 157.1546434789519
311 Cost : 156.66845056515837
312 Cost : 156.18512355203814
313 Cost : 155.70464537495766
314 Cost : 155.22699907143743
315 Cost : 154.75216778053434
316 Cost : 154.28013474222865
317 Cost : 153.81088329681455
318 Cost : 153.34439688429435
319 Cost : 152.88065904377635
320 Cost : 152.4196534128768
321 Cost : 151.96136372712587
322 Cost : 151.50577381937612
323 Cost : 151.0528676192162
324 Cost : 150.60262915238638
325 Cost : 150.1550425402003
326 Cost : 149.7100919989671
327 Cost : 149.26776183941965
328 Cost : 148.82803646614548
329 Cost : 148.39090037702098
330 Cost : 147.95633816264905
331 Cost : 147.524334505801
332 Cost : 147.09487418086118
333 Cost : 146.66794205327506
334 Cost : 146.2435

619 Cost : 87.65179358779962
620 Cost : 87.57159804086399
621 Cost : 87.49185963084595
622 Cost : 87.41257566890359
623 Cost : 87.33374348217373
624 Cost : 87.2553604136757
625 Cost : 87.1774238222172
626 Cost : 87.09993108229968
627 Cost : 87.02287958402464
628 Cost : 86.9462667330008
629 Cost : 86.87008995025143
630 Cost : 86.79434667212212
631 Cost : 86.71903435018967
632 Cost : 86.64415045117107
633 Cost : 86.56969245683305
634 Cost : 86.49565786390215
635 Cost : 86.4220441839759
636 Cost : 86.34884894343358
637 Cost : 86.2760696833483
638 Cost : 86.20370395939908
639 Cost : 86.13174934178402
640 Cost : 86.06020341513326
641 Cost : 85.98906377842295
642 Cost : 85.9183280448902
643 Cost : 85.84799384194682
644 Cost : 85.77805881109605
645 Cost : 85.70852060784752
646 Cost : 85.639376901634
647 Cost : 85.57062537572828
648 Cost : 85.50226372716088
649 Cost : 85.4342896666374
650 Cost : 85.36670091845733
651 Cost : 85.29949522043287
652 Cost : 85.2326703238082
653 Cost : 85.1662239931

912 Cost : 76.08132600712105
913 Cost : 76.06530076716969
914 Cost : 76.04935846137545
915 Cost : 76.03349861629076
916 Cost : 76.01772076124514
917 Cost : 76.00202442832908
918 Cost : 75.98640915237749
919 Cost : 75.97087447095356
920 Cost : 75.95541992433225
921 Cost : 75.94004505548449
922 Cost : 75.924749410061
923 Cost : 75.90953253637663
924 Cost : 75.89439398539429
925 Cost : 75.87933331070943
926 Cost : 75.86435006853435
927 Cost : 75.8494438176827
928 Cost : 75.8346141195539
929 Cost : 75.81986053811815
930 Cost : 75.8051826399007
931 Cost : 75.79057999396733
932 Cost : 75.77605217190842
933 Cost : 75.76159874782499
934 Cost : 75.74721929831288
935 Cost : 75.73291340244879
936 Cost : 75.71868064177467
937 Cost : 75.70452060028391
938 Cost : 75.6904328644063
939 Cost : 75.67641702299397
940 Cost : 75.66247266730653
941 Cost : 75.64859939099736
942 Cost : 75.63479679009893
943 Cost : 75.62106446300895
944 Cost : 75.60740201047638
945 Cost : 75.59380903558733
946 Cost : 75.580285

In [11]:
# predict - predicting values for x_test
def predict(x,m):
    return (x*m).sum(axis=1)

In [12]:
df = pd.DataFrame(x_test)
df[x_test.shape[1]] = 1
x_test = df.values
y_pred = predict(x_test, m)
print(y_pred)

[10.56102779 22.89648149 22.93122063 22.92934449 22.90532338 22.27493544
 22.92728645 22.92493907 22.64963025 22.91680968 22.82446757 22.597315
 22.93706383 22.9265963  22.32049764 22.93560514 22.86916605 22.93057186
 22.9291709  22.92653841 22.89554728 21.98969022 22.44320017 22.92660454
 22.92694757 19.5709992  22.89242205 22.89794074 22.90418253 22.80187152
 22.62543174 22.91729739 22.92404642 22.93864234 12.7791424  22.91199874
 22.93480758 22.86245913 22.93200584 15.23287334 14.07509397 22.911635
 22.92320746 22.80933095 19.66309754 14.35338522 22.73199371 22.7972748
 22.94450994 18.85883889 17.29314421 22.93542333 18.75504653 22.87829538
 22.63949572 22.92721206 22.84856021 22.87757598 22.60208174 19.02785548
  6.64100914 22.74604818 22.90238445 22.80890483 20.03557266 17.77423467
 22.72653703 19.79627244 22.93108428 21.42150599 22.89949088 21.99531679
 22.05832324 12.44893691 22.94357199 22.92902216 22.91695848 22.88292591
 15.32844712 22.93064625 21.17506863 22.89701889 22.9225

In [13]:
np.savetxt('Predictions_boston.csv',y_pred, fmt = "%.8f")