In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# loading the training and testing data
boston_train = np.genfromtxt('boston_x_y_train.csv', delimiter = ',')
x_train = boston_train[:, 0:boston_train.shape[1]-1]
y_train = boston_train[:, boston_train.shape[1]-1]
x_test = np.genfromtxt("boston_x_test.csv", delimiter = ",")

In [3]:
boston_train

array([[-0.40784991, -0.48772236, -1.2660231 , ...,  0.41057102,
        -1.09799011, 37.9       ],
       [-0.40737368, -0.48772236,  0.24705682, ...,  0.29116915,
        -0.52047412, 21.4       ],
       [ 0.1251786 , -0.48772236,  1.01599907, ..., -3.79579542,
         0.89107588, 12.7       ],
       ...,
       [-0.40831101, -0.48772236,  0.24705682, ...,  0.33206621,
        -0.33404299, 20.8       ],
       [-0.41061997, -0.48772236, -1.15221381, ...,  0.203235  ,
        -0.74475218, 22.6       ],
       [ 0.34290895, -0.48772236,  1.01599907, ...,  0.38787479,
        -1.35871335, 50.        ]])

In [4]:
# printing the data
print(boston_train)

[[-0.40784991 -0.48772236 -1.2660231  ...  0.41057102 -1.09799011
  37.9       ]
 [-0.40737368 -0.48772236  0.24705682 ...  0.29116915 -0.52047412
  21.4       ]
 [ 0.1251786  -0.48772236  1.01599907 ... -3.79579542  0.89107588
  12.7       ]
 ...
 [-0.40831101 -0.48772236  0.24705682 ...  0.33206621 -0.33404299
  20.8       ]
 [-0.41061997 -0.48772236 -1.15221381 ...  0.203235   -0.74475218
  22.6       ]
 [ 0.34290895 -0.48772236  1.01599907 ...  0.38787479 -1.35871335
  50.        ]]


In [5]:
# adding dummy data to training data
df = pd.DataFrame(x_train)
N = x_train.shape[1]
count =1
for i in range(N):
    for j in range(i, N):
        df[count] = df[df.columns[i]] * df[df.columns[j]]
        count += 1

for i in range(N):
    for j in range(i, N):
        for k in range(j, N):
            df[count] = df[df.columns[i]]*df[df.columns[j]]*df[df.columns[k]]
            count += 1

x_train = df.values

# adding dummy data to testing data
df = pd.DataFrame(x_test)
N = x_test.shape[1]
count =1
for i in range(N):
    for j in range(i, N):
        df[count] = df[df.columns[i]] * df[df.columns[j]]
        count += 1

for i in range(N):
    for j in range(i, N):
        for k in range(j, N):
            df[count] = df[df.columns[i]]*df[df.columns[j]]*df[df.columns[k]]
            count += 1

x_test = df.values

In [6]:
# feature Scaling
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [7]:
# step_gradient - find the new regression coefficients
def step_gradient(x, y, learning_rate, m):
    M = x.shape[0]
    m_slope = (-2/M)*(((y - (x*m).sum(axis=1)).reshape(-1, 1))*x).sum(axis=0)
    m = m - learning_rate * m_slope
    return m

In [8]:
# generic_gd - calculate regression coefficient with specified learning_rate and num_iterations
def generic_gd(x, y, learning_rate, num_iterations):
    m = np.zeros(x.shape[1])
    for i in range(num_iterations):
        m = step_gradient(x, y, learning_rate, m)
        print(i, "Cost :", cost(x, y, m))
    return m

In [9]:
# cost - finding the error on trainig_data
def cost(x, y, m):
    return ((y - (x*m).sum(axis=1)) ** 2).mean()

In [10]:
# run - run gradient descent on training data
def run(x, y):
    df = pd.DataFrame(x)
    df[x.shape[1]] = 1
    x = df.values
    learning_rate = 0.0015
    num_iterations = 1000
    m = generic_gd(x, y, learning_rate, num_iterations)
    print(m)
    return m
m = run(x_train, y_train)

0 Cost : 595.6709607235008
1 Cost : 592.4135271116637
2 Cost : 589.2442698509406
3 Cost : 586.1216997560581
4 Cost : 583.029473279951
5 Cost : 579.961095144854
6 Cost : 576.9139278561229
7 Cost : 573.8868457991686
8 Cost : 570.8793165585589
9 Cost : 567.8910411390185
10 Cost : 564.921813053411
11 Cost : 561.9714631197571
12 Cost : 559.0398378225194
13 Cost : 556.1267908182022
14 Cost : 553.2321795887898
15 Cost : 550.3558641115793
16 Cost : 547.4977063191202
17 Cost : 544.6575698690319
18 Cost : 541.8353200356307
19 Cost : 539.030823649703
20 Cost : 536.2439490575698
21 Cost : 533.4745660881282
22 Cost : 530.7225460234317
23 Cost : 527.9877615710553
24 Cost : 525.2700868375533
25 Cost : 522.5693973027255
26 Cost : 519.8855697945695
27 Cost : 517.2184824648647
28 Cost : 514.5680147653543
29 Cost : 511.93404742450366
30 Cost : 509.31646242481895
31 Cost : 506.71514298070815
32 Cost : 504.12997351687187
33 Cost : 501.560839647209
34 Cost : 499.0076281542233
35 Cost : 496.4702269689183
36 

330 Cost : 147.95633816264905
331 Cost : 147.524334505801
332 Cost : 147.09487418086118
333 Cost : 146.66794205327506
334 Cost : 146.2435230790012
335 Cost : 145.82160230396607
336 Cost : 145.4021648635223
337 Cost : 144.98519598191115
338 Cost : 144.57068097172652
339 Cost : 144.15860523338404
340 Cost : 143.74895425459235
341 Cost : 143.34171360982833
342 Cost : 142.93686895981426
343 Cost : 142.5344060510001
344 Cost : 142.1343107150478
345 Cost : 141.73656886831805
346 Cost : 141.34116651136208
347 Cost : 140.94808972841494
348 Cost : 140.55732468689268
349 Cost : 140.1688576368924
350 Cost : 139.7826749106951
351 Cost : 139.39876292227237
352 Cost : 139.01710816679474
353 Cost : 138.63769722014467
354 Cost : 138.26051673843114
355 Cost : 137.88555345750794
356 Cost : 137.5127941924948
357 Cost : 137.142225837301
358 Cost : 136.77383536415257
359 Cost : 136.407609823122
360 Cost : 136.04353634166054
361 Cost : 135.68160212413392
362 Cost : 135.32179445136066
363 Cost : 134.96410068

647 Cost : 85.57062537572828
648 Cost : 85.50226372716088
649 Cost : 85.4342896666374
650 Cost : 85.36670091845733
651 Cost : 85.29949522043287
652 Cost : 85.2326703238082
653 Cost : 85.16622399317929
654 Cost : 85.10015400641457
655 Cost : 85.03445815457532
656 Cost : 84.96913424183707
657 Cost : 84.90418008541171
658 Cost : 84.83959351546923
659 Cost : 84.77537237506061
660 Cost : 84.71151452004106
661 Cost : 84.64801781899328
662 Cost : 84.58488015315227
663 Cost : 84.5220994163287
664 Cost : 84.45967351483517
665 Cost : 84.39760036741067
666 Cost : 84.33587790514684
667 Cost : 84.2745040714143
668 Cost : 84.21347682178916
669 Cost : 84.15279412398056
670 Cost : 84.09245395775797
671 Cost : 84.03245431487947
672 Cost : 83.97279319901992
673 Cost : 83.91346862570066
674 Cost : 83.85447862221754
675 Cost : 83.79582122757209
676 Cost : 83.73749449240054
677 Cost : 83.67949647890539
678 Cost : 83.62182526078551
679 Cost : 83.56447892316847
680 Cost : 83.5074555625415
681 Cost : 83.45075

985 Cost : 75.10277909225039
986 Cost : 75.09172000717038
987 Cost : 75.08071566707173
988 Cost : 75.06976576346482
989 Cost : 75.05886998965923
990 Cost : 75.04802804075375
991 Cost : 75.03723961362557
992 Cost : 75.02650440691941
993 Cost : 75.01582212103773
994 Cost : 75.0051924581296
995 Cost : 74.99461512208096
996 Cost : 74.9840898185041
997 Cost : 74.97361625472726
998 Cost : 74.96319413978492
999 Cost : 74.95282318440722
[-3.83433676e+00 -4.08568119e-01  2.07855658e-02  1.17496047e-01
  1.20355261e-01  1.03229356e-01  8.35544018e-02  6.58700175e-02
  5.10471639e-02  3.89151879e-02  2.90575092e-02  2.10543416e-02
  1.45468684e-02  9.24433493e-03  1.17496047e-01  1.20355261e-01
  1.03229356e-01  8.35544018e-02  6.58700175e-02  5.10471639e-02
  3.89151879e-02  2.90575092e-02  2.10543416e-02  1.45468684e-02
  9.24433493e-03  4.91500469e-03  1.03229356e-01  8.35544018e-02
  6.58700175e-02  5.10471639e-02  3.89151879e-02  2.90575092e-02
  2.10543416e-02  1.45468684e-02  9.24433493e-0

In [11]:
# predict - predicting values for x_test
def predict(x,m):
    return (x*m).sum(axis=1)

In [12]:
df = pd.DataFrame(x_test)
df[x_test.shape[1]] = 1
x_test = df.values
y_pred = predict(x_test, m)
print(y_pred)

[10.56102779 22.89648149 22.93122063 22.92934449 22.90532338 22.27493544
 22.92728645 22.92493907 22.64963025 22.91680968 22.82446757 22.597315
 22.93706383 22.9265963  22.32049764 22.93560514 22.86916605 22.93057186
 22.9291709  22.92653841 22.89554728 21.98969022 22.44320017 22.92660454
 22.92694757 19.5709992  22.89242205 22.89794074 22.90418253 22.80187152
 22.62543174 22.91729739 22.92404642 22.93864234 12.7791424  22.91199874
 22.93480758 22.86245913 22.93200584 15.23287334 14.07509397 22.911635
 22.92320746 22.80933095 19.66309754 14.35338522 22.73199371 22.7972748
 22.94450994 18.85883889 17.29314421 22.93542333 18.75504653 22.87829538
 22.63949572 22.92721206 22.84856021 22.87757598 22.60208174 19.02785548
  6.64100914 22.74604818 22.90238445 22.80890483 20.03557266 17.77423467
 22.72653703 19.79627244 22.93108428 21.42150599 22.89949088 21.99531679
 22.05832324 12.44893691 22.94357199 22.92902216 22.91695848 22.88292591
 15.32844712 22.93064625 21.17506863 22.89701889 22.9225

In [13]:
np.savetxt('Predictions_boston.csv',y_pred, fmt = "%1.5f")