# Make necessary imports (purpose in comment)

In [28]:
import numpy as np
import pandas as pd # DATA IMPORT, SPLIT DATA OBJ BY COLS INTO LABELS AND FEATURES
from sklearn.model_selection import train_test_split #SPLIT DATA OBJ BY ROWS INTO TRAIN AND TEST

import torch # PACK DATA INTO TENSORS, BUILD AND TRAIN NN MODEL
from torch.utils.data import DataLoader,TensorDataset # PACK DATA INTO TENSORS

In [2]:
datafile_path = "*********/boston_housing.csv"

In [4]:
headers = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
           'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df=pd.read_csv(datafile_path,sep='\s+',names=headers)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


# Get df info: shape, col names, col types, isnull's

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


# Split 'vertically' into labels and features

In [6]:
y=df.iloc[:,-1] #LABELS, LAST COLUMN
X= df.iloc[:,:len(df.columns)-1] #FEATURES, ALL COLS BUT LAST

# Split 'horizontally' into train and test subsets

## Split

In [168]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2,random_state=10)
X_train.shape, X_train.head()

((404, 13),
          CRIM    ZN  INDUS  CHAS    NOX     RM    AGE     DIS  RAD    TAX  \
 50    0.08873  21.0   5.64     0  0.439  5.963   45.7  6.8147    4  243.0   
 367  13.52220   0.0  18.10     0  0.631  3.863  100.0  1.5106   24  666.0   
 34    1.61282   0.0   8.14     0  0.538  6.096   96.9  3.7598    4  307.0   
 78    0.05646   0.0  12.83     0  0.437  6.232   53.7  5.0141    5  398.0   
 172   0.13914   0.0   4.05     0  0.510  5.572   88.5  2.5961    5  296.0   
 
      PTRATIO       B  LSTAT  
 50      16.8  395.56  13.45  
 367     20.2  131.42  13.33  
 34      21.0  248.31  20.34  
 78      18.7  386.40  12.34  
 172     16.6  396.90  14.69  )

## Check split

In [169]:
y_train.shape, y_train.head()

((404,),
 50     19.7
 367    23.1
 34     13.5
 78     21.2
 172    23.1
 Name: MEDV, dtype: float64)

In [170]:
X_test.shape, y_test.shape,

((102, 13), (102,))

# Fit & transform: fit scaler on train set, then transform both train & test sets

In [200]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Pack train and test into pytorch tensors

In [203]:
# X_train_tsr = torch.FloatTensor(X_train.values)
X_train_tsr = torch.FloatTensor(X_train)
print("X_train_tsr:", X_train_tsr.shape)
# X_test_tsr = torch.FloatTensor(X_test.values)
X_test_tsr = torch.FloatTensor(X_test)
print("X_test_tsr:",X_test_tsr.shape)
y_train_tsr = torch.FloatTensor(y_train.to_numpy())
print("y_train_tsr:",y_train_tsr.shape)
y_test_tsr = torch.LongTensor(y_test.to_numpy())
print("y_test_tsr:",y_test_tsr.shape)

X_train_tsr: torch.Size([404, 13])
X_test_tsr: torch.Size([102, 13])
y_train_tsr: torch.Size([404])
y_test_tsr: torch.Size([102])


In [204]:
batch_size = 10
dataset_train = TensorDataset(X_train_tsr,y_train_tsr)
data_iter = DataLoader(dataset_train,batch_size, shuffle=True)

# Build model, 1 hidden layer

In [212]:
model = torch.nn.Sequential(torch.nn.Linear(X.shape[1],1))
model

Sequential(
  (0): Linear(in_features=13, out_features=1, bias=True)
)

In [213]:
loss = torch.nn.MSELoss(reduction='mean')

In [214]:
lr = 0.001

In [215]:
trainer = torch.optim.SGD(model.parameters(), lr = lr)
# trainer = torch.optim.Adam(model.parameters(),lr = lr)

In [216]:
dummy = torch.randn((100,13))

In [217]:
model(dummy).squeeze().shape

torch.Size([100])

In [222]:
y_train_tsr.shape

torch.Size([404])

In [220]:
num_epochs = 200
for epoch in range(num_epochs):
    for X_batch,y_batch in data_iter:
        trainer.zero_grad()
        l = loss(torch.squeeze(model(X_batch)),y_batch)
        l.backward()
        trainer.step()
#     trainer.zero_grad()
    l_train = loss(model(X_train_tsr).squeeze(),y_train_tsr)
    l_test = loss(model(X_test_tsr).squeeze(),y_test_tsr)
#     l.backward()
#     trainer.step()
    if (epoch+1) % 5 == 0:
        print('epoch %d, loss in training %f, loss in test %f' % (epoch+1, l_train.item(), l_test.item()))
#         break

epoch 5, loss in training 116.013863, loss in test 147.636658
epoch 10, loss in training 62.933708, loss in test 85.167480
epoch 15, loss in training 39.630531, loss in test 57.361946
epoch 20, loss in training 29.264416, loss in test 44.884914
epoch 25, loss in training 24.589104, loss in test 39.294277
epoch 30, loss in training 22.409182, loss in test 36.552082
epoch 35, loss in training 21.360434, loss in test 35.245716
epoch 40, loss in training 20.829239, loss in test 34.620380
epoch 45, loss in training 20.525345, loss in test 34.308525
epoch 50, loss in training 20.334917, loss in test 34.116718
epoch 55, loss in training 20.205219, loss in test 34.113537
epoch 60, loss in training 20.104776, loss in test 34.043327
epoch 65, loss in training 20.019791, loss in test 34.033440
epoch 70, loss in training 19.948524, loss in test 34.079418
epoch 75, loss in training 19.881357, loss in test 34.038944
epoch 80, loss in training 19.821751, loss in test 34.021511
epoch 85, loss in train