In [2]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Import training data to pandas dataframe
df = pd.read_csv("train.csv")
# Trimming Down
df_bm = df.loc[:,['Neighborhood','GrLivArea','YearBuilt','OverallQual','CentralAir','SalePrice','FullBath','HalfBath','BsmtFullBath','BsmtHalfBath','TotalBsmtSF','BsmtUnfSF']]


In [3]:
# Creating total bathroom count TotBathroom
TotBathroom = df_bm.FullBath + 0.5 * df_bm.HalfBath \
+ df_bm.BsmtFullBath + (0.5) * df_bm.BsmtHalfBath

df_bm['TotBathroom'] = TotBathroom

# Remove the one's we don't need anymore
del df_bm['FullBath']
del df_bm['HalfBath']
del df_bm['BsmtFullBath']
del df_bm['BsmtHalfBath']

In [4]:
# Creating total square foot TotSF
TotSF = df_bm.GrLivArea + df_bm.TotalBsmtSF - df_bm.BsmtUnfSF
# Adding to dataset
df_bm['TotSF'] = TotSF

# Remove the ones we don't need anymore
del df_bm['GrLivArea']
del df_bm['TotalBsmtSF']
del df_bm['BsmtUnfSF']


In [5]:
# Data Preprocessing

# deal with missing values in textual columns 
#replace missing neighborhood with a dummy variable 
df_bm['Neighborhood'].fillna('fill')
#Replace missing ac classification with Y since more likely
#and allows us to maintain binary classification
df_bm['CentralAir'].fillna('Y')

df_bm[:5]

Unnamed: 0,Neighborhood,YearBuilt,OverallQual,CentralAir,SalePrice,TotBathroom,TotSF
0,CollgCr,2003,7,Y,208500,3.5,2416
1,Veenker,1976,6,Y,181500,2.5,2240
2,CollgCr,2001,7,Y,223500,3.5,2272
3,Crawfor,1915,7,Y,140000,2.0,1933
4,NoRidge,2000,8,Y,250000,3.5,2853


In [6]:
print(df_bm)


     Neighborhood  YearBuilt  OverallQual CentralAir  SalePrice  TotBathroom  \
0         CollgCr       2003            7          Y     208500          3.5   
1         Veenker       1976            6          Y     181500          2.5   
2         CollgCr       2001            7          Y     223500          3.5   
3         Crawfor       1915            7          Y     140000          2.0   
4         NoRidge       2000            8          Y     250000          3.5   
5         Mitchel       1993            5          Y     143000          2.5   
6         Somerst       2004            8          Y     307000          3.0   
7          NWAmes       1973            7          Y     200000          3.5   
8         OldTown       1931            7          Y     129900          2.0   
9         BrkSide       1939            5          Y     118000          2.0   
10         Sawyer       1965            5          Y     129500          2.0   
11        NridgHt       2005            

In [7]:
df_bm.columns

Index([u'Neighborhood', u'YearBuilt', u'OverallQual', u'CentralAir',
       u'SalePrice', u'TotBathroom', u'TotSF'],
      dtype='object')

# Basic Linear Regression

# Get input data

In [39]:
df_X = df_bm.loc[:,['YearBuilt', 'TotSF', 'TotBathroom']]
df_y = df_bm.loc[:, ['SalePrice']]
df_X

Unnamed: 0,YearBuilt,TotSF,TotBathroom
0,2003,2416,3.5
1,1976,2240,2.5
2,2001,2272,3.5
3,1915,1933,2.0
4,2000,2853,3.5
5,1993,2094,2.5
6,2004,3063,3.0
7,1973,2981,3.5
8,1931,1774,2.0
9,1939,1928,2.0


In [40]:
df_X.values.tolist()

[[2003.0, 2416.0, 3.5],
 [1976.0, 2240.0, 2.5],
 [2001.0, 2272.0, 3.5],
 [1915.0, 1933.0, 2.0],
 [2000.0, 2853.0, 3.5],
 [1993.0, 2094.0, 2.5],
 [2004.0, 3063.0, 3.0],
 [1973.0, 2981.0, 3.5],
 [1931.0, 1774.0, 2.0],
 [1939.0, 1928.0, 2.0],
 [1965.0, 1946.0, 2.0],
 [2005.0, 3322.0, 4.0],
 [1962.0, 1649.0, 2.0],
 [2006.0, 1494.0, 2.0],
 [1960.0, 1986.0, 2.5],
 [1929.0, 854.0, 1.0],
 [1970.0, 1582.0, 2.0],
 [1967.0, 1296.0, 2.0],
 [2004.0, 1760.0, 2.5],
 [1958.0, 1843.0, 1.0],
 [2005.0, 2376.0, 3.5],
 [1930.0, 1108.0, 1.0],
 [2002.0, 1795.0, 2.0],
 [1976.0, 1900.0, 2.0],
 [1968.0, 1916.0, 2.0],
 [2007.0, 1600.0, 2.0],
 [1951.0, 1620.0, 1.5],
 [2007.0, 2922.0, 3.0],
 [1957.0, 2877.0, 2.0],
 [1927.0, 520.0, 1.0],
 [1920.0, 1317.0, 1.0],
 [1966.0, 1228.0, 1.5],
 [2007.0, 1234.0, 2.0],
 [1959.0, 2718.0, 2.0],
 [2005.0, 2714.0, 3.0],
 [2004.0, 2452.0, 3.5],
 [1994.0, 1097.0, 1.5],
 [1954.0, 2510.0, 1.5],
 [1953.0, 1788.0, 2.0],
 [1955.0, 1152.0, 2.0],
 [1965.0, 1967.0, 2.0],
 [1959.0, 2295.0, 

In [34]:
import torch
from torch.autograd import Variable

In [41]:
X = Variable(torch.Tensor(df_X.values.tolist()))
y = Variable(torch.Tensor(df_y.values.tolist()))
X

Variable containing:
  2003.0000   2416.0000      3.5000
  1976.0000   2240.0000      2.5000
  2001.0000   2272.0000      3.5000
                 ⋮                  
  1941.0000   2615.0000      2.0000
  1950.0000   2156.0000      2.0000
  1965.0000   2376.0000      2.5000
[torch.FloatTensor of size 1460x3]

# Linear Regression

In [46]:
W = Variable(torch.ones(3, 1), requires_grad=True)
b = Variable(torch.zeros(1460, 1), requires_grad=True)
b

Variable containing:
    0
    0
    0
  ⋮   
    0
    0
    0
[torch.FloatTensor of size 1460x1]

In [50]:
predictions = X.mm(W) + b
predictions

Variable containing:
  4422.5000
  4218.5000
  4276.5000
     ⋮      
  4558.0000
  4108.0000
  4343.5000
[torch.FloatTensor of size 1460x1]

In [55]:
import torch.nn as nn

learning_rate = 1e-2
optimizer = torch.optim.Adam([W, b], lr=learning_rate)
loss_fxn = nn.MSELoss()

for t in range(500):
    predictions = X.mm(W) + b
#     loss = (predictions - y).pow(2).sum()
    if t % 25 == 0:
        print losses.data[0]
    losses = loss_fxn(predictions, y)
    optimizer.zero_grad()   
    losses.backward()
    optimizer.step()

24522346496.0
24230078464.0
23939825664.0
23652603904.0
23368341504.0
23087079424.0
22808811520.0
22533496832.0
22261088256.0
21991565312.0
21724923904.0
21461153792.0
21200154624.0
20941991936.0
20686561280.0
20433934336.0
20183996416.0
19936825344.0
19692273664.0
19450413056.0
