#### import needed libraries

In [347]:
import pandas as pd
from datetime import datetime
import numpy as np

#### batch gradient descent algorithm

In [348]:
def gradient_train(X, y, lr, stopping_threshold, itr=float("inf")):
    """Fit linear-regression weights by full-batch gradient descent on the MSE.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per sample. No bias column is added here;
        include a column of ones in ``X`` if an intercept is wanted.
    y : pandas.Series
        Targets, index-aligned with the rows of ``X``.
    lr : float
        Learning rate applied to every gradient step.
    stopping_threshold : float
        Training stops once the L2 norm of the gradient is <= this value.
    itr : int or float, optional
        Upper bound on the number of iterations (unbounded by default).

    Returns
    -------
    weight
        Learned weights. Starts as a zero ``numpy`` array; after the first
        update it is a Series indexed by the columns of ``X``.
    loss_list : list
        MSE before training followed by the MSE after each update.
    """
    counter = 0
    weight = np.zeros(X.shape[1])
    # Keep the initial loss in a variable so the final report works even when
    # the loop body never runs (itr <= 0).
    loss = np.power(X.dot(weight) - y, 2).mean()
    loss_list = [loss]

    while counter < itr:
        # Gradient of the mean squared error with respect to the weights.
        delta_w = X.multiply((X.dot(weight) - y), axis=0).mean() * 2
        weight = weight - lr * delta_w
        loss = np.power(X.dot(weight) - y, 2).mean()
        loss_list.append(loss)

        # Stop on convergence, or when the loss is NaN/inf (divergence).
        # `loss == np.nan` is always False, so NaN has to be detected with
        # np.isfinite instead of an equality test.
        if np.linalg.norm(delta_w) <= stopping_threshold or not np.isfinite(loss):
            break

        # Counts iterations that did not trigger a stop, so on an early stop
        # the reported value excludes the final update.
        counter += 1

    print("itr={}, lr={}, loss={}".format(counter, lr, loss))
    return weight, loss_list

#### load in the training data

In [349]:
# Read the training split from the assignment folder into a DataFrame.
train_csv_path = "ia1/PA1_train1.csv"
train_data = pd.read_csv(train_csv_path)

In [350]:
# Preview the first five rows to sanity-check the parsed columns.
train_data.head(n=5)

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price
0,3066410850,7/9/2014,4,2.5,2720,10006,2.0,0,0,3,...,2720,0,1989,0,98074,47.6295,-122.042,2720,10759,5.9495
1,9345400350,7/18/2014,2,2.5,2600,5000,1.0,0,0,5,...,1300,1300,1926,0,98126,47.5806,-122.379,2260,5000,6.65
2,7128300060,7/7/2014,5,1.75,1650,3000,1.5,0,0,3,...,1650,0,1902,0,98144,47.5955,-122.306,1740,4000,4.43
3,2155500030,4/28/2015,4,1.75,1720,9600,1.0,0,0,4,...,1720,0,1969,0,98059,47.4764,-122.155,1660,10720,3.8
4,3999300080,9/4/2014,6,2.25,3830,11180,1.0,0,2,5,...,2440,1390,1962,0,98008,47.5849,-122.113,2500,10400,8.87


### Create new features out of the dataset

##### Do one-hot encoding of the zipcode column

In [351]:
# One indicator column per distinct zipcode in the training data.
# The dtype is pinned to uint8 (the historical default) because newer pandas
# releases return bool columns, which would make the feature matrix mixed-dtype
# once these columns are concatenated with the float features.
train_zipcode = pd.get_dummies(train_data['zipcode'], dtype=np.uint8)

##### Drop the id, sqft_living15 and zipcode columns

In [352]:
# Remove the identifier, the sqft_living15 column and the raw zipcode
# (its one-hot encoding is kept separately in train_zipcode).
train_data = train_data.drop(columns=['id', 'sqft_living15', 'zipcode'])

##### make a new feature column representing the amount of unused residential space

In [353]:
# Lot area that is not taken up by living area.
lot_area = train_data['sqft_lot']
living_area = train_data['sqft_living']
train_data['unused_space'] = lot_area - living_area

##### make a new feature column to represent how long the residence has been built

In [354]:
# Parse the sale-date strings into datetimes so they support date arithmetic.
train_data['date'] = pd.to_datetime(train_data['date'])
# Rows with yr_renovated == 0: whole days between the sale date and today.
# NOTE(review): this measures time since the sale, not since construction
# (yr_built is not used here), and it depends on the day the notebook is run
# because of datetime.today() — confirm this is the intended feature.
train_data.loc[train_data['yr_renovated'] == 0, 'days_since_built'] = (datetime.today() - train_data['date']).dt.days
# Rows with yr_renovated != 0: whole days between the parsed yr_renovated and today.
# NOTE(review): yr_renovated looks like a plain integer year; pd.to_datetime on
# integers without `format`/`unit` treats them as nanoseconds since the epoch,
# so this presumably yields ~1970-01-01 for every renovated house — verify.
# Any change here must be mirrored in the test-data cell to avoid train/test skew.
train_data.loc[train_data['yr_renovated'] != 0, 'days_since_built'] = (datetime.today() - pd.to_datetime(train_data['yr_renovated'])).dt.days

##### make a column feature to represent the geographical location of the house

In [355]:
# Split the houses into four quadrants around the mean training coordinate.
# lat_mean / long_mean are reused later for the test data.
lat_mean = train_data['lat'].mean()
long_mean = train_data['long'].mean()

# Each comparison is computed explicitly (no negation) so that rows with a
# missing coordinate match no quadrant, exactly as before.
is_north = train_data['lat'] > lat_mean
is_south = train_data['lat'] <= lat_mean
is_east = train_data['long'] > long_mean
is_west = train_data['long'] <= long_mean

# The assignment order fixes the order of the new columns; rows outside a
# quadrant are left missing in that quadrant's column.
train_data.loc[is_north & is_east, 'northeast'] = 1
train_data.loc[is_north & is_west, 'northwest'] = 1
train_data.loc[is_south & is_east, 'southeast'] = 1
train_data.loc[is_south & is_west, 'southwest'] = 1

##### create a new feature to represent if residential space is the largest in its neighborhood of 15

In [356]:
# Mark houses whose lot is larger than their sqft_lot15 value; all other rows
# are left missing in the new column.
has_bigger_lot = train_data['sqft_lot'] > train_data['sqft_lot15']
train_data.loc[has_bigger_lot, 'biggest_in_neighborhood'] = 1

### Do other feature engineering

In [357]:
# Break the sale date into numeric parts, then discard the datetime column.
sale_date = train_data['date']
train_data['year'] = sale_date.dt.year
train_data['month'] = sale_date.dt.month
train_data['day'] = sale_date.dt.day
train_data = train_data.drop(columns='date')

# Constant column of ones that serves as the bias/intercept feature.
train_data['dummy'] = np.ones(len(train_data))

# Years between the sale and the last renovation, falling back to the build
# year for houses whose yr_renovated is 0.
never_renovated = train_data['yr_renovated'] == 0
was_renovated = train_data['yr_renovated'] != 0
train_data.loc[never_renovated, 'age_since_renovated'] = train_data['year'] - train_data['yr_built']
train_data.loc[was_renovated, 'age_since_renovated'] = train_data['year'] - train_data['yr_renovated']

In [358]:
# Display the target column.
train_data.loc[:, 'price']

0       5.9495
1       6.6500
2       4.4300
3       3.8000
4       8.8700
         ...  
9995    3.9000
9996    1.3500
9997    4.2000
9998    7.7000
9999    2.8500
Name: price, Length: 10000, dtype: float64

#### Data normalization and fill nas

In [359]:
# Per-column statistics, stored by POSITION in the feature frame (train_data
# without 'price'). The test-data cell looks them up by the same position, so
# the test frame must have the same feature columns in the same order.
column_mean = [0] * train_data.shape[1]
column_std = [0] * train_data.shape[1]

# The indicator columns created with .loc are missing wherever the flag was
# not set; treat those as 0.
train_data = train_data.fillna(0)

# Binary/constant columns that must not be standardized.
columns = ['waterfront', 'dummy', 'northeast', 'southeast', 'northwest', 'southwest', 'biggest_in_neighborhood']
for index, column in enumerate(train_data.drop('price', axis=1).columns):
    if column in columns:
        continue
    column_mean[index] = train_data[column].mean()
    column_scale = train_data[column].std()
    # A constant column has zero standard deviation; dividing by it would fill
    # the column with NaN and poison training. Store a neutral scale of 1.0
    # instead so the column becomes all zeros here and in the test data.
    column_std[index] = column_scale if column_scale > 0 else 1.0
    train_data[column] = (train_data[column] - column_mean[index]) / column_std[index]

# Append the one-hot zipcode columns after scaling so they stay as 0/1.
train_data = pd.concat([train_data, train_zipcode], axis=1)

#### train the batch gradient algorithm on our training data

In [361]:
# Train on every feature column; 'price' is the regression target.
# `mse` receives the list of losses returned by gradient_train.
train_features = train_data.drop(columns='price')
train_target = train_data['price']
weight, mse = gradient_train(train_features, train_target, 0.1, 1e-7, 10000)

itr=10000, lr=0.1, loss=2.4150828220602585


#### retrain the model using only features with positive weights

In [362]:
# Show the learned weights, labelled by feature column.
print(weight)

bedrooms      -0.193361
bathrooms      0.208352
sqft_living    0.843519
sqft_lot       0.068581
floors        -0.249106
                 ...   
98177         -1.380808
98178         -1.067465
98188         -1.020521
98198         -1.165842
98199          0.843458
Length: 98, dtype: float64


In [363]:
# Map each feature label to its learned weight.
dc = dict(weight)

# Keep every feature whose weight is not negative, then append the target
# column name so the list can be used to subset train_data directly.
most_imp_feature = [feature for feature, value in dc.items() if not value < 0]
most_imp_feature.append("price")

In [364]:
#weight, mse = gradient_train(train_data[most_imp_feature].drop('price', axis=1), train_data[most_imp_feature]['price'], 0.1, 1e-7, 10000)

#### Load in the test data

In [365]:
# Read the test split and keep its id column for the submission file.
test_csv_path = "ia1/PA1_test1.csv"
test_data = pd.read_csv(test_csv_path)
submission = test_data['id']

#### Do the same feature engineering done on the train data also on the test data

In [366]:
# one-hot encoding of the test zipcodes, aligned to the zipcode columns seen in
# training: zipcodes missing from the test set become all-zero columns and
# zipcodes unseen in training are dropped, so the test frame always lines up
# with the trained weights. dtype is pinned because newer pandas returns bool.
test_zipcode = pd.get_dummies(test_data['zipcode'], dtype=np.uint8)
test_zipcode = test_zipcode.reindex(columns=train_zipcode.columns, fill_value=0)

# drop id, sqft_living15 and zipcode columns
test_data = test_data.drop(['id', 'sqft_living15', 'zipcode'], axis=1)

# unused residential space
test_data['unused_space'] = test_data['sqft_lot'] - test_data['sqft_living']

# how long the residence has been built
# NOTE(review): deliberately mirrors the training cell, including its
# questionable handling of yr_renovated; change both cells together.
test_data['date'] = pd.to_datetime(test_data['date'])
test_data.loc[test_data['yr_renovated'] == 0, 'days_since_built'] = (datetime.today() - test_data['date']).dt.days
test_data.loc[test_data['yr_renovated'] != 0, 'days_since_built'] = (datetime.today() - pd.to_datetime(test_data['yr_renovated'])).dt.days

# geographical location of the house (quadrants around the TRAINING means)
test_data.loc[(test_data['lat'] > lat_mean) & (test_data['long'] > long_mean), 'northeast'] = 1
test_data.loc[(test_data['lat'] > lat_mean) & (test_data['long'] <= long_mean), 'northwest'] = 1
test_data.loc[(test_data['lat'] <= lat_mean) & (test_data['long'] > long_mean), 'southeast'] = 1
test_data.loc[(test_data['lat'] <= lat_mean) & (test_data['long'] <= long_mean), 'southwest'] = 1

# largest in the neighborhood
test_data.loc[test_data['sqft_lot'] > test_data['sqft_lot15'], 'biggest_in_neighborhood'] = 1

# other feature engineering ('date' was already parsed to datetime above)
test_data['year'] = test_data['date'].dt.year
test_data['month'] = test_data['date'].dt.month
test_data['day'] = test_data['date'].dt.day
test_data = test_data.drop('date', axis=1)

# bias/intercept column
test_data['dummy'] = np.ones(test_data.shape[0])

test_data.loc[test_data['yr_renovated'] == 0, 'age_since_renovated'] = test_data['year'] - test_data['yr_built']
test_data.loc[test_data['yr_renovated'] != 0, 'age_since_renovated'] = test_data['year'] - test_data['yr_renovated']

# indicator columns are missing where the flag was not set; treat as 0
test_data = test_data.fillna(0)

# normalize the test data with the TRAINING mean/std, looked up by column
# position; this relies on the test columns matching the training feature
# columns (training frame without 'price') in the same order
for index, column in enumerate(test_data.columns):
    if column in columns:
        continue
    test_data[column] = (test_data[column] - column_mean[index]) / column_std[index]

# join the modified zipcode df to the rest of the test data
test_data = pd.concat([test_data, test_zipcode], axis=1)

# # filter the test dataset using only the most important features
# size = len(most_imp_feature)
# test_data = test_data[most_imp_feature[:size-1]]

In [367]:
# Compare frame sizes: the training frame, then the test frame.
for frame in (train_data, test_data):
    print(frame.shape)

(10000, 99)
(5583, 98)


#### predict price with the trained weights from our batch gradient algorithm

In [368]:
# Linear prediction: each test row dotted with the learned weights
# (pandas matches test columns to weight labels).
predict_price = test_data.dot(other=weight)

In [369]:
# Pair each saved test id with its predicted price and write the result to disk.
# `submission` holds the id Series on entry and is rebound to the final frame.
submission_columns = {'id': submission, 'price': predict_price}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)