In [2]:
import numpy as np
import pandas as pd
import cvxopt

In [3]:
# Load the raw table and turn every kept column into a numeric feature,
# replacing missing values with the column median.

data = pd.read_csv("data.csv", parse_dates=True)


def fill_with_median(frame, column):
    """Replace missing values in ``frame[column]`` with that column's median, in place."""
    frame[column] = frame[column].fillna(frame[column].median())


# ---- non-numerical columns ----

# host_since: date -> years of hosting as of 17 Sep 2019, rounded to one decimal
snapshot_date = pd.to_datetime("9-17-2019")
days_hosting = (snapshot_date - pd.to_datetime(data["host_since"])).dt.days
data["host_since"] = round(days_hosting / 365, 1)
fill_with_median(data, "host_since")
data["host_since"] = pd.to_numeric(data["host_since"])


# host_response_time: ordinal code, 1 (fastest) to 5 (slowest).
# NOTE(review): any label that is not a key below becomes NaN and is then
# median-filled -- confirm these labels match the raw data exactly.
response_time_codes = {
    'within an hour': 1,
    'within a few hours': 2,
    'within a day': 3,
    'within a few days': 4,
    'within a few days or more': 5,
}
data["host_response_time"] = data["host_response_time"].map(response_time_codes)
fill_with_median(data, "host_response_time")


# host_response_rate: "NN%" string -> fraction
data['host_response_rate'] = data['host_response_rate'].str.rstrip('%').astype('float') / (100.0)
fill_with_median(data, 'host_response_rate')


# 't' / 'f' flags -> 1 / 0
for flag_column in ('host_is_superhost', 'host_identity_verified', 'is_location_exact'):
    data[flag_column] = data[flag_column].map({'t': 1, 'f': 0})
    fill_with_median(data, flag_column)


# zipcode: shifted so that the smallest zipcode becomes 1.
# NOTE: a higher or lower zipcode does not by itself inform price; a better
# encoding would map each zipcode to the average price within its area.
data['zipcode'] = pd.to_numeric(data['zipcode'])
fill_with_median(data, 'zipcode')
data['zipcode'] = data['zipcode'] - data['zipcode'].min() + 1


# latitude and longitude overlap with zipcode, so they are dropped
data = data.drop(columns=['latitude', 'longitude'])


# property_type: Apartment -> 2, House -> 3, anything else -> 1.
# Missing values are not in the lookup either, so they also get code 1 and
# no median fill is needed afterwards.
property_codes = {'Apartment': 2, 'House': 3}
data['property_type'] = data['property_type'].map(lambda kind: property_codes.get(kind, 1))


# room_type: Private room -> 2, Entire home/apt -> 3, anything else (including
# missing values) -> 1
room_codes = {'Private room': 2, 'Entire home/apt': 3}
data['room_type'] = data['room_type'].map(lambda kind: room_codes.get(kind, 1))


# bed_type: 1 for 'Real Bed', 0 for everything else (including missing values)
data['bed_type'] = (data['bed_type'] == 'Real Bed').astype('int64')


# amenities: number of comma-separated entries in the string (a string with no
# comma counts as one). na_action='ignore' leaves missing values as NaN so they
# reach the median fill instead of raising AttributeError on a float.
data['amenities'] = data['amenities'].map(lambda listing: listing.count(",") + 1, na_action='ignore')
fill_with_median(data, 'amenities')


# ---- columns that are already numeric: only fill the gaps ----
for numeric_column in ('host_listings_count', 'accommodates', 'bathrooms', 'bedrooms', 'beds'):
    fill_with_median(data, numeric_column)


# square_feet: the valid sample size is too small, so the column is dropped
data = data.drop(columns=['square_feet'])


# Review columns: review counts may be high simply because a listing is cheap,
# and they vary too much; they may be a result of price rather than a cause.
# first_review / last_review and the detailed sub-scores are dropped as well;
# only the overall review_scores_rating is kept.
reviews_to_be_dropped = [
    'number_of_reviews', 'number_of_reviews_ltm', 'first_review', 'last_review',
    'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location', 'review_scores_value',
]
data = data.drop(columns=reviews_to_be_dropped)

fill_with_median(data, 'review_scores_rating')


# The index is written as well (to_csv default). Later cells select features by
# position (iloc[:, 3:-1]), so the number of leading columns must not change.
data.to_csv("adjusted_data.csv")







In [4]:


# Reload the cleaned table from disk. It was saved together with its index, so
# the reloaded frame gains a leading unnamed column holding the old row labels.
data = pd.read_csv("adjusted_data.csv")



In [5]:
# Split the cleaned table into test / train / validation subsets, using the
# listing ids found in the corresponding split files.
test, train, val = (pd.read_csv(split_file) for split_file in ("test.csv", "train.csv", "val.csv"))

test_index, train_index, val_index = (split['id'].tolist() for split in (test, train, val))

listing_ids = data['id']
test_data = data[listing_ids.isin(test_index)]
train_data = data[listing_ids.isin(train_index)]
val_data = data[listing_ids.isin(val_index)]

# Each subset is saved with its index (to_csv default), which adds one more
# leading column when the file is read back.
for subset, subset_file in ((test_data, "test_data.csv"),
                            (train_data, "train_data.csv"),
                            (val_data, "val_data.csv")):
    subset.to_csv(subset_file)

# by now, the data has been roughly processed
##############################################################################################################################################

In [6]:
# Only the training split is needed for fitting; the validation and test
# splits are read in the later cells that use them.
#test = pd.read_csv("test_data.csv")
train = pd.read_csv("train_data.csv")
#val = pd.read_csv("val_data.csv")

In [7]:
# Features: everything except the first three columns and the last column.
# (Presumably the first three are the two saved index columns plus `id`, and the
# last column is the target -- verify against the CSV header.)
train_features = train.iloc[:, 3:-1]
# Target: the last column, sliced with -1: so it stays a 2-D single-column array.
train_target = train.iloc[:, -1:]

xt = train_features.to_numpy()  # (6152, 17)
yt = train_target.to_numpy()    # (6152, 1)

# add column of 1
def add_column(X):
    """Return a new array equal to ``X`` with a column of ones inserted at position 0.

    ``X`` must be 2-D; the result has one more column than ``X``,
    e.g. (6152, 17) -> (6152, 18).
    """
    with_intercept = np.insert(arr=X, obj=0, values=1, axis=1)
    return with_intercept

# compute h(X, theta)
def predict(X, theta):
    """Linear hypothesis: prepend the column of ones to ``X`` and multiply by ``theta``."""
    return np.matmul(add_column(X), theta)

# loss
def loss(X, y, theta):
    """Half of the mean squared difference between ``predict(X, theta)`` and ``y``."""
    residuals = predict(X, theta) - y
    squared_residuals = residuals ** 2
    return squared_residuals.mean() / 2

# One parameter per feature column plus one for the column of ones. The size is
# taken from xt instead of being hard-coded (18 for the 17 features used here),
# so the cell keeps working if the feature set changes.
theta_init = np.zeros((xt.shape[1] + 1, 1), dtype=np.float64)

# Baseline: training loss with every parameter at zero.
print(loss(xt, yt, theta_init))

23731.064531859556


In [8]:
#np.seterr(invalid='ignore')

def loss_gradient(X, y, theta):
    """Gradient of ``loss`` with respect to ``theta``, returned as a column vector.

    Each row's residual (prediction minus target) scales that row of the
    ones-augmented feature matrix; the rows are then averaged.
    """
    Xa = add_column(X)
    residuals = predict(X, theta) - y
    weighted_rows = residuals * Xa
    return weighted_rows.mean(axis=0).reshape(-1, 1)

def run_gd(loss, loss_gradient, X, y, theta_init, lr=0.00009, n_iter=10000, lr_decay=100000):
    """Minimise ``loss`` by batch gradient descent with a decaying step size.

    Parameters
    ----------
    loss, loss_gradient : callables taking ``(X, y, theta)``
        Objective and its gradient; the gradient must have the shape of theta.
    X, y :
        Passed through unchanged to ``loss`` and ``loss_gradient``.
    theta_init : ndarray
        Starting parameters. Copied, never modified.
    lr : float
        Initial step size. It is used for the first update and also scales
        every later step (previously it was silently replaced by a fixed
        ``9 / (i + 100000)`` schedule after the first update).
    n_iter : int
        Number of updates to perform.
    lr_decay : number, > 0
        Decay offset: after update ``i`` the step size becomes
        ``lr * lr_decay / (i + lr_decay)``. With the defaults ``lr * lr_decay``
        is 9 (up to floating point), i.e. the original ``9 / (i + 100000)``.

    Returns
    -------
    (theta, loss_values, theta_values)
        ``theta`` is the final parameter array. ``loss_values[i]`` is the loss
        *before* update ``i``; ``theta_values[i]`` is theta *after* update ``i``.

    Raises
    ------
    ValueError
        If ``lr_decay`` is not positive.
    """
    if lr_decay <= 0:
        raise ValueError("lr_decay must be positive")

    theta_current = theta_init.copy()
    decay_numerator = lr * lr_decay
    step = lr
    loss_values = []
    theta_values = []
    for i in range(n_iter):
        loss_values.append(loss(X, y, theta_current))
        gradient = loss_gradient(X, y, theta_current)
        # Builds a new array each time, so the stored history entries never alias.
        theta_current = theta_current - step * gradient
        theta_values.append(theta_current)

        # Step size for the next update (kept at the end of the loop so the
        # sequence of steps matches the original for the default arguments).
        step = decay_numerator / (i + lr_decay)
    return theta_current, loss_values, theta_values

# Fit on the training arrays, then report the learned parameters and the
# training loss they achieve.
theta_est, loss_values, theta_values = result = run_gd(loss, loss_gradient, xt, yt, theta_init)

flat_theta = theta_est.ravel()
print('estimated theta value', flat_theta)

final_train_loss = loss(xt, yt, theta_est)
print('resulting loss', final_train_loss)

estimated theta value [ 0.11455544  4.43195368  0.15246756  0.19511724 -5.8014343   0.02921851
 -0.84567614 -1.10062616  0.52970163  3.45689608  7.63186171 18.42360776
  9.21938959 13.65540048  5.84865427  0.66377544 -1.48132326  1.58039741]
resulting loss 5842.188368382864


In [9]:
# Evaluate the fitted parameters on the validation split
val = pd.read_csv("val_data.csv")

# Same positional layout as the training arrays: skip the first three columns,
# use the last column as the target.
val_features = val.iloc[:, 3:-1]
val_target = val.iloc[:, -1:]
xv = val_features.to_numpy()
yv = val_target.to_numpy()

vali_loss = loss(xv, yv, theta_est)
print(vali_loss)

# Append the predictions as an extra 'Prediction' column and save the result.
val_predictions = predict(xv, theta_est)
vali_predict = pd.DataFrame(val_predictions, columns=['Prediction'])
val_data2 = pd.concat([val, vali_predict], axis=1)
val_data2.to_csv("val_data2.csv")


5687.718574872866


In [10]:
# Predict for the test split and save the predictions next to the listing ids.
test = pd.read_csv("test_data.csv")

# Same positional layout as the training arrays.
test_features = test.iloc[:, 3:-1]
test_last_column = test.iloc[:, -1:]
xte = test_features.to_numpy()
yte = test_last_column.to_numpy()  # not used below

test_predictions = predict(xte, theta_est)
test_predict = pd.DataFrame(test_predictions, columns=['price'])
test_output = pd.concat([test['id'], test_predict], axis=1)

# NOTE(review): the row index is written as an extra first column -- confirm
# that the consumer of this file expects it.
test_output.to_csv("my_prediction.csv")