# OpenLab 2.2 Regression

In this OpenLab we are going to **predict the price** of Airbnb listings based on a set of given features.

Steps:
1. Clean the data: properly convert the string data to *float*;
2. Split the dataset into *train* and *test*;
3. Normalize the numeric values using the *StandardScaler*;
4. Train a *LinearRegressionWithSGD* model using the training set;
5. Evaluate the model's predictions on the test set by computing the *Mean Squared Error*.

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.mllib.regression import LinearRegressionWithSGD, LinearRegressionModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils

import matplotlib.pyplot as plt
import numpy as np


In [2]:
# create (or reuse) the spark context
# getOrCreate keeps this cell re-runnable: building a second SparkContext in
# the same kernel raises an error, whereas getOrCreate hands back the live one
conf = SparkConf().setAppName("pre-processing").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf=conf)

# load the dataset: one string per line of the CSV file, header line included
server = "11"
rdd_airbnbs = sc.textFile("hdfs://kddrtserver{0}.isti.cnr.it:9000/hpsa/datasets/venice_airbnb_regression.csv".format(server))
rdd_airbnbs.count()

7696

In [3]:
# peek at the first two lines: the header and the first data row
rdd_airbnbs.take(2)

['id;price;accommodates;bathrooms;bedrooms;beds;review_scores_rating;review_scores_accuracy;review_scores_cleanliness;review_scores_checkin;review_scores_communication;review_scores_location;review_scores_value;reviews_per_month',
 '6623;$225.00;4;2.0;2.0;4.0;99.0;10.0;10.0;10.0;10.0;10.0;10.0;1.3']

### Step 1. Pre-Processing

In [4]:
# clean the line
def parseLine(line):
    """Turn one ';'-separated line into a list of floats.

    Every field has its '$' signs and ',' thousands separators removed
    before being converted with float(); a field that is still not a valid
    number makes float() raise ValueError.
    """
    values = []
    for field in line.split(';'):
        stripped = field.replace("$", "").replace(",", "")
        values.append(float(stripped))
    return values

# the first line of the file is the header with the column names
header = rdd_airbnbs.first()

# preprocessing: drop every line equal to the header, then parse the rest
rows_without_header = rdd_airbnbs.filter(lambda row: row != header)
cleaned_rdd = rows_without_header.map(parseLine)


### Step 2. Split the dataset into train and test

In [5]:
# sample the train set (fixed seed, so the same rows are drawn on every run)
rdd_train = cleaned_rdd.sample(withReplacement=False, fraction=0.7, seed=100)

# obtain the test set: every row whose id was not drawn for training
# the ids are gathered into a set so that each membership test inside the
# filter is a hash lookup instead of a linear scan over a list
train_ids = set(rdd_train.map(lambda x: x[0]).collect())
rdd_test = cleaned_rdd.filter(lambda x: x[0] not in train_ids)

# print
print(rdd_test.take(5))

[[6623.0, 225.0, 4.0, 2.0, 2.0, 4.0, 99.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 1.3], [44527.0, 300.0, 6.0, 2.0, 3.0, 3.0, 96.0, 10.0, 9.0, 10.0, 10.0, 10.0, 9.0, 0.67], [44998.0, 200.0, 6.0, 2.0, 2.0, 4.0, 60.0, 6.0, 7.0, 7.0, 6.0, 9.0, 5.0, 0.05], [45036.0, 120.0, 2.0, 1.0, 1.0, 1.0, 78.0, 8.0, 8.0, 8.0, 9.0, 9.0, 7.0, 0.17], [46158.0, 200.0, 2.0, 1.0, 1.0, 1.0, 73.0, 7.0, 8.0, 8.0, 7.0, 9.0, 7.0, 0.21]]


In [6]:
# check if the split went well: train and test together must account for
# every data row of the file
n_train = rdd_train.count()
n_test = rdd_test.count()
n_data_rows = rdd_airbnbs.count() - 1  # minus 1 because of the header
assert n_train + n_test == n_data_rows, "Wrong split!"

### Step 3. Normalize

In [7]:
from pyspark.mllib.feature import StandardScaler

# normalize using the mean and the std
# the scaler statistics are fitted on the training rows only and then applied
# unchanged to the test rows
# rows are passed whole, so every column is scaled, the id and the price included
scaler = StandardScaler(withMean=True, withStd=True)
std_scaler = scaler.fit(rdd_train)
rdd_train_norm = std_scaler.transform(rdd_train)
rdd_test_norm = std_scaler.transform(rdd_test)

### Step 4. Train the Model

In [9]:
# adapt the model
def getVectors(v):
    """Wrap one row in a LabeledPoint: column 1 is the label, columns 2+ the features."""
    # column 0 holds the id and is deliberately left out
    label = v[1]
    features = v[2:]
    return LabeledPoint(label, features)


# turn the normalized train and test rows into the LabeledPoints the model expects
rdd_train_vectors = rdd_train_norm.map(getVectors)
rdd_test_vectors = rdd_test_norm.map(getVectors)


In [10]:
# train the model
# iterations=50 is the only setting passed explicitly; everything else is left
# to the library defaults
# NOTE(review): no intercept=True is passed to train() - presumably acceptable
# because the label column was centred by the scaler; confirm
model = LinearRegressionWithSGD.train(rdd_train_vectors, iterations=50)

In [11]:
# coefficients of the model (one weight per feature of the training vectors)
model.weights

DenseVector([0.2086, 0.0577, 0.2889, -0.0661, 0.1684, -0.0568, 0.0379, -0.0104, -0.0311, 0.094, -0.1075, -0.1299])

### Step 5. Evaluate the Model

In [12]:
# training set
# pair every true label with the model's prediction for the same point
train_target_predic = rdd_train_vectors.map(
    lambda point: (point.label, model.predict(point.features))
)
# squared residual of each (label, prediction) pair
SE = train_target_predic.map(lambda pair: (pair[0] - pair[1])**2)
# sum the squared residuals, then divide by the number of evaluated points
sum_squared_error = SE.reduce(lambda total, err: total + err)
MSE = sum_squared_error / train_target_predic.count()

print("Evaluation - Training")
print("Mean Squared Error = " + str(MSE))

Evaluation - Training
Mean Squared Error = 0.7562160161873945


In [13]:
# test set
# pair every true label with the model's prediction for the same point
test_target_predic = rdd_test_vectors.map(
    lambda point: (point.label, model.predict(point.features))
)
# squared residual of each (label, prediction) pair
SE = test_target_predic.map(lambda pair: (pair[0] - pair[1])**2)
# sum the squared residuals, then divide by the number of evaluated points
sum_squared_error = SE.reduce(lambda total, err: total + err)
MSE = sum_squared_error / test_target_predic.count()

print("Evaluation - Test")
print("Mean Squared Error = " + str(MSE))

Evaluation - Test
Mean Squared Error = 3.479052207082139
