In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import PolynomialFeatures
from matplotlib import cm
import math
import matplotlib.patches as mpatches
import sys
import copy
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn import preprocessing

In [9]:
 class ScatterData:
    """Plain container describing one scatter series.

    PlotHelper forwards these attributes to ``ax.scatter`` as the
    coordinates (``x``, ``y``) and the ``color``, ``label``, ``marker`` and
    ``alpha`` keyword arguments.
    """

    def __init__(self,x,y,color,label,marker,alpha):
        # Point coordinates.
        self.x, self.y = x, y
        # Styling and legend text.
        self.color, self.label = color, label
        self.marker, self.alpha = marker, alpha
class PlotData:
    """Plain container describing one line series.

    PlotHelper forwards these attributes to ``ax.plot`` as the coordinates
    (``x``, ``y``) and the ``color``, ``label`` and ``alpha`` keyword
    arguments.
    """

    def __init__(self,x,y,color,label,alpha):
        # Line coordinates.
        self.x, self.y = x, y
        # Styling and legend text.
        self.color, self.label = color, label
        self.alpha = alpha
class HistogramData:
    """Plain container for one histogram: the values and the bin count.

    PlotHelper passes ``data`` as the values and ``numBins`` as ``bins``
    when drawing the histogram.
    """

    def __init__(self,data,numBins):
        self.data, self.numBins = data, numBins

In [10]:
 def PlotHelper(title, xLabel, yLabel, scatterData = None, plotData = None, histogramData = None):
    """Draw scatter series, line series and/or a histogram on one new figure.

    Parameters:
        title: figure title, left-aligned.
        xLabel, yLabel: axis labels.
        scatterData: optional iterable of ScatterData, one ``ax.scatter`` call each.
        plotData: optional iterable of PlotData, one ``ax.plot`` call each.
        histogramData: optional single HistogramData (not an iterable).

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    _, ax = plt.subplots()
    ax.set_xlabel(xLabel)
    ax.set_ylabel(yLabel)
    ax.set_title(title, loc='left')

    if scatterData is not None:
        for series in scatterData:
            ax.scatter(series.x, series.y, label = series.label, color = series.color,
                       alpha = series.alpha, marker = series.marker)

    if plotData is not None:
        for series in plotData:
            ax.plot(series.x, series.y, label = series.label, color = series.color,
                    alpha = series.alpha)

    if histogramData is not None:
        # Draw on the same axes object as the other series instead of going
        # through pyplot's implicit "current axes".
        ax.hist(histogramData.data, bins=histogramData.numBins)

    # Only build a legend when at least one artist carries a label; an
    # unconditional legend() warns for histogram-only or unlabelled plots.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend()
    plt.show()

In [11]:
 def MeanSquareErrorPlot(title, xLabel, xVals, mse, std):
    """Show an error-bar plot of mean squared error against ``xVals``.

    ``mse`` supplies the plotted values and ``std`` the error-bar lengths
    (passed as ``yerr``). The y axis is always labelled
    "Mean Squared Error"; ``title`` and ``xLabel`` label the rest.
    """
    plt.figure()
    axes = plt.axes()
    axes.set_title(title)
    axes.set_xlabel(xLabel)
    axes.set_ylabel("Mean Squared Error")
    axes.errorbar(xVals, mse, yerr=std)
    plt.show()

In [12]:
class Dataset:
    """Load a CSV file, shuffle its rows and split them into train / validation parts.

    The first CSV column becomes the target ``y``; all remaining columns
    become the feature frame ``X``. Rows are shuffled with a fixed seed
    (``random_state = 0``) so the split is reproducible.

    Attributes set by ``__init__``:
        X, y: all features / targets after shuffling.
        ColumnNames: names of the feature columns.
        trainX, trainY: the first ``int(splitPercentage * n_rows)`` shuffled rows.
        validateX, validateY: the remaining rows.
        xPolys, trainxPolys, validatexPolys: dicts keyed by polynomial
            degree, filled by ``AddPolynomialFeatures``.
        polynomialFeatureNames: dict keyed by degree holding the generated
            feature names.
    """

    def __init__(self, file, splitPercentage = .9, Debug = False):
        """Read ``file`` with ``pd.read_csv`` and build the split.

        Parameters:
            file: path (or anything ``pd.read_csv`` accepts) of the CSV.
            splitPercentage: fraction of rows assigned to the training part.
            Debug: when true, print the first rows and the frame summary.
        """
        df = pd.read_csv(file)
        if Debug:
            print(df.head())
            # DataFrame.info() prints its own report and returns None, so it
            # must not be wrapped in print() (that emitted a stray "None").
            df.info()

        numRowsTrain = int(splitPercentage * df.shape[0])
        # https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows
        df = shuffle(df, random_state = 0).reset_index(drop=True)
        self.X = df.iloc[:,1:]
        self.y = df.iloc[:,0]
        self.ColumnNames = df.columns[1:]
        # NOTE: feature standardisation (preprocessing.StandardScaler, see
        # https://scikit-learn.org/stable/modules/preprocessing.html) was
        # prototyped here and is currently disabled.

        self.trainX = self.X.iloc[:numRowsTrain]
        self.validateX = self.X.iloc[numRowsTrain:]

        self.trainY = self.y.iloc[:numRowsTrain]
        self.validateY = self.y.iloc[numRowsTrain:]

        self.xPolys = {}
        self.trainxPolys = {}
        self.validatexPolys = {}
        self.polynomialFeatureNames = {}

    def AddPolynomialFeatures(self, degree):
        """Compute and cache polynomial features of the given ``degree``.

        Stores the expanded full, training and validation matrices under
        ``degree`` in ``xPolys``, ``trainxPolys`` and ``validatexPolys``, and
        the generated feature names in ``polynomialFeatureNames``.
        """
        pf = PolynomialFeatures(degree)
        # Fit once on the training features and reuse the fitted transformer
        # for every part instead of re-fitting it on each one.
        pf.fit(self.trainX)
        self.xPolys[degree] = pf.transform(self.X)
        self.trainxPolys[degree] = pf.transform(self.trainX)
        self.validatexPolys[degree] = pf.transform(self.validateX)

        self.polynomialFeatureNames[degree] = pf.get_feature_names_out(self.ColumnNames)

    def PrintColumns(self):
        """Print the name of every feature column, one per line."""
        for name in self.ColumnNames:
            print(name)

In [13]:
class MLModel:
    """Wrapper around one scikit-learn regressor: Lasso, Ridge or KNN.

    An instance is single-use: ``TrainModel`` and ``KFoldsValidation`` both
    assert that no model type has been assigned yet, so create a fresh
    ``MLModel`` for every training / cross-validation run.

    Attributes:
        thetas: ``[intercept, coef_0, coef_1, ...]`` after ``TrainModel`` on a
            Lasso or Ridge model; empty otherwise.
        type: "Lasso", "Ridge", "KNN", or None while unused.
        yPred: predictions of the most recent ``Predict`` call.
        model: the underlying estimator.
        meanError, stdError: mean / standard deviation of the per-fold mean
            squared error of the most recent cross-validation, else None.
    """

    def __init__(self):
        self.thetas = []
        self.type = None
        self.yPred = None
        self.model = None
        self.meanError = None
        self.stdError = None

    def _CheckUnused(self, ModelType):
        """Assert the instance is still unused and ``ModelType`` is supported."""
        assert self.type is None and ModelType in ["Lasso", "Ridge", "KNN"]

    @staticmethod
    def _BuildModel(ModelType, hyperparameter):
        """Create the estimator for ``ModelType``.

        For Lasso / Ridge the hyperparameter is the penalty weight C and is
        converted to ``alpha = 1 / (2 * C)``; for KNN it is the number of
        neighbours.
        """
        if hyperparameter is None:
            raise TypeError(ModelType + " requires a hyperparameter (C for Lasso/Ridge, K for KNN)")
        if ModelType == "KNN":
            return KNeighborsRegressor(n_neighbors = hyperparameter)
        alpha = 1 / (2 * hyperparameter)
        if ModelType == "Lasso":
            return linear_model.Lasso(alpha=alpha)
        return linear_model.Ridge(alpha=alpha)

    @staticmethod
    def _TakeRows(data, positions):
        """Select rows by position from an array or a pandas object.

        KFold yields positional indices. Plain ``data[positions]`` on a pandas
        Series / DataFrame is label- or column-based, so ``.iloc`` is used
        whenever it is available.
        """
        indexer = getattr(data, "iloc", None)
        return data[positions] if indexer is None else indexer[positions]

    def _CrossValidate(self, x, y, folds):
        """Fit ``self.model`` on each fold and collect the mean squared error."""
        kf = KFold(n_splits = folds)
        foldErrors = []
        for train, test in kf.split(x):
            self.model.fit(self._TakeRows(x, train), self._TakeRows(y, train))
            foldPred = self.model.predict(self._TakeRows(x, test))
            # Mean squared error of the model trained on this fold.
            foldErrors.append(mean_squared_error(self._TakeRows(y, test), foldPred))

        foldErrors = np.array(foldErrors)
        self.meanError = foldErrors.mean()
        self.stdError = foldErrors.std()
        return self.meanError, self.stdError

    def TrainModel(self, ModelType, x, y, c = None, K = None):
        """Fit a model of ``ModelType`` on ``x`` / ``y``.

        Parameters:
            ModelType: "Lasso", "Ridge" or "KNN".
            x, y: training features and targets.
            c: penalty weight C, required for Lasso and Ridge.
            K: number of neighbours, required for KNN.

        For Lasso and Ridge the intercept followed by the coefficients is
        stored in ``self.thetas``.
        """
        self._CheckUnused(ModelType)
        self.model = self._BuildModel(ModelType, K if ModelType == "KNN" else c)
        self.type = ModelType
        print("Fitting " + self.type)
        self.model.fit(x, y)

        if ModelType in ["Lasso", "Ridge"]:
            self.thetas = [self.model.intercept_, *self.model.coef_]

    def KFoldsValidation(self, ModelType, x, y, hyperparameter = None, folds = 5):
        """Cross-validate a model of ``ModelType`` with ``folds`` folds.

        ``hyperparameter`` is C for Lasso / Ridge and K for KNN.
        Returns ``(meanError, stdError)``: the mean and standard deviation of
        the per-fold mean squared error.
        """
        self._CheckUnused(ModelType)
        self.model = self._BuildModel(ModelType, hyperparameter)
        self.type = ModelType
        return self._CrossValidate(x, y, folds)

    def KFoldsKNN(self, x, y, K, folds = 5):
        """Cross-validate a KNN regressor with ``K`` neighbours.

        Does not assign ``self.type``. Returns ``(meanError, stdError)``.
        """
        self.model = self._BuildModel("KNN", K)
        return self._CrossValidate(x, y, folds)

    def Predict(self, x, y):
        """Predict on ``x`` and return the mean squared error against ``y``.

        The predictions are kept in ``self.yPred``.
        """
        assert self.type is not None
        self.yPred = self.model.predict(x)
        return mean_squared_error(y, self.yPred)

    def PrintWeights(self, names):
        """Print each feature name next to its weight (Lasso / Ridge only).

        The first line printed is ``len(names)`` and the number of weights;
        ``self.thetas[0]`` (the intercept) is skipped.
        """
        assert(self.type in ["Lasso", "Ridge"])
        weights = self.thetas[1:]
        print(len(names), len(weights))
        for i, weight in enumerate(weights):
            print(names[i], weight)

In [16]:
# Load both CSV files with the default train / validation split;
# Dataset uses the first CSV column as the target and the rest as features.
reviews = Dataset(file="./reviews.csv")
listings = Dataset(file="./listings.csv")

In [17]:
# Show the training feature frame of the reviews dataset
# (every CSV column except the first, training rows only).
reviews.trainX

Unnamed: 0,id,date,reviewer_id,reviewer_name,comments
0,478683447,2019-06-30,124269504,Martijn,We had a great time at Diogo's apartment for a...
1,634918298,2020-07-05,225421938,Edward,Great location lovely Irish neighbours
2,212920091,2017-11-19,79313584,Patrick,"Maggie was a fantastic host, with a lovely hom..."
3,667932556757057107,2022-07-10,459608367,Daniela,Great hospitality and kindness by Eoin and her...
4,208886893,2017-11-03,119566387,Kelsey,Maggie’s place is wonderful - a beautiful spac...
...,...,...,...,...,...
218859,670889668009529373,2022-07-14,15605934,Robert,"Place was amazing, really central location and..."
218860,617871418394478642,2022-05-02,437209297,Taylor,Good place with a great host. Jacinta is very ...
218861,607103876366586672,2022-04-17,11943480,Pedro,"With no doubt, Doug's house is one of the best..."
218862,400479837,2019-01-12,189389844,Peter,"Really good spot, close to restaurants and pu..."


In [18]:
# Show the training feature frame of the listings dataset
# (every CSV column except the first, training rows only).
listings.trainX

Unnamed: 0,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,https://www.airbnb.com/rooms/22593475,20220911231053,2022-09-12,previous scrape,Double Room in coastal town near Dublin Airport,Spacious Double Room in newly build 4 bedroom ...,,https://a0.muscache.com/pictures/a23b2fe9-bb18...,27964751,https://www.airbnb.com/users/show/27964751,...,,,,,t,1,0,1,0,
1,https://www.airbnb.com/rooms/4208603,20220911231053,2022-09-12,city scrape,Bright EnSuiteTwin Room in Baldoyle,Our bungalow is in the village & although it i...,A review from Judith July 2016 describes our a...,https://a0.muscache.com/pictures/54121236/2880...,1499994,https://www.airbnb.com/users/show/1499994,...,4.98,4.71,4.97,,f,1,0,1,0,1.26
2,https://www.airbnb.com/rooms/669480550941080498,20220911231053,2022-09-11,city scrape,"Apt: 2+1 Bed , Private entrance",LOCATION: Rathgar village. <br />* **** Plea...,,https://a0.muscache.com/pictures/miso/Hosting-...,76582779,https://www.airbnb.com/users/show/76582779,...,4.57,4.86,4.43,,f,1,1,0,0,4.67
3,https://www.airbnb.com/rooms/52449141,20220911231053,2022-09-11,city scrape,Lovely 2 bedroom condo in exclusive developmen...,The whole group will enjoy easy access to ever...,This is our family home and the first time to ...,https://a0.muscache.com/pictures/3c277bd9-d6a9...,423528681,https://www.airbnb.com/users/show/423528681,...,4.61,4.89,4.56,,f,1,1,0,0,2.84
4,https://www.airbnb.com/rooms/50750811,20220911231053,2022-09-12,city scrape,Beautiful City Apartment - Walk Everywhere !!,Perfectly located in the very heart of Dublins...,The apartment is set in the bustling & fashion...,https://a0.muscache.com/pictures/miso/Hosting-...,410135381,https://www.airbnb.com/users/show/410135381,...,4.29,4.47,4.11,,t,2,2,0,0,6.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6804,https://www.airbnb.com/rooms/9742084,20220911231053,2022-09-12,previous scrape,Ensuite Double Bedroom,The spacious double bedroom en suite is a 15 m...,,https://a0.muscache.com/pictures/cd43c01e-4e03...,2002122,https://www.airbnb.com/users/show/2002122,...,4.87,4.55,4.52,,t,1,0,1,0,0.41
6805,https://www.airbnb.com/rooms/40519127,20220911231053,2022-09-12,previous scrape,Cozy double room close to City Center 15mins,Comfort and convenience double bedroom in new ...,Nice and quiet neighborhood. Harolds Cross is ...,https://a0.muscache.com/pictures/7423c5f5-000e...,146899456,https://www.airbnb.com/users/show/146899456,...,5.00,4.63,4.63,,t,1,0,1,0,0.27
6806,https://www.airbnb.com/rooms/680987260549256607,20220911231053,2022-09-12,city scrape,One Bed Open Plan Suite at Zanzibar Locke,These 32m² apartments have an open-plan layout...,Temple Bar is just across the Ha’penny Bridge ...,https://a0.muscache.com/pictures/prohost-api/H...,371005518,https://www.airbnb.com/users/show/371005518,...,,,,,t,7,7,0,0,
6807,https://www.airbnb.com/rooms/25860792,20220911231053,2022-09-12,previous scrape,Private Double Room with ensuite in Large House,My place is close to City centre with busses a...,Easy access to city centre in direct busses wh...,https://a0.muscache.com/pictures/f597190f-3bd8...,65938332,https://www.airbnb.com/users/show/65938332,...,4.96,4.78,4.78,,t,1,0,1,0,0.44
