In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import PolynomialFeatures
from matplotlib import cm
import math
import matplotlib.patches as mpatches
import sys
import copy
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn import preprocessing

In [2]:
 class ScatterData:
    """Arguments for one scatter series: points, colour, legend label, marker and opacity."""

    def __init__(self, x, y, color, label, marker, alpha):
        # Every constructor argument is stored unchanged under the same name.
        self.x, self.y = x, y
        self.color, self.label = color, label
        self.marker, self.alpha = marker, alpha
class PlotData:
    """Arguments for one line series: points, colour, legend label and opacity."""

    def __init__(self, x, y, color, label, alpha):
        # Coordinates first, then the styling options; all stored as given.
        self.x, self.y = x, y
        self.color, self.label, self.alpha = color, label, alpha
class HistogramData:
    """Arguments for one histogram: the values to bin and the number of bins."""

    def __init__(self, data, numBins):
        # Both arguments are kept as given under the same names.
        self.data, self.numBins = data, numBins

In [3]:
 def PlotHelper(title, xLabel, yLabel, scatterData = None, plotData = None, histogramData = None):
    """Draw and show one figure made of optional scatter series, line series and a histogram.

    Parameters
    ----------
    title : str
        Figure title, drawn left-aligned.
    xLabel, yLabel : str
        Axis labels.
    scatterData : iterable, optional
        Objects exposing ``x``, ``y``, ``label``, ``color``, ``alpha`` and
        ``marker`` (e.g. ``ScatterData``); each one becomes a scatter series.
    plotData : iterable, optional
        Objects exposing ``x``, ``y``, ``label``, ``color`` and ``alpha``
        (e.g. ``PlotData``); each one becomes a line.
    histogramData : object, optional
        Object exposing ``data`` and ``numBins`` (e.g. ``HistogramData``).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Everything is drawn through the Axes object so no call depends on
    # pyplot's notion of the "current" axes.
    _, ax = plt.subplots()
    ax.set_xlabel(xLabel)
    ax.set_ylabel(yLabel)
    ax.set_title(title, loc='left')

    if scatterData is not None:
        for series in scatterData:
            ax.scatter(series.x, series.y, label = series.label, color = series.color,
                       alpha = series.alpha, marker = series.marker)

    if plotData is not None:
        for series in plotData:
            ax.plot(series.x, series.y, label = series.label, color = series.color, alpha = series.alpha)

    if histogramData is not None:
        ax.hist(histogramData.data, bins = histogramData.numBins)

    # Only build a legend when something carries a label; an unconditional
    # legend call warns and draws an empty legend (e.g. histogram-only plots).
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend()
    plt.show()

In [4]:
 def MeanSquareErrorPlot(title, xLabel, xVals, mse, std):
    """Show an error-bar plot of mean squared error against ``xVals``.

    ``mse`` supplies the plotted values and ``std`` the error-bar sizes;
    the y axis is always labelled "Mean Squared Error".
    """
    plt.figure()
    axes = plt.axes()
    # Title, labels and data all go through the axes that was just created.
    axes.set_title(title)
    axes.set_xlabel(xLabel)
    axes.set_ylabel("Mean Squared Error")
    axes.errorbar(xVals, mse, yerr=std)
    plt.show()

In [5]:
class Dataset:
    """Load a CSV file, shuffle its rows, and split them into training and validation parts.

    The first CSV column is used as the target and every remaining column
    as a feature.

    Attributes
    ----------
    X, y
        All shuffled feature columns / the target column.
    trainX, trainY
        The first ``int(splitPercentage * rows)`` shuffled rows.
    validateX, validateY
        The remaining rows.
    ColumnNames
        Labels of the feature columns.
    xPolys, trainxPolys, validatexPolys : dict
        Polynomial expansions of ``X`` / ``trainX`` / ``validateX``, keyed by
        degree and filled by ``AddPolynomialFeatures``.
    polynomialFeatureNames : dict
        Names of the expanded features, keyed by degree.
    """

    def __init__(self, file, splitPercentage = .9, Debug = False):
        """Read ``file`` with ``pd.read_csv``, shuffle it and build the split.

        Parameters
        ----------
        file
            Anything ``pd.read_csv`` accepts.
        splitPercentage : float
            Fraction of rows assigned to the training part.
        Debug : bool
            When true, print the head and the ``info()`` summary of the raw frame.
        """
        df = pd.read_csv(file)
        if Debug:
            print(df.head())
            # info() prints its own report and returns None, so wrapping it
            # in print() would only add a stray "None" line.
            df.info()

        numRowsTrain = int(splitPercentage * df.shape[0])
        # Fixed seed keeps the split reproducible; the index is rebuilt so it
        # runs 0..n-1 in the shuffled order.
        # https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows
        df = shuffle(df, random_state = 0).reset_index(drop=True)

        self.X = df.iloc[:, 1:]
        self.y = df.iloc[:, 0]
        self.ColumnNames = df.columns[1:]

        self.trainX = self.X.iloc[:numRowsTrain]
        self.validateX = self.X.iloc[numRowsTrain:]

        self.trainY = self.y.iloc[:numRowsTrain]
        self.validateY = self.y.iloc[numRowsTrain:]

        # Per-degree caches filled by AddPolynomialFeatures.
        self.xPolys = {}
        self.trainxPolys = {}
        self.validatexPolys = {}
        self.polynomialFeatureNames = {}

    def AddPolynomialFeatures(self, degree):
        """Compute and cache the degree-``degree`` polynomial expansion of every split.

        The transformer is fitted once, on the training rows only, and then
        applied to the full, training and validation features so that the
        validation data is never used for fitting.
        """
        pf = PolynomialFeatures(degree)
        pf.fit(self.trainX)
        self.xPolys[degree] = pf.transform(self.X)
        self.trainxPolys[degree] = pf.transform(self.trainX)
        self.validatexPolys[degree] = pf.transform(self.validateX)
        self.polynomialFeatureNames[degree] = pf.get_feature_names_out(self.ColumnNames)

    def PrintColumns(self):
        """Print the label of each feature column, one per line."""
        for name in self.ColumnNames:
            print(name)

In [6]:
class MLModel:
    """Single-use wrapper around a Lasso, Ridge or k-nearest-neighbours regressor.

    ``TrainModel`` and ``KFoldsValidation`` both assert that no model type has
    been set on the instance yet and then set it, so a fresh ``MLModel`` is
    needed for every model / hyperparameter combination.

    Attributes
    ----------
    type : str or None
        "Lasso", "Ridge" or "KNN" once chosen.
    model
        The underlying estimator.
    thetas : list
        For Lasso/Ridge after ``TrainModel``: the intercept followed by the
        coefficients.
    yPred
        Predictions from the most recent ``Predict`` call.
    meanError, stdError
        Mean and standard deviation of the per-fold mean squared errors from
        the most recent cross-validation.
    """

    _MODEL_TYPES = ("Lasso", "Ridge", "KNN")

    def __init__(self):
        self.thetas = []
        self.type = None
        self.yPred = None
        self.model = None
        self.meanError = None
        self.stdError = None

    @staticmethod
    def _SelectRows(data, positions):
        """Return the rows of ``data`` at the given integer positions.

        pandas objects are indexed through ``.iloc`` so that the positions
        are never interpreted as index labels or column names; anything else
        is indexed with ``data[positions]``.
        """
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return data[positions]

    @staticmethod
    def _BuildModel(ModelType, hyperparameter):
        """Create the unfitted estimator for ``ModelType``.

        For Lasso and Ridge the penalty is ``alpha = 1 / (2 * hyperparameter)``;
        for KNN the hyperparameter is the number of neighbours.
        """
        if ModelType == "Lasso":
            return linear_model.Lasso(alpha=(1/(2 * hyperparameter)))
        if ModelType == "Ridge":
            return linear_model.Ridge(alpha=(1/(2 * hyperparameter)))
        return KNeighborsRegressor(n_neighbors = hyperparameter)

    def _CrossValidate(self, x, y, folds):
        """Fit ``self.model`` on each K-fold split and record the MSE statistics.

        Returns ``(meanError, stdError)`` over the per-fold mean squared errors.
        """
        foldErrors = []
        for trainIdx, testIdx in KFold(n_splits = folds).split(x):
            self.model.fit(self._SelectRows(x, trainIdx), self._SelectRows(y, trainIdx))
            foldPred = self.model.predict(self._SelectRows(x, testIdx))
            # Mean squared error of the model trained on this fold.
            foldErrors.append(mean_squared_error(self._SelectRows(y, testIdx), foldPred))

        foldErrors = np.array(foldErrors)
        self.meanError = foldErrors.mean()
        self.stdError = foldErrors.std()
        return self.meanError, self.stdError

    def TrainModel(self, ModelType, x, y, c = None, K = None):
        """Fit a model of the given type on ``x`` / ``y``.

        Parameters
        ----------
        ModelType : str
            "Lasso", "Ridge" or "KNN".
        x, y
            Training features and targets, passed to the estimator's ``fit``.
        c : number, optional
            Used by Lasso and Ridge as ``alpha = 1 / (2 * c)``.
        K : int, optional
            Number of neighbours, used by KNN.

        Raises
        ------
        AssertionError
            If a type was already set on this instance or ``ModelType`` is
            not supported.
        """
        assert self.type is None and ModelType in self._MODEL_TYPES
        self.type = ModelType
        self.model = self._BuildModel(ModelType, K if ModelType == "KNN" else c)
        print("Fitting " + self.type)
        self.model.fit(x, y)

        if ModelType in ("Lasso", "Ridge"):
            # Intercept first, then one entry per coefficient.
            self.thetas.append(self.model.intercept_)
            self.thetas.extend(self.model.coef_)

    def KFoldsValidation(self, ModelType, x, y, hyperparameter = None, folds = 5):
        """Cross-validate a model of the given type and return ``(mean MSE, std MSE)``.

        ``hyperparameter`` is the ``c`` value for Lasso/Ridge
        (``alpha = 1 / (2 * c)``) or the number of neighbours for KNN.
        ``x`` and ``y`` may be NumPy arrays or pandas objects; rows are always
        selected by position.

        Raises
        ------
        AssertionError
            If a type was already set on this instance or ``ModelType`` is
            not supported.
        """
        assert self.type is None and ModelType in self._MODEL_TYPES
        self.type = ModelType
        if ModelType == "KNN":
            return self.KFoldsKNN(x, y, hyperparameter, folds)

        self.model = self._BuildModel(ModelType, hyperparameter)
        return self._CrossValidate(x, y, folds)

    def KFoldsKNN(self, x, y, K, folds = 5):
        """Cross-validate a ``K``-nearest-neighbours regressor; returns ``(mean MSE, std MSE)``."""
        self.model = KNeighborsRegressor(n_neighbors = K)
        return self._CrossValidate(x, y, folds)

    def Predict(self, x, y):
        """Predict on ``x``, keep the predictions in ``self.yPred`` and return the MSE against ``y``."""
        assert self.type is not None
        self.yPred = self.model.predict(x)
        return mean_squared_error(y, self.yPred)

    def PrintWeights(self, names):
        """Print the number of names and weights, then each name with its coefficient.

        Only valid for Lasso and Ridge; the intercept (first entry of
        ``thetas``) is skipped.
        """
        assert self.type in ("Lasso", "Ridge")
        weights = self.thetas[1:]
        # The two lengths are printed first so a mismatch is visible.
        print(len(names), len(weights))
        for name, weight in zip(names, weights):
            print(name, weight)

In [11]:
# Load both scrubbed CSV files. Dataset shuffles the rows of each file and
# splits them with its default splitPercentage of .9.
reviews = Dataset("./scrubbed-reviews.csv")
listings = Dataset("./scrubbed-listings.csv")

In [12]:
# Display the training-split feature columns of the reviews dataset.
reviews.trainX

Unnamed: 0,id,reviewer_id,comments
0,478683447,124269504,We had a great time at Diogos apartment for a ...
1,634918298,225421938,Great location lovely Irish neighbours
2,212920091,79313584,Maggie was a fantastic host with a lovely home...
3,667932556757057107,459608367,Great hospitality and kindness by Eoin and her...
4,208886893,119566387,Maggies place is wonderful - a beautiful space...
...,...,...,...
218859,670889668009529373,15605934,Place was amazing really central location and ...
218860,617871418394478642,437209297,Good place with a great host Jacinta is very c...
218861,607103876366586672,11943480,With no doubt Dougs house is one of the best o...
218862,400479837,189389844,Really good spot close to restaurants and pub...


In [13]:
# Display the training-split feature columns of the listings dataset.
listings.trainX

Unnamed: 0,name,description,neighborhood_overview,host_id,host_duration_years,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,Luxury apartment in Dublin City's Smithfield,This large luxury apartment has a perfect loca...,Smithfield is a great cosmopolitian area with ...,30703628,7.739726,"Dublin, Ireland",Im a busy communications exec and enjoy living...,within a few hours,1.0,1.00,...,4.69,4.94,4.63,4.75,-1,1,1,0,0,0.28
1,Dublin City Center Luxurious Room,Secure building in a central safe area Just b...,Its a very alive area with a big square only 5...,426395000,1.230137,"Dublin, Ireland",I am Gleisson I love to travel around the worl...,within an hour,1.0,0.99,...,4.84,4.89,4.85,4.73,1,2,1,1,0,6.72
2,Cozy and perfectly located private bedroom,Beautiful cozy private room with double bed an...,The most touristic area in DublinWalking dista...,30449259,7.750685,"Dublin, Ireland",I am a 30 Yo guy Dentist from VenezuelaI used ...,,0.0,0.00,...,5.00,5.00,5.00,4.87,-1,1,0,1,0,0.27
3,Cute 1 bedroom home next to Dublin City Center,Conveniently located within 15 minutes walk of...,,29930249,7.775342,"Dublin, Ireland",,within a day,0.8,0.79,...,4.89,4.72,4.22,4.11,-1,1,1,0,0,5.81
4,A quiet village apartment close to city centre,Bright double bed guest room in a two bedroom ...,,28507828,7.841096,"Dublin, Ireland",A lover of all things outdoors a little bit of...,within an hour,1.0,1.00,...,4.95,4.89,4.84,4.74,-1,1,0,1,0,0.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5583,Double room in Blanschanstown,The spaceBig double room in a lovely house 15 ...,Quiet area,25383784,8.005479,"Dublin, Ireland",,,0.0,0.00,...,5.00,5.00,5.00,4.00,1,1,0,1,0,0.02
5584,"Ivy exchange, Parnell street, Dublin 1.",This apartment is in a quiet modern and safe c...,,118476709,5.838356,"Dublin, Ireland",,,0.0,0.00,...,4.70,4.90,5.00,4.70,1,1,1,0,0,0.21
5585,Spacious home in great location.,This is a much loved very comfortable three be...,Really family friendly area close to the city ...,28985174,7.819178,"Dublin, Ireland",We have had great experiences with Airbnb and ...,,0.0,0.00,...,5.00,5.00,5.00,4.50,-1,1,1,0,0,0.05
5586,Spacious Designer Apartment - free parking,Modern clean apartment in the lovely cultural...,The locality is well supplied with convenience...,5119026,9.871233,"Dublin, Ireland",Im from Galway in the west of Ireland but I ha...,within a few hours,1.0,0.95,...,5.00,5.00,5.00,4.92,-1,1,1,0,0,2.06
