In [1]:
#import relevant packages 
import pandas as pd 
import numpy as np
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor, LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold

from sklearn.preprocessing import StandardScaler, MinMaxScaler
import itertools
import joblib
import json
import os

import matplotlib.pyplot as plt # we only need pyplot
import numpy as np
import pandas as pd
import seaborn as sb


sb.set() # set the default Seaborn style for graphics

In [2]:
import plotly.graph_objects as go
import plotly.express as px
import scipy.stats as stats
from IPython.display import display, HTML

In [3]:
# load dataset
df = pd.read_csv('listing.csv')

In [4]:
# Function to create scrollable table within a small window
def create_scrollable_table(df, table_id, title):
    html = f'<h3>{title}</h3>'
    html += f'<div id="{table_id}" style="height:200px; overflow:auto;">'
    html += df.to_html()
    html += '</div>'
    return html

In [5]:
df.shape

(988, 20)

In [6]:
def remove_rows_with_missing_ratings(df):
       
       df = df.dropna(subset=['Location_rating'])
       return  df

df = remove_rows_with_missing_ratings(df)

In [7]:
def combine_description_strings(df):

        df['Description'].str.replace("'About this space',","")
        df['Description'] = df['Description'].str.replace('"',"")
        df['Description'] = df['Description'].str.replace("' '","")
        df['Description'] = df['Description'].str.replace("'","")
        df['Description'] = df['Description'].str.replace(r'\\n', '')
        df['Description'] = df['Description'].str.replace(r'\\n\\', '')
        df['Description'] = df['Description'].str.split(',', n=1).str[-1]
        df['Description'] = df['Description'].str[:-2]
        df= df.dropna(subset=['Description'])
        return df
df = combine_description_strings(df)

In [9]:
def set_default_feature_values(df):

        df['beds'] = df['beds'].fillna(1)
        df['guests'] = df['guests'].fillna(1)
        df['bathrooms'] = df['bathrooms'].fillna(1)
        df['bedrooms'] = df['bedrooms'].fillna(1)
        return df
    
def clean_tabular_data(df):
    df = set_default_feature_values(df)
    df = remove_rows_with_missing_ratings(df)
    df = combine_description_strings(df)
    try:
        df.drop('Unnamed: 19', axis=1, inplace=True)
    except:
        pass
    return df


df = set_default_feature_values(df)
df = clean_tabular_data(df)
# df.to_csv('clean_tabular_data.csv', index=False)


In [13]:
numerical_features = df.select_dtypes(include=[np.number])
numerical_features.describe()

Unnamed: 0,beds,bathrooms,Price_Night,Cleanliness_rating,Accuracy_rating,Communication_rating,Location_rating,Check-in_rating,Value_rating,amenities_count
count,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0
mean,2.425301,1.376506,153.93494,5.088916,4.905904,4.933494,4.90506,4.944458,4.769277,35.451446
std,1.844651,0.827813,129.179626,6.776068,0.130051,0.121753,0.120707,0.104018,0.174995,14.256733
min,1.0,0.0,3.0,3.8,4.0,3.9,4.0,3.9,3.7,3.0
25%,1.0,1.0,83.0,4.8,4.9,4.9,4.9,4.9,4.7,25.0
50%,2.0,1.0,120.0,4.9,4.9,5.0,4.9,5.0,4.8,35.0
75%,3.0,1.5,176.0,5.0,5.0,5.0,5.0,5.0,4.9,44.0
max,17.0,10.0,1132.0,200.0,5.0,5.0,5.0,5.0,5.0,84.0


In [14]:
# Summary statistics for numerical dataf
numerical_data = df.select_dtypes(include=[np.number])
summary_stats = numerical_data.describe().T
html_numerical = create_scrollable_table(summary_stats, 'numerical_data', 'Summary statistics for numerical data')

display(HTML(html_numerical))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
beds,830.0,2.425301,1.844651,1.0,1.0,2.0,3.0,17.0
bathrooms,830.0,1.376506,0.827813,0.0,1.0,1.0,1.5,10.0
Price_Night,830.0,153.93494,129.179626,3.0,83.0,120.0,176.0,1132.0
Cleanliness_rating,830.0,5.088916,6.776068,3.8,4.8,4.9,5.0,200.0
Accuracy_rating,830.0,4.905904,0.130051,4.0,4.9,4.9,5.0,5.0
Communication_rating,830.0,4.933494,0.121753,3.9,4.9,5.0,5.0,5.0
Location_rating,830.0,4.90506,0.120707,4.0,4.9,4.9,5.0,5.0
Check-in_rating,830.0,4.944458,0.104018,3.9,4.9,5.0,5.0,5.0
Value_rating,830.0,4.769277,0.174995,3.7,4.7,4.8,4.9,5.0
amenities_count,830.0,35.451446,14.256733,3.0,25.0,35.0,44.0,84.0


First, we model based on numerical data.
We will create a features array that contains all the columns that are numerical.

Then we extract the price per night as the label array for predictions/tests.

In [15]:
# Separate the features (input) and labels (output)
features = numerical_data.drop('Price_Night', axis=1)  # Exclude the 'Price_Night' column as the label
labels = numerical_data['Price_Night']

# Return the features and labels as a tuple
features_labels_tuple = (features, labels)


**Stage 3: Modelling and Prediction**
Data is ready for the training of models

We will use scikit-learn modules to perform our modelling.

The goal is to generate models that will best predict the price per night based on which features the model deems to be of high importance/influence.

In [16]:
# Differentiation of features and labels
X = features
y = labels
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = list(X.columns))
# Splitting into train and test sets
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42 )

In [17]:
null_values = df.isnull().sum()
html_null_values = create_scrollable_table(null_values.to_frame(), 'null_values', 'Null values in the dataset')

# Percentage of missing values for each feature
missing_percentage = (df.isnull().sum() / len(df)) * 100
html_missing_percentage = create_scrollable_table(missing_percentage.to_frame(), 'missing_percentage', 'Percentage of missing values for each feature')

display(HTML(html_null_values + html_missing_percentage))

Unnamed: 0,0
ID,0
Category,0
Title,0
Description,0
Amenities,0
Location,0
guests,0
beds,0
bathrooms,0
Price_Night,0

Unnamed: 0,0
ID,0.0
Category,0.0
Title,0.0
Description,0.0
Amenities,0.0
Location,0.0
guests,0.0
beds,0.0
bathrooms,0.0
Price_Night,0.0


**Supervised Machine Learning Models**
Model 1 - Linear Regression

A simple model based on an equation of Regression Problem : Price = a * (Predictor Variables) + b

In [18]:
# Creating and fitting the model
model= LinearRegression()
model.fit(X_train, y_train)
print('Intercept of Regression \t: b  = ', model.intercept_)



Intercept of Regression 	: b  =  153.2855121913216


In [19]:
# Predict Response corresponding to Predictors
# Make predictions on the test data
y_pred = model.predict(X_test)
train_pred = model.predict(X_train)



# Compute RMSE for training and test sets
rmse_train = np.sqrt(mean_squared_error(y_train, train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))

# Compute R^2 for training and test sets
r2_train = r2_score(y_train, train_pred)
r2_test = r2_score(y_test, y_pred)

# Print the performance measures
print("RMSE - Training set:", rmse_train)
print("RMSE - Test set:", rmse_test)
print("R^2 - Training set:", r2_train)
print("R^2 - Test set:", r2_test)



# Evaluate the model
mse = mean_squared_error(y_test, y_pred)


# Print the evaluation metrics
print("Mean Squared Error test set:", mse)




RMSE - Training set: 101.32260787150916
RMSE - Test set: 108.15972971500096
R^2 - Training set: 0.35761566914283993
R^2 - Test set: 0.39235477272030017
Mean Squared Error test set: 11698.527132022064


In [20]:


def custom_tune_regression_model_hyperparameters(model_dict, hyperparameters, data):
    """Performs a grid search over a specified range of hyperparameter values for multiple models.

    Args:
        model_dict (dict): A dictionary mapping model names to their respective classes
        hyperparameters (dict): A dictionary of hyperparameter names mapping to a list of values to be tried
        data (list): The training, validation, and test datasets

    Returns:
        A dictionary mapping model names to their best MSE values
    """
    best_mse_dict = {}

    for model_name, model_class in model_dict.items():
        best_mse = float('inf')
        best_hyperparameters = {}

        for param_values in itertools.product(*hyperparameters.values()):
            params = dict(zip(hyperparameters.keys(), param_values))

            # Create a model instance with the current hyperparameter values
            model = model_class(**params)

            # Fit the model on the training data
            model.fit(data[0], data[2])

            # Predict on the validation data
            y_pred = model.predict(data[1])

            # Calculate the mean squared error
            mse = mean_squared_error(data[3], y_pred)

            # Check if the current hyperparameters result in a better MSE
            if mse < best_mse:
                best_mse = mse
                best_hyperparameters = params

        # Store the best MSE for the current model
        best_mse_dict[model_name] = best_mse

        # Print the best hyperparameters and MSE for the current model
        print("Model:", model_name)
        print("Best Hyperparameters:", best_hyperparameters)
        print("Best MSE:", best_mse)
        print()

    return best_mse_dict
grid_dic =  {'alpha':[0.001,0.01,0.1,1],
              'max_iter':[1000,5000,10000,50000],
              'random_state': [1],
              'tol':[0.1,0.01,0.001,0.001]}
data = X_train, X_test, y_train, y_test
model_class = { 'SGDRegressor':SGDRegressor,
                'LassoRegression': Lasso,
                'RidgeRegression': Ridge}


custom_tune_regression_model_hyperparameters(model_class, grid_dic, data)

Model: SGDRegressor
Best Hyperparameters: {'alpha': 0.01, 'max_iter': 1000, 'random_state': 1, 'tol': 0.1}
Best MSE: 11473.845737280835

Model: LassoRegression
Best Hyperparameters: {'alpha': 1, 'max_iter': 1000, 'random_state': 1, 'tol': 0.1}
Best MSE: 11651.177778670162

Model: RidgeRegression
Best Hyperparameters: {'alpha': 1, 'max_iter': 1000, 'random_state': 1, 'tol': 0.1}
Best MSE: 11698.145363127878



{'SGDRegressor': 11473.845737280835,
 'LassoRegression': 11651.177778670162,
 'RidgeRegression': 11698.145363127878}

from our results we can see that SGDRegressor has the lowest MSE and the best hyperparametsers for it are: {'alpha': 0.01, 'max_iter': 1000, 'random_state': 1, 'tol': 0.1}

In [21]:


def tune_regression_model_hyperparameters(model, hyperparameters, data):
    """
    This function takes in a regression model class, training, testing and validation datasets, and a dictionary of hyperparameters to tune. It then uses GridSearchCV to find the best hyperparameters for the model, and returns the best model, the best hyperparameters, and performance metrics (validation and test rmse, r2, and mae).

    Parameters:
        model_dict (dict): A dict of possible models
        data: having the four split training and valid data set
        hyperparameters (dict): The hyperparameters to be tested by GridSearchCV.

    Outputs:
        best_model: an instance of the model_class, with the best hyperparameters found
        best_hyperparameters: a dictionary of the best hyperparameters found
        performance_metrics: a dictionary of performance metrics (validation and test rmse, r2, and mae)
    """
    best_mse = float('inf')
    best_model = None
    best_hyperparameters = {}
    
    model = model()
    

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(model, hyperparameters, scoring=['neg_mean_squared_error', 'r2'], cv=5, refit='r2')
    grid_search.fit(data[0], data[2])

    # Get the best model and its hyperparameters
    
    best_model = grid_search.best_estimator_
    best_hyperparameters = grid_search.best_params_
    best_score = grid_search.best_score_

    # Predict on the validation data
    y_pred = best_model.predict(data[1])
    
    # Calculate the mean squared error
    mse = mean_squared_error(data[3], y_pred)
    
    # Check if the current model and hyperparameters result in a better MSE
    if mse < best_mse:
        best_mse = mse
    # puting the best models and metrics in a dictionary

    performance_metrics_dict={}
    performance_metrics_dict["mse"] = mse
    performance_metrics_dict["rmse"] = mse**(1/2.0)
    performance_metrics_dict["r2"] = best_score

    return model.__class__.__name__, best_hyperparameters, performance_metrics_dict


    

grid_dic =  {'alpha':[0.001,0.01,0.1,1],
              'max_iter':[1000,5000,10000,50000],
              'tol':[0.1,0.01,0.001,0.001]}
data = X_train, X_test, y_train, y_test
model = Ridge
model_class = {'LassoRegression': Lasso,
                'RidgeRegression': Ridge,
                'SGDRegressor':SGDRegressor}
tune_regression_model_hyperparameters(model, grid_dic, data)

('Ridge',
 {'alpha': 1, 'max_iter': 1000, 'tol': 0.1},
 {'mse': 11698.145363127878,
  'rmse': 108.15796486217683,
  'r2': 0.04572327204431277})

In [22]:
def save_model(model, hyperparameters, metrics, folder):
    """This function saves a trained model, its associated hyperparameters and performance metrics to a specified folder.
    Parameters:
        model: Machine learning model name
        hyperparameters: A dictionary of the best hyperparameters used to train the model
        metrics: A dictionary of the performance metrics of the model on test and validation sets
        model_folder: A string specifying the directory path where the model and associated files will be saved."""
    
    # Create the folder if it doesn't exist
    os.makedirs(folder, exist_ok=True)

    # Save the trained model
    model_path = os.path.join(folder, "model.joblib")
    joblib.dump(model, model_path)

    # Save the hyperparameters as a JSON file
    hyperparameters_path = os.path.join(folder, "hyperparameters.json")
    with open(hyperparameters_path, "w") as f:
        json.dump(hyperparameters, f)

    # Save the performance metrics as a JSON file
    metrics_path = os.path.join(folder, "metrics.json")
    with open(metrics_path, "w") as f:
        json.dump(metrics, f) 
        
    print ('Model is saved')

model = SGDRegressor()
hyperparameters = {'alpha': 0.001, 'max_iter': 5000, 'tol': 0.001}
metrics = {'mse': 11874.628398645742, 'rmse': 108.97076855123002, 'r2': 0.30426989032666596}
save_model(model, hyperparameters, metrics, folder="./models/regression/linear_regression/sgd")

Model is saved


THE SGDREGRESSOR WHICH WAS OUR BEST MODEL WAS SAVED IN THE PREVIOUS CODE USING THE SAVE MDL FUNCTION

In [23]:
def evaluate_all_models(models_dict, hyperparameters, data):
    """This function Improves the performance of the model by using different models provided by sklearn.
such as decision trees, random forests, and gradient boosting  and then saving the best models,
 hyperparameters and performance metrics to specific folder.
    
    Outputs:
        It saves the best models, hyperparameters and performance metrics of all evaluated models to specific folder."""
   
    best_model_name, best_hyperparameters, performance_metrics_dict = tune_regression_model_hyperparameters(models_dict, hyperparameters, data)
    folder_path = f'./models/regression/linear_regression/{best_model_name}'
    save_model(best_model_name, best_hyperparameters, performance_metrics_dict, folder_path) 

# run the code on selected params
decision_tree_hyperparameters = {'max_depth': [10, 20, 50], 'min_samples_split': [2, 4, 6, 8], 'min_samples_leaf': [1, 3, 5, 7], 'splitter': ['best', 'random']}
random_forest_hyperparameters = {'n_estimators': [50, 100, 150], 'max_depth': [10,20,50], 'min_samples_split': [2, 4, 6, 8], 'min_samples_leaf': [1, 3, 5, 7]}
gradient_boost_hyperparameters = {'n_estimators': [50, 100, 150], 'learning_rate': [0.1, 0.001, 0.0001], 'criterion': ['friedman_mse', 'squared_error'], 'min_samples_split': [2, 4, 6, 8], 'min_samples_leaf': [1, 3, 5, 7]}
dtr = DecisionTreeRegressor
rfr = RandomForestRegressor
gbr = GradientBoostingRegressor
evaluate_all_models(gbr, gradient_boost_hyperparameters, data)

Model is saved


In [24]:
def find_best_model(models):
    """This function compares the Root Mean Squared Error (RMSE) of the trained models on validation set and returns the model with the lowest RMSE.
    Parameters:
        None
    Outputs:
        Prints the model name with the lowest RMSE
    """
    best_model = None
    best_rmse = float('inf')
    best_r2 = 0
    for model in models:
        with open(f'./models/regression/linear_regression/{model}/metrics.json') as f: 
            metrics = json.load(f)
            validation_r2 = metrics['r2']
            validation_rmse = metrics['rmse']
            validation_mae = metrics['mse']
            print(f'{model}: RMSE: {validation_rmse}')

            if validation_rmse < best_rmse:
                best_rmse = validation_rmse
                best_model = model

    
    return f'The model with the lowest RMSE is: {best_model}'
models = ['sgd', 'DecisionTreeRegressor', 'RandomForestRegressor', 'GradientBoostingRegressor']
find_best_model(models)

sgd: RMSE: 108.97076855123002
DecisionTreeRegressor: RMSE: 118.0366327776913
RandomForestRegressor: RMSE: 111.32954559037628
GradientBoostingRegressor: RMSE: 116.54049151050316


'The model with the lowest RMSE is: sgd'

CLASSIFICATION MODELS

In [25]:
def class_load(df, column_list, label):
        class_features = df.drop(columns= column_list)
        class_labels = df[label]
        class_loaded_data = (features,labels)
        return class_loaded_data



In [26]:
def split_prepare(df, column_list, label):
    """Split data into train, test, and validation sets, with normalization. The labels are also encoded.

    Parameters:
        X (Matrix): Features
        y (Vector): Labels
  """
        
    # Load the dataset X, y
    
    X, y = class_load(df, column_list, label)  
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)


    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # Preprocess the features (e.g., scaling)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, y_train, X_test_scaled, y_test

    


In [27]:
# to get the data we would for classification model
   
column_list = ['ID', 'Category', 'Title', 'Description', 'Amenities', 'Location', 'url']
class_data =split_prepare(df, column_list, 'Category')

In [38]:
def log_regression(data):
    

    # Train a logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(data[0], data[1])

    # Make predictions on the test set
    y_test_pred = model.predict(data[2])

  # Test set evaluation metrics
    print(f'Test Accuracy Score: {accuracy_score(data[3], y_test_pred)}')
    print(f'Test Precision Score: {precision_score(data[3], y_test_pred, average="micro")}')
    print(f'Test Recall Score: {recall_score(data[3], y_test_pred, average="micro")}')    
    print(f'Test F1 Score: {f1_score(data[3], y_test_pred, average="micro")}')

log_regression(class_data)

Test Accuracy Score: 0.006024096385542169
Test Precision Score: 0.006024096385542169
Test Recall Score: 0.006024096385542169
Test F1 Score: 0.006024096385542169


In [56]:
def tune_classification_model_hyperparameters(model_class, data, hyperparameters):
    """
    This function performs hyperparameter tuning for a classification model and returns the best model, its hyperparameters, and its performance metrics on a validation set.
    
    Parameters:
    model_class (class), data: X, y values, hyperparameters (dict).
    
    Returns:
    best_model (scikit-learn classifier instance), best_hyperparameters (dict), performance_metrics (dict)
    """
    
    performance_metrics = {}
    # k_folds = 3
    
    # used stratified kfold because of the warning of target having classes with only one memeber indicating an imbalaced dataset 
    # stratified_cv = StratifiedKFold(n_splits=k_folds, shuffle=False, random_state= None)
    grid_search = GridSearchCV(model_class, hyperparameters, scoring = 'accuracy', refit= True) 
    grid_search.fit(data[0], data[1])

    best_model = grid_search.best_estimator_
    model_name = model_class.__class__.__name__
    best_hyperparameters = grid_search.best_params_

    # Provides Validation Metrics
    y_test_pred = best_model.predict(data[2])
    
    
    # Accuracy score the same as F1 error in this multiclass classification - precision and recall calculated using weighted average as micro would return the same score as the accuracy
    y_test_accuracy = accuracy_score(data[3], y_test_pred)
    y_test_precision = precision_score(data[3], y_test_pred, average='weighted')
    y_test_recall = recall_score(data[3], y_test_pred, average='macro')

    # Maps metrics to the performance metrics dict
    performance_metrics['test_accuracy'] = y_test_accuracy
    performance_metrics['test_precision'] = y_test_precision
    performance_metrics['test_recall'] = y_test_recall

    return model_name, best_model, best_hyperparameters, performance_metrics



In [49]:
def evaluate_class_models(models_dict, hyperparameters, data):
    model_name, best_model, best_hyperparameters, performance_metrics_dict = tune_classification_model_hyperparameters(models_dict, data, hyperparameters)
    folder_path = f'./models/regression/logistic_regression/{model_name}'
    save_model(model_name, best_hyperparameters, performance_metrics_dict, folder_path)
    


In [64]:

gbr = GradientBoostingClassifier()
# evaluate_class_models(gbr, gradient_boosting_classifier_hyperparameters, class_data)

In [63]:
def find_best_class_model(models): 
    """This function compares the F1 error score of the trained models on validation set and returns the model with the lowest F1 score.
    Parameters:
        None
    Outputs:
        Prints the model name with the lowest F1 score
    """  
    best_model = None
    best_accuracy_score= 0.0
    
    for model in models:
        with open(f'./models/regression/logistic_regression/{model}/metrics.json') as f: 
            metrics = json.load(f)
            validation_accuracy = metrics['test_accuracy']
            validation_recall = metrics['test_recall']
            validation_precision = metrics['test_precision']
            print(f'{model}: accuracy {validation_accuracy}')

            if validation_accuracy > best_accuracy_score:
                best_accuracy_score = validation_accuracy
                best_model = model

    return print(f'The model with the lowest F1 Score is: {best_model}')

class_models = ['LogisticRegression', 'DecisionTreeClassifier', 'RandomForestClassifier', 'GradientBoostingClassifier']
find_best_class_model(class_models)


GradientBoostingClassifier: accuracy 0.012048192771084338
The model with the lowest F1 Score is: GradientBoostingClassifier
