In [2]:
 # Import the required modules
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

import pickle

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split

# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import tree
from sklearn.model_selection import KFold 

# Metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import accuracy_score, classification_report

# Pre-Processing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer


# Models
from sklearn.neighbors import NearestNeighbors

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Step 1: Load the dataset
filepath = "Resources/2024_clean_data_only.csv"

# read the CSV file
df = pd.read_csv(filepath, encoding='latin1', low_memory=False)

df.rename(columns={'id': "rental_id"}, inplace=True)
#view
df.head(10)

Unnamed: 0.1,Unnamed: 0,rental_id,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews
0,0,1189243425411300671,Manhattan,Murray Hill,Entire home/apt,58.0,30,1
1,1,651593916026998398,Brooklyn,Flatlands,Private room,80.0,30,0
2,2,310325,Manhattan,Harlem,Private room,75.0,30,31
3,3,572612125615500056,Brooklyn,Sunset Park,Private room,45.0,30,6
4,4,1020282701018874374,Brooklyn,Bedford-Stuyvesant,Private room,47.0,30,0
5,5,640443472855597440,Brooklyn,Bedford-Stuyvesant,Private room,194.0,30,0
6,6,725482656106188093,Brooklyn,Bedford-Stuyvesant,Entire home/apt,400.0,30,0
7,7,681730243323738679,Brooklyn,Bushwick,Private room,79.0,30,14
8,8,1004979479809147241,Queens,Ridgewood,Private room,45.0,30,0
9,9,969035784970111373,Brooklyn,Bedford-Stuyvesant,Entire home/apt,225.0,30,5


In [4]:
df.drop(columns=['Unnamed: 0'], inplace=True)
df.head()

Unnamed: 0,rental_id,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews
0,1189243425411300671,Manhattan,Murray Hill,Entire home/apt,58.0,30,1
1,651593916026998398,Brooklyn,Flatlands,Private room,80.0,30,0
2,310325,Manhattan,Harlem,Private room,75.0,30,31
3,572612125615500056,Brooklyn,Sunset Park,Private room,45.0,30,6
4,1020282701018874374,Brooklyn,Bedford-Stuyvesant,Private room,47.0,30,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22790 entries, 0 to 22789
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   rental_id            22790 non-null  int64  
 1   neighbourhood_group  22790 non-null  object 
 2   neighbourhood        22790 non-null  object 
 3   room_type            22790 non-null  object 
 4   price                22790 non-null  float64
 5   minimum_nights       22790 non-null  int64  
 6   number_of_reviews    22790 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 1.2+ MB


In [6]:
# get the min and max of the prices
df.price.describe()

count     22790.000000
mean        232.891795
std        1019.738134
min           8.000000
25%          85.000000
50%         147.000000
75%         250.000000
max      100000.000000
Name: price, dtype: float64

In [7]:
def recommendation(closest_rentals_length, neighbourhood_group, room_type, price):

    final_df = df.copy()
    
    # Target column is going to be predicting the price if the other information is given.
    meta_cols = ['rental_id', 'neighbourhood_group', 'room_type']
    feature_cols = ['neighbourhood', 'price', 'minimum_nights', 'number_of_reviews']

    # Preprocessing for numeric features: fill missing values with the mean and standardize the values
    numeric_features = ['price', 'minimum_nights', 'number_of_reviews']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with the mean
        ('scaler', StandardScaler())])  # Standardize features

    
    # Preprocessing for categorical features: fill missing values and apply one-hot encoding
    categorical_features = ['neighbourhood']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent', missing_values=pd.NA)),  # Fill missing values
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])  # Apply one-hot encoding

    # Combine all preprocessing steps into a single preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),  # Numeric features preprocessing
            ('cat', categorical_transformer, categorical_features)])  # Categorical features preprocessing

    # get the target location id
    rental_id = final_df.loc[(final_df.neighbourhood_group == neighbourhood_group) & (final_df.room_type == room_type)]\
        .sort_values(by="price", ascending=False).rental_id.values[0]
    
    # Prepare the feature matrix for NN
    X = final_df.loc[:, feature_cols]

    # apply preprocessing
    preprocessor.fit(X)
    X_preprocessed = preprocessor.transform(X)

    # initialize NN
    k = closest_rentals_length
    model1 = NearestNeighbors(n_neighbors=k, metric="cosine")

    # Fit the Nearest Neighbors model to the preprocessed data
    model1.fit(X_preprocessed)

    # Step 7: Extract features of the target track
    rental_features = df.loc[df.rental_id == rental_id, feature_cols]  # Get the feature vector for the target track
    rental_features_preprocessed = preprocessor.transform(rental_features) 

    # Step 8: Find the nearest neighbors (songs most similar to the target track)
    distances, indices = model1.kneighbors(rental_features_preprocessed)  # Get distances and indices of neighbors

    # Step 9: Retrieve the metadata of the recommended tracks
    rentals = final_df.iloc[indices[0]]  # Select tracks corresponding to the nearest neighbors
    rentals["distance"] = distances[0]  # Add the distance of each neighbor as a new column

    # Step 10: Filter the columns for the final output
    cols = rentals.columns # you can explicitly choose to return specific columns here
    rentals = rentals.loc[:, cols]  # Keep the relevant columns
    rentals = rentals.sort_values(by="distance")  # Sort the rentals by their distance (most similar first)

    # Step 11: Return the recommended tracks as a list of dictionaries
    return rentals.to_dict(orient="records")

In [8]:
# User input closest_rentals_length, neighbourhood_group, room_type, price
closest_rentals_length = 5
neighbourhood_group = "Brooklyn"
room_type = "Entire home/apt"
price = 100

response = recommendation(closest_rentals_length, neighbourhood_group, room_type, price) # used for the flask app, returns JSON

# to test
pd.DataFrame(response)

Unnamed: 0,rental_id,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews,distance
0,1208640642572493071,Brooklyn,Crown Heights,Entire home/apt,8403.0,30,0,1.110223e-16
1,1176853764246777285,Brooklyn,Crown Heights,Private room,6000.0,30,0,0.001585478
2,605115521796576121,Brooklyn,Bedford-Stuyvesant,Private room,100000.0,30,9,0.009353484
3,17160286,Brooklyn,Bedford-Stuyvesant,Private room,100000.0,30,29,0.009512927
4,1004416200240866150,Manhattan,Upper East Side,Entire home/apt,15618.0,31,0,0.01022399


In [9]:
# pickle it!
with open('df.pkl', 'wb') as file:
     pickle.dump(df, file)

In [10]:
pd.__version__

'2.2.2'

In [11]:
import numpy as np

np.__version__

'1.26.4'