In [1]:
#package imports
from datetime import datetime
from sklearn.preprocessing import QuantileTransformer
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import joblib
from pgmpy.estimators import MaximumLikelihoodEstimator 
from pgmpy.estimators import BayesianEstimator
from pgmpy.models import BayesianNetwork
from pgmpy.inference import VariableElimination
#from utils import prepare_bayesian_features

%matplotlib inline



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset from data.csv (path is relative to the notebook's working directory).
df_og = pd.read_csv('data.csv')

In [3]:
# Convert the 'sqfts' and 'price' text columns of df_og to numbers.
#
# Remove only a literal trailing "ft2" unit. str.rstrip('ft2') would strip any
# trailing run of the characters 'f', 't' and '2', so "1202ft2" would become "120".
df_og['sqfts'] = pd.to_numeric(df_og['sqfts'].str.replace(r'\s*ft2$', '', regex=True))

# regex=False makes ',' and '$' literal characters regardless of the pandas
# version's default; interpreted as a regex, '$' is the end-of-string anchor.
df_og['price'] = (
    df_og['price']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)

In [4]:
# Work on a copy so that later changes to df do not modify df_og.
df = df_og.copy()

In [5]:
# Fill missing values in every column with that column's most frequent value.
for col in df.columns:
    # Series.mode() ignores NaN by default, so its first entry is never NaN.
    # (The previous `mode()[0] == np.NaN` test could never be True because
    # NaN != NaN, and np.NaN no longer exists in NumPy 2.0.)
    modes = df[col].mode()
    if modes.empty:
        # Column holds no non-missing values: there is nothing to impute with.
        continue
    df[col] = df[col].fillna(modes.iloc[0])

In [6]:
# Upper caps applied to the 'price' and 'sqfts' columns of df.
max_price_cap = 7000
max_sqfts_cap = 1500

# Values above the cap are replaced by the cap; lower values are left unchanged.
for capped_column, cap in (('price', max_price_cap), ('sqfts', max_sqfts_cap)):
    df[capped_column] = df[capped_column].clip(upper=cap)

In [7]:
# Store 'bedrooms' as a categorical column, then expose it under the name 'rooms'.
df['bedrooms'] = df['bedrooms'].astype('category')
df.rename(columns={"bedrooms": "rooms"}, inplace=True)

In [8]:
# Average price per locality, computed once and reused for both extremes.
locality_mean_prices = df.groupby('locality')['price'].mean()
min_mean_price = locality_mean_prices.min()
max_mean_price = locality_mean_prices.max()
print(min_mean_price)
print(max_mean_price)

# Attach each row's locality-level average price as a new column.
df['mean_price'] = df.groupby('locality')['price'].transform('mean')

1650.0
7000.0


In [9]:
# Derive a 'quality' score for each row from its locality's mean price.
# QuantileTransformer is already imported in the notebook's import cell.

# Map 'mean_price' onto a uniform distribution.
transformer = QuantileTransformer(output_distribution='uniform')
uniform_rank = pd.Series(
    transformer.fit_transform(df[['mean_price']]).ravel(),
    index=df.index,
)

# Min-max rescale the transformed values to [new_min_scale, new_max_scale].
new_min_scale = 0
new_max_scale = 5
rank_low = uniform_rank.min()
rank_high = uniform_rank.max()
df['scaled_quality'] = (uniform_rank - rank_low) * (new_max_scale - new_min_scale) / (rank_high - rank_low) + new_min_scale

# 'quality' is the rescaled value rounded to a fixed number of decimals;
# the unrounded 'scaled_quality' column is kept alongside it.
decimal_places = 2
df['quality'] = df['scaled_quality'].round(decimal_places)

In [10]:
# Preview the first rows of df.
df.head()

Unnamed: 0,price,rooms,sqfts,longitude,name,latitude,bathrooms_level_two,streetAddress,country,locality,postal,region,housing_type,links,sauna,pool,steam,mean_price,scaled_quality,quality
0,3000.0,1.0,784.0,-122.812341,White Rock Condo across from ocean,49.023295,1.0,1160 Oxford St,CA,White Rock,V4B3P5,BC,Apartment,https://vancouver.craigslist.org/rds/apa/d/whi...,False,True,True,2975.526749,0.202703,0.2
1,3495.0,2.0,1000.0,-123.032026,NORTH SHORE LIVING - Best Floor Plan - 2bd/2bt...,49.312385,2.0,1371 Blackwood Street,CA,North Vancouver,V7J2L5,BC,Apartment,https://vancouver.craigslist.org/nvn/apa/d/nor...,True,True,True,3004.69589,0.633133,0.63
2,5300.0,1.0,733.0,-123.078688,"- --(OBsold.com)-- Stunning Furnished 1bd+den,...",49.310206,1.0,1371 Blackwood Street,CA,North Vancouver,V7L0E3,BC,Apartment,https://vancouver.craigslist.org/nvn/apa/d/nor...,True,True,True,3004.69589,0.633133,0.63
3,4500.0,3.0,1500.0,-123.0979,Vancouver West Townhouse three (3) Bedroom,49.2156,4.0,1371 Blackwood Street,CA,Vancouver,V5X,BC,Apartment,https://vancouver.craigslist.org/van/apa/d/van...,False,True,True,3082.236802,2.484985,2.48
4,1900.0,1.0,1000.0,-123.055115,Apartment for rent,49.074691,1.0,Scott Road,CA,Delta,V4K3X7,BC,Apartment,https://vancouver.craigslist.org/rds/apa/d/del...,False,True,True,3138.094118,4.421922,4.42


# Modelling

In [11]:
def prepare_bayesian_features(key, value):
    """Discretise a raw feature value into an ordinal bucket number.

    Args:
        key: Feature name. "Rooms", "Sqft", "Quality" and "Price" (and their
            "Search"-prefixed variants) are bucketed; any other key is not.
        value: Raw numeric value of the feature.

    Returns:
        The 1-based bucket whose inclusive upper bound is the first one that
        `value` does not exceed, or one past the last bound when `value`
        exceeds them all (this is also the result for NaN, since every
        comparison with NaN is false). For an unrecognised key, `value` is
        returned unchanged.
    """
    # (feature names, inclusive upper bound of each bucket except the last)
    bucket_table = (
        (("SearchRooms", "Rooms"), (2, 3)),
        (("SearchSqft", "Sqft"), (600, 1100, 2000)),
        (("SearchQuality", "Quality"), (3, 4.1)),
        (("SearchPrice", "Price"), (1500, 2500, 4500)),
    )

    for feature_names, upper_bounds in bucket_table:
        if key not in feature_names:
            continue
        for bucket, upper in enumerate(upper_bounds, start=1):
            if value <= upper:
                return bucket
        # Larger than every bound: falls into the open-ended top bucket.
        return len(upper_bounds) + 1

    return value

In [12]:
# Build the training table for the Bayesian network: one synthetic "search"
# (randomly drawn preferences) paired with each listing in df.

# Fixed seed so the synthetic searches, and everything fitted on them, are
# reproducible across kernel restarts.
SEED = 42
rng = np.random.default_rng(SEED)

# One synthetic search per listing. Deriving N from df (instead of hard-coding
# the row count) keeps the two tables aligned if the CSV changes.
N = len(df)

# Probability of each search level 1..7, per feature.
room_distribution_industry = [0.1, 0.15, 0.2, 0.2, 0.15, 0.1, 0.1]
price_distribution_industry = [0.05, 0.1, 0.15, 0.2, 0.2, 0.15, 0.15]
sqft_distribution_industry = [0.1, 0.15, 0.2, 0.2, 0.15, 0.1, 0.1]
quality_distribution_industry = [0.05, 0.15, 0.2, 0.2, 0.2, 0.15, 0.05]

search_levels = np.arange(1, 8)

# Sharing df's index guarantees the listing columns assigned below line up
# row-for-row with the synthetic search columns.
df_model = pd.DataFrame(
    {
        'SearchRooms': rng.choice(search_levels, p=room_distribution_industry, size=N),
        'SearchSqft': rng.choice(search_levels, p=sqft_distribution_industry, size=N),
        'SearchQuality': rng.choice(search_levels, p=quality_distribution_industry, size=N),
        'SearchPrice': rng.choice(search_levels, p=price_distribution_industry, size=N),
    },
    index=df.index,
)

# Discretise the listing features into ordinal buckets.
listing_sources = {'Rooms': 'rooms', 'Price': 'price', 'Quality': 'quality', 'Sqft': 'sqfts'}
for feature, source_column in listing_sources.items():
    buckets = df[source_column].apply(
        lambda x, feature=feature: prepare_bayesian_features(feature, x)
    )
    # 'rooms' is categorical, so apply() may hand back a categorical result;
    # cast to int so the arithmetic below is always defined.
    df_model[feature] = buckets.astype(int)

columns = ['Rooms', 'Sqft', 'Quality', 'Price']

# Total absolute gap between what was searched for and what the listing offers.
differences = pd.Series(0.0, index=df_model.index)
for col in columns:
    flat_col = col
    user_col = 'Search' + col
    differences += (df_model[flat_col] - df_model[user_col]).abs()

# Scale differences to create a score between 1 and 7. We reverse it because
# smaller differences mean higher preference.
difference_spread = differences.max() - differences.min()
assert difference_spread > 0, "need at least two distinct difference values to scale UserPref"
df_model['UserPref'] = 7 - np.ceil(6 * (differences - differences.min()) / difference_spread)

# Sample industry values, replace these with your actual values.
# Each is written as a constant column of df_model.
industry_average_values = {
    'IndustryRooms': 2,
    'IndustrySqft': 1200,
    'IndustryQuality': 3,
    'IndustryPrice': 3000
}

for column, value in industry_average_values.items():
    df_model[column] = value

In [13]:
# Preview the first rows of df_model.
df_model.head()

Unnamed: 0,SearchRooms,SearchSqft,SearchQuality,SearchPrice,Rooms,Price,Quality,Sqft,UserPref,IndustryRooms,IndustrySqft,IndustryQuality,IndustryPrice
0,4,1,2,4,1,3,1,2,5.0,2,1200,3,3000
1,1,7,5,4,1,3,1,2,4.0,2,1200,3,3000
2,1,2,4,3,1,4,1,2,5.0,2,1200,3,3000
3,2,4,6,5,2,3,1,3,4.0,2,1200,3,3000
4,5,7,1,4,1,2,3,2,3.0,2,1200,3,3000


In [14]:
# Network structure, built per feature in this edge order:
#   Search<F>   -> UserPref
#   Industry<F> -> <F>
#   <F>         -> UserPref
model_features = ['Rooms', 'Sqft', 'Quality', 'Price']
model_edges = (
    [('Search' + feature, 'UserPref') for feature in model_features]
    + [('Industry' + feature, feature) for feature in model_features]
    + [(feature, 'UserPref') for feature in model_features]
)
flat_model = BayesianNetwork(model_edges)

# Estimate one CPD per node from df_model.
# Note: despite its name, `mle` is a BayesianEstimator.
mle = BayesianEstimator(flat_model, df_model)
cpds = []
for node in flat_model.nodes():
    cpds.append(mle.estimate_cpd(node))

# Attach the CPDs and fail fast if the resulting model is inconsistent.
flat_model.add_cpds(*cpds)
assert flat_model.check_model()

# Inference engine used by the later cells to query UserPref.
inference = VariableElimination(flat_model)


In [15]:
def preference_score(probabilities):
    """Collapse a UserPref distribution into a single weighted score.

    Args:
        probabilities: Sequence of probabilities, one per UserPref level, in
            ascending level order. Entries are paired positionally with the
            weights; anything beyond the seventh entry is ignored.

    Returns:
        The sum of probability * weight, where the weights double at each
        level: 1, 2, 4, 8, 16, 32, 64.
    """
    # Exponential weights so higher preference levels dominate the score.
    level_weights = [2 ** exponent for exponent in range(7)]

    total = 0
    for prob, weight in zip(probabilities, level_weights):
        total += prob * weight
    return total

In [16]:
import random

def random_evidence():
    """Draw one random evidence scenario for the search variables.

    Returns:
        A dict with the keys 'SearchRooms', 'SearchSqft', 'SearchQuality' and
        'SearchPrice' (in that order), each mapped to an independent random
        integer from 1 to 3 inclusive.
    """
    search_variables = ('SearchRooms', 'SearchSqft', 'SearchQuality', 'SearchPrice')
    return {variable: random.randint(1, 3) for variable in search_variables}

# How many random scenarios to try.
num_scenarios = 100

# A scenario is kept only if its score differs from every kept score by more than this.
threshold = 0.05

previous_scores = []
distinct_evidence_sets = []

for _ in range(num_scenarios):
    evidence = random_evidence()
    result = inference.query(variables=['UserPref'], evidence=evidence)
    score = preference_score(result.values)

    # Reject the scenario as soon as one earlier score is not far enough away.
    for prev_score in previous_scores:
        if not abs(prev_score - score) > threshold:
            break
    else:
        # No earlier score was within the threshold: keep this scenario.
        distinct_evidence_sets.append(evidence)
        previous_scores.append(score)

print(len(distinct_evidence_sets))


23


# Pickle Data

In [17]:
# Persist the fitted network to disk with joblib.
# NOTE(review): joblib files are pickle-based; only load this file from a trusted source.
import joblib
joblib.dump(flat_model, "bayesian_model_userpref.pkl")  

['bayesian_model_userpref.pkl']