In [48]:
import pandas as pd # dataframe manipulation
import numpy as np # linear algebra
from sentence_transformers import SentenceTransformer


In [49]:
# Read the cleaned dataset into `df`; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer="../../cleaned_data_v2.csv")


In [50]:
# Collapse the bedroom count into three buckets: '<3', '3-4' and '>4'.
# np.select evaluates both conditions on the whole column at once instead of
# calling a Python lambda once per row. The first matching condition wins, and
# any row matching neither (3 or 4 bedrooms, or a missing value) falls through
# to the '3-4' default -- the same outcome the per-row conditional produced.
df['bedrooms_cat'] = np.select(
    [df['bedrooms'] > 4, df['bedrooms'] < 3],
    ['>4', '<3'],
    default='3-4',
)
# Show how many rows landed in each bucket.
df['bedrooms_cat'].value_counts()


bedrooms_cat
3-4    16706
<3      2972
>4      1935
Name: count, dtype: int64

In [51]:
# Display the column index of df to check which fields are available
# for building the text description further down.
df.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_lot', 'floors', 'waterfront',
       'view', 'condition', 'yr_built', 'yr_renovated', 'zipcode', 'lat',
       'long', 'nearest_station_distance_km', 'bathroom_category', 'density',
       'commute_time', 'distance_to_point_km', 'is_near_shore', 'grade_living',
       'bedrooms_cat'],
      dtype='object')

In [52]:
# Bucket lot size and living grade into equal-frequency (quantile) bins.
# labels=False makes qcut return integer bin codes (0 .. N_QUANTILE_BINS - 1)
# rather than interval labels.
# NOTE: the previous comment said "4 categories", but the code uses q=3; the
# comment was wrong, so the bin count is kept at 3 and named once here.
N_QUANTILE_BINS = 3
df['lot_size_category'] = pd.qcut(df['sqft_lot'], q=N_QUANTILE_BINS, labels=False)
df['grade_living_category'] = pd.qcut(df['grade_living'], q=N_QUANTILE_BINS, labels=False)

In [None]:
# def compile_text(x):
#     text =  f"""Bedrooms: {x['bedrooms_cat']}, 
#                 Lot Size (sqft): {x['lot_size_category']}, 
#                 Floors: {x['floors']}, 
#                 View: {x['view']}, 
#                 Condition: {x['condition']}, 
#                 Bathroom Category: {x['bathroom_category']}, 
#                 Near Shore: {x['is_near_shore']}, 
#                 Grade Living: {x['grade_living_category']}
#             """
#     return text

In [53]:
def compile_text(x):
    """Render one record as a labelled, comma-separated text description.

    Args:
        x: Any mapping-like object (e.g. a DataFrame row) that supports
            ``x[key]`` for the eight column names listed below.

    Returns:
        str: ``"Label: value"`` pairs in a fixed order. Pairs are separated by
        a comma, a space, a newline and 16 spaces of indentation, and the text
        ends with a newline followed by 12 spaces -- the exact layout of the
        triple-quoted template this function was originally written with.
    """
    # (label shown in the text, column the value is read from), in output order.
    labelled_columns = (
        ("Bedrooms", "bedrooms_cat"),
        ("Lot Size (sqft)", "lot_size_category"),
        ("Floors", "floors"),
        ("View", "view"),
        ("Condition", "condition"),
        ("Bathroom Category", "bathroom_category"),
        ("Near Shore", "is_near_shore"),
        ("Grade Living", "grade_living_category"),
    )
    separator = ", \n" + " " * 16
    trailer = "\n" + " " * 12
    parts = [f"{label}: {x[column]}" for label, column in labelled_columns]
    return separator.join(parts) + trailer

In [54]:
# Build one text description per row of df (row-wise apply) and collect the
# results in a plain Python list for the encoder.
sentences = df.apply(compile_text, axis=1).tolist()

In [55]:
# Instantiate the sentence-embedding model from the paraphrase-MiniLM-L6-v2 checkpoint.
model = SentenceTransformer(
    model_name_or_path="sentence-transformers/paraphrase-MiniLM-L6-v2"
)



In [56]:
# Encode every sentence into an embedding vector. normalize_embeddings=True asks
# the encoder for L2-normalised vectors; the progress bar reports batch progress.
output = model.encode(
    sentences=sentences,
    normalize_embeddings=True,
    show_progress_bar=True,
)

Batches: 100%|██████████| 676/676 [00:19<00:00, 34.06it/s]


In [57]:
# Wrap the encoder output in a DataFrame (one row per sentence, one
# integer-named column per embedding dimension) and display it.
df_embedding = pd.DataFrame(data=output)
df_embedding

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,0.083415,-0.043842,0.030130,0.042729,-0.087052,-0.036448,-0.080543,-0.029080,0.021026,0.042861,...,0.021386,0.044720,-0.022564,0.014286,0.047197,0.014295,0.039348,0.000774,-0.092719,0.009645
1,0.103902,-0.019842,0.025979,0.044678,-0.076424,-0.049235,-0.048746,-0.036134,0.000537,0.017253,...,0.002660,0.030928,-0.009383,0.031572,0.050133,0.002436,0.050265,0.019036,-0.117506,0.041689
2,0.066573,-0.023088,0.028709,0.045089,-0.077509,-0.037669,-0.077752,-0.020398,0.022131,0.043428,...,0.001752,0.031420,-0.012794,0.024941,0.040449,0.021398,0.065275,0.006967,-0.102013,0.005966
3,0.109714,-0.018992,0.033080,0.056286,-0.056981,-0.046393,-0.049905,-0.042970,0.002292,0.015858,...,0.003203,0.030105,-0.002118,0.012862,0.065224,-0.003033,0.050646,0.015464,-0.105850,0.031129
4,0.094758,-0.020122,0.026913,0.045099,-0.072525,-0.047713,-0.036134,-0.031723,0.009729,0.017037,...,-0.000315,0.037115,-0.007067,0.034082,0.053776,0.003627,0.049419,0.015587,-0.109404,0.043676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,0.091082,-0.029066,0.032570,0.043765,-0.070951,-0.046719,-0.055408,-0.042804,0.003559,0.016659,...,0.005334,0.030368,-0.010702,0.035044,0.053588,0.006285,0.062793,0.008726,-0.103009,0.043091
21609,0.096709,-0.022656,0.031218,0.046665,-0.075145,-0.047758,-0.058296,-0.041461,0.000136,0.017653,...,0.004957,0.027127,-0.013237,0.033311,0.052301,0.006108,0.055514,0.012874,-0.113357,0.038299
21610,0.056375,-0.027348,0.029423,0.043862,-0.078838,-0.039539,-0.083118,-0.021382,0.024164,0.045222,...,0.007548,0.030881,-0.016934,0.025283,0.048411,0.021165,0.061967,0.006160,-0.098468,0.002626
21611,0.093237,-0.025465,0.031772,0.045884,-0.071689,-0.046368,-0.054520,-0.040836,0.003308,0.015996,...,0.003893,0.028596,-0.011686,0.035536,0.051982,0.006051,0.057983,0.009918,-0.107818,0.040868


In [58]:
# Write the embeddings to CSV in the current working directory; index=False
# leaves the row index out of the file.
df_embedding.to_csv(path_or_buf="embedding_train.csv", index=False)
