In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("appartments.csv").drop(22)

In [3]:
df.head()

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities
0,Smartworld One DXP,"2, 3, 4 BHK Apartment in Sector 113, Gurgaon","['Bajghera Road', 'Palam Vihar Halt', 'DPSG Pa...","{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...",https://www.99acres.com/smartworld-one-dxp-sec...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Salon', 'Restaurant', 'Spa'..."
1,M3M Crown,"3, 4 BHK Apartment in Sector 111, Gurgaon","['DPSG Palam Vihar Gurugram', 'The NorthCap Un...","{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...",https://www.99acres.com/m3m-crown-sector-111-g...,"{'3 BHK': {'building_type': 'Apartment', 'area...","['Bowling Alley', 'Mini Theatre', 'Manicured G..."
2,Adani Brahma Samsara Vilasa,"Land, 3, 4 BHK Independent Floor in Sector 63,...","['AIPL Business Club Sector 62', 'Heritage Xpe...","{'AIPL Business Club Sector 62': '2.7 Km', 'He...",https://www.99acres.com/adani-brahma-samsara-v...,{'3 BHK': {'building_type': 'Independent Floor...,"['Terrace Garden', 'Gazebo', 'Fountain', 'Amph..."
3,Sobha City,"2, 3, 4 BHK Apartment in Sector 108, Gurgaon","['The Shikshiyan School', 'WTC Plaza', 'Luxus ...","{'The Shikshiyan School': '2.9 KM', 'WTC Plaza...",https://www.99acres.com/sobha-city-sector-108-...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Volley Ball Court', 'Aerobi..."
4,Signature Global City 93,"2, 3 BHK Independent Floor in Sector 93 Gurgaon","['Pranavananda Int. School', 'DLF Site central...","{'Pranavananda Int. School': '450 m', 'DLF Sit...",https://www.99acres.com/signature-global-city-...,{'2 BHK': {'building_type': 'Independent Floor...,"['Mini Theatre', 'Doctor on Call', 'Concierge ..."


In [4]:
df.shape

(246, 7)

In [5]:
df.columns

Index(['PropertyName', 'PropertySubName', 'NearbyLocations',
       'LocationAdvantages', 'Link', 'PriceDetails', 'TopFacilities'],
      dtype='object')

In [6]:
df.iloc[1].NearbyLocations

"['DPSG Palam Vihar Gurugram', 'The NorthCap University', 'Park Hospital, Palam Vihar', 'Pacific D21 Mall', 'Palam Vihar Halt Railway Station']"

In [7]:
df.iloc[1].LocationAdvantages	

"{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The NorthCap University': '4.4 Km', 'Park Hospital, Palam Vihar': '1.4 Km', 'Pacific D21 Mall': '8.2 Km', 'Palam Vihar Halt Railway Station': '1.2 Km', 'Dwarka Sector 21 Metro Station': '8.1 Km', 'Dwarka Expressway': '450 m', 'Fun N Food Water Park': '8.1 Km', 'Indira Gandhi International Airport': '14.1 Km', 'Tau DeviLal Sports Complex': '11.2 Km', 'Hamoni Golf Camp': '5 Km', 'Hyatt Place': '6.1 Km', 'Altrade Business Centre': '11.2 Km'}"

In [8]:
df.iloc[1].PriceDetails

"{'3 BHK': {'building_type': 'Apartment', 'area_type': 'Super Built-up Area', 'area': '1,605 - 2,170 sq.ft.', 'price-range': '₹ 2.2 - 3.03 Cr'}, '4 BHK': {'building_type': 'Apartment', 'area_type': 'Super Built-up Area', 'area': '2,248 - 2,670 sq.ft.', 'price-range': '₹ 3.08 - 3.73 Cr'}}"

In [9]:
df.iloc[1].TopFacilities

"['Bowling Alley', 'Mini Theatre', 'Manicured Garden', 'Swimming Pool', 'Flower Garden', 'Reading Lounge', 'Golf Course', 'Barbecue', 'Sauna']"

# Top Facilities recommender system code

In [10]:
# Convert intno simple list
def extract_list(s):
    return re.findall(r"'(.*?)'",s)

df['TopFacilities'] = df['TopFacilities'].apply(extract_list)

In [11]:
df.head()

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities
0,Smartworld One DXP,"2, 3, 4 BHK Apartment in Sector 113, Gurgaon","['Bajghera Road', 'Palam Vihar Halt', 'DPSG Pa...","{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...",https://www.99acres.com/smartworld-one-dxp-sec...,"{'2 BHK': {'building_type': 'Apartment', 'area...","[Swimming Pool, Salon, Restaurant, Spa, Cafete..."
1,M3M Crown,"3, 4 BHK Apartment in Sector 111, Gurgaon","['DPSG Palam Vihar Gurugram', 'The NorthCap Un...","{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...",https://www.99acres.com/m3m-crown-sector-111-g...,"{'3 BHK': {'building_type': 'Apartment', 'area...","[Bowling Alley, Mini Theatre, Manicured Garden..."
2,Adani Brahma Samsara Vilasa,"Land, 3, 4 BHK Independent Floor in Sector 63,...","['AIPL Business Club Sector 62', 'Heritage Xpe...","{'AIPL Business Club Sector 62': '2.7 Km', 'He...",https://www.99acres.com/adani-brahma-samsara-v...,{'3 BHK': {'building_type': 'Independent Floor...,"[Terrace Garden, Gazebo, Fountain, Amphitheatr..."
3,Sobha City,"2, 3, 4 BHK Apartment in Sector 108, Gurgaon","['The Shikshiyan School', 'WTC Plaza', 'Luxus ...","{'The Shikshiyan School': '2.9 KM', 'WTC Plaza...",https://www.99acres.com/sobha-city-sector-108-...,"{'2 BHK': {'building_type': 'Apartment', 'area...","[Swimming Pool, Volley Ball Court, Aerobics Ce..."
4,Signature Global City 93,"2, 3 BHK Independent Floor in Sector 93 Gurgaon","['Pranavananda Int. School', 'DLF Site central...","{'Pranavananda Int. School': '450 m', 'DLF Sit...",https://www.99acres.com/signature-global-city-...,{'2 BHK': {'building_type': 'Independent Floor...,"[Mini Theatre, Doctor on Call, Concierge Servi..."


In [12]:
# Convert list into string
df['FacilitiesStr'] = df['TopFacilities'].apply(' '.join)

In [13]:
df.head()

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities,FacilitiesStr
0,Smartworld One DXP,"2, 3, 4 BHK Apartment in Sector 113, Gurgaon","['Bajghera Road', 'Palam Vihar Halt', 'DPSG Pa...","{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...",https://www.99acres.com/smartworld-one-dxp-sec...,"{'2 BHK': {'building_type': 'Apartment', 'area...","[Swimming Pool, Salon, Restaurant, Spa, Cafete...",Swimming Pool Salon Restaurant Spa Cafeteria S...
1,M3M Crown,"3, 4 BHK Apartment in Sector 111, Gurgaon","['DPSG Palam Vihar Gurugram', 'The NorthCap Un...","{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...",https://www.99acres.com/m3m-crown-sector-111-g...,"{'3 BHK': {'building_type': 'Apartment', 'area...","[Bowling Alley, Mini Theatre, Manicured Garden...",Bowling Alley Mini Theatre Manicured Garden Sw...
2,Adani Brahma Samsara Vilasa,"Land, 3, 4 BHK Independent Floor in Sector 63,...","['AIPL Business Club Sector 62', 'Heritage Xpe...","{'AIPL Business Club Sector 62': '2.7 Km', 'He...",https://www.99acres.com/adani-brahma-samsara-v...,{'3 BHK': {'building_type': 'Independent Floor...,"[Terrace Garden, Gazebo, Fountain, Amphitheatr...",Terrace Garden Gazebo Fountain Amphitheatre Pa...
3,Sobha City,"2, 3, 4 BHK Apartment in Sector 108, Gurgaon","['The Shikshiyan School', 'WTC Plaza', 'Luxus ...","{'The Shikshiyan School': '2.9 KM', 'WTC Plaza...",https://www.99acres.com/sobha-city-sector-108-...,"{'2 BHK': {'building_type': 'Apartment', 'area...","[Swimming Pool, Volley Ball Court, Aerobics Ce...",Swimming Pool Volley Ball Court Aerobics Centr...
4,Signature Global City 93,"2, 3 BHK Independent Floor in Sector 93 Gurgaon","['Pranavananda Int. School', 'DLF Site central...","{'Pranavananda Int. School': '450 m', 'DLF Sit...",https://www.99acres.com/signature-global-city-...,{'2 BHK': {'building_type': 'Independent Floor...,"[Mini Theatre, Doctor on Call, Concierge Servi...",Mini Theatre Doctor on Call Concierge Service ...


#### TfidfVectorizer works to convert text data into numerical data so, that we can use input for machine learning algorithms

In [14]:
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', ngram_range = (1,2))

In [15]:
tfidf_matrix = tfidf_vectorizer.fit_transform(df['FacilitiesStr'])

In [16]:
tfidf_matrix.toarray().shape

(246, 953)

In [17]:
cosine_sim1 = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [18]:
cosine_sim1.shape

(246, 246)

In [19]:
cosine_sim1

array([[1.        , 0.01095159, 0.        , ..., 0.01183329, 0.08656385,
        0.0110727 ],
       [0.01095159, 1.        , 0.01982121, ..., 0.11904241, 0.01555534,
        0.00963852],
       [0.        , 0.01982121, 1.        , ..., 0.07020502, 0.03820314,
        0.01962826],
       ...,
       [0.01183329, 0.11904241, 0.07020502, ..., 1.        , 0.09825738,
        0.03255851],
       [0.08656385, 0.01555534, 0.03820314, ..., 0.09825738, 1.        ,
        0.06257614],
       [0.0110727 , 0.00963852, 0.01962826, ..., 0.03255851, 0.06257614,
        1.        ]])

In [20]:
def recommend_properties(property_name):

    # Get index of the property that matches the name
    idx = df.index[df['PropertyName'] == property_name].tolist()[0]

    # Get pairwise similarity scores with that property
    sim_scores = list(enumerate(cosine_sim1[idx]))

    # Sort the properties based on the similarity scores
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)

    # Get the scores of the 10 most similar properties 
    sim_scores = sim_scores[1:6]

    # Get the property indicies 
    property_indicies = [i[0] for i in sim_scores]

    recommendations_df = pd.DataFrame({
        'PropertyName' : df['PropertyName'].iloc[property_indicies],
        'SimilarityScore' : sim_scores
    })

    # Return the top 10 most similar properties 
    return recommendations_df

In [21]:
recommend_properties("DLF The Arbour")

Unnamed: 0,PropertyName,SimilarityScore
64,Ace Palm Floors,"(63, 0.4529382062441955)"
217,Yashika 104,"(216, 0.4199606322926784)"
93,JMS The Nation,"(92, 0.4166584649363288)"
154,India Rashtra,"(153, 0.398954234680194)"
0,Smartworld One DXP,"(0, 0.38885046199432893)"


# Price details recommender system code 

In [22]:
import pandas as pd
import json

# Load dataset
appartments_df  = pd.read_csv('appartments.csv').drop(22)

# Function to parse and extract the required features from the PriceDetails column 
def refined_parse_modified(detail_str):
    try:
        details = json.loads(detail_str.replace("'", "\""))
    except:
        return {}

    extracted = {}
    for i, j in details.items():
        
        # Extract building type
        extracted[f'building type_{i}'] = j.get('building_type')

        # Parsing area details
        area = j.get('area', '')
        area_parts = area.split('-')
        if len(area_parts) == 1:
            try:
                value = float(area_parts[0].replace(',', '').replace(' sq.ft.', '').strip())
                extracted[f'area low {i}'] = value
                extracted[f'area high {i}'] = value
            except:
                extracted[f'area low {i}'] = None
                extracted[f'area high {i}'] = None
        elif len(area_parts) == 2:
            try:
                extracted[f'area low {i}'] = float(area_parts[0].replace(',', '').replace(' sq.ft.', '').strip())
                extracted[f'area high {i}'] = float(area_parts[1].replace(',', '').replace(' sq.ft.', '').strip())
            except:
                extracted[f'area low {i}'] = None
                extracted[f'area high {i}'] = None


        # Parsing price details
        price_range = j.get('price-range', '')
        price_parts = price_range.split('-')
        if len(price_parts) == 2:
            try:
                extracted[f'price low {i}'] = float(price_parts[0].replace('₹', '').replace(' Cr', '').replace(' L', '').strip())
                extracted[f'price high {i}'] = float(price_parts[1].replace('₹', '').replace(' Cr', '').replace(' L', '').strip())
                if 'L' in price_parts[0]:
                    extracted[f'price low {i}'] /= 100
                if 'L' in price_parts[1]:
                    extracted[f'price high {i}'] /= 100
            except:
                extracted[f'price low {i}'] = None
                extracted[f'price high {i}'] = None

    return extracted

# Apply the refined parsing and generate the new DataFrame structure 
data_refined = []

for _, row in appartments_df.iterrows():
    features = refined_parse_modified(row['PriceDetails'])

    # Construct a new row for the transformed dataframe 
    new_row = {'PropertyName' : row['PropertyName']}

    # Populate the new row with extracted features 
    for config in ['1 BHK', '2 BHK', '3 BHK', '4 BHK', '5 BHK', '6 BHK', '1 RK', 'Land']:
        new_row[f'building type_{config}'] = features.get(f'building type_{config}')
        new_row[f'area low {config}'] = features.get(f'area low {config}')
        new_row[f'area high {config}'] = features.get(f'area high {config}')
        new_row[f'price low {config}'] = features.get(f'price low {config}')
        new_row[f'price high {config}'] = features.get(f'price high {config}')
    data_refined.append(new_row)

df_final_refined_v2 = pd.DataFrame(data_refined).set_index('PropertyName')

In [23]:
df_final_refined_v2['building type_Land'] = df_final_refined_v2['building type_Land'].replace({'':'Land'})

In [24]:
df_final_refined_v2.columns

Index(['building type_1 BHK', 'area low 1 BHK', 'area high 1 BHK',
       'price low 1 BHK', 'price high 1 BHK', 'building type_2 BHK',
       'area low 2 BHK', 'area high 2 BHK', 'price low 2 BHK',
       'price high 2 BHK', 'building type_3 BHK', 'area low 3 BHK',
       'area high 3 BHK', 'price low 3 BHK', 'price high 3 BHK',
       'building type_4 BHK', 'area low 4 BHK', 'area high 4 BHK',
       'price low 4 BHK', 'price high 4 BHK', 'building type_5 BHK',
       'area low 5 BHK', 'area high 5 BHK', 'price low 5 BHK',
       'price high 5 BHK', 'building type_6 BHK', 'area low 6 BHK',
       'area high 6 BHK', 'price low 6 BHK', 'price high 6 BHK',
       'building type_1 RK', 'area low 1 RK', 'area high 1 RK',
       'price low 1 RK', 'price high 1 RK', 'building type_Land',
       'area low Land', 'area high Land', 'price low Land', 'price high Land'],
      dtype='object')

In [25]:
categorical_columns = df_final_refined_v2.select_dtypes(include = ['object']).columns.tolist()

In [26]:
categorical_columns

['building type_1 BHK',
 'building type_2 BHK',
 'building type_3 BHK',
 'building type_4 BHK',
 'building type_5 BHK',
 'building type_6 BHK',
 'building type_1 RK',
 'building type_Land']

In [27]:
# Apply OneHot Encoding on categorical data 
OHE_df = pd.get_dummies(df_final_refined_v2, columns = categorical_columns, drop_first = True, dtype = int)

In [28]:
OHE_df.fillna(0, inplace = True)

In [29]:
OHE_df

Unnamed: 0_level_0,area low 1 BHK,area high 1 BHK,price low 1 BHK,price high 1 BHK,area low 2 BHK,area high 2 BHK,price low 2 BHK,price high 2 BHK,area low 3 BHK,area high 3 BHK,...,building type_2 BHK_Independent Floor,building type_2 BHK_Service Apartment,building type_3 BHK_Independent Floor,building type_3 BHK_Service Apartment,building type_3 BHK_Villa,building type_4 BHK_Independent Floor,building type_4 BHK_Villa,building type_5 BHK_Independent Floor,building type_5 BHK_Villa,building type_6 BHK_Villa
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,0.0,0.0,0.00,0.0000,1370.0,1370.0,2.0000,2.40,1850.0,2050.0,...,0,0,0,0,0,0,0,0,0,0
M3M Crown,0.0,0.0,0.00,0.0000,0.0,0.0,0.0000,0.00,1605.0,2170.0,...,0,0,0,0,0,0,0,0,0,0
Adani Brahma Samsara Vilasa,0.0,0.0,0.00,0.0000,0.0,0.0,0.0000,0.00,1800.0,3150.0,...,0,0,1,0,0,1,0,0,0,0
Sobha City,0.0,0.0,0.00,0.0000,1381.0,1692.0,1.5500,3.21,1711.0,2343.0,...,0,0,0,0,0,0,0,0,0,0
Signature Global City 93,0.0,0.0,0.00,0.0000,981.0,1118.0,0.9301,1.06,1235.0,1530.0,...,1,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,0.0,0.0,0.00,0.0000,964.0,964.0,0.0000,0.00,1127.0,1127.0,...,0,0,0,0,0,0,0,0,0,0
Pyramid Urban Homes 2,335.0,398.0,23.45,0.2786,500.0,625.0,0.0000,0.00,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
Satya The Hermitage,0.0,0.0,0.00,0.0000,1450.0,1450.0,0.0000,0.00,1991.0,1991.0,...,0,0,0,0,0,0,0,0,0,0
BPTP Spacio,0.0,0.0,0.00,0.0000,1000.0,1079.0,0.0000,0.00,1225.0,1865.0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

OHE_df_normalized = pd.DataFrame(scaler.fit_transform(OHE_df), columns = OHE_df.columns, index = OHE_df.index)

In [31]:
OHE_df_normalized.head()

Unnamed: 0_level_0,area low 1 BHK,area high 1 BHK,price low 1 BHK,price high 1 BHK,area low 2 BHK,area high 2 BHK,price low 2 BHK,price high 2 BHK,area low 3 BHK,area high 3 BHK,...,building type_2 BHK_Independent Floor,building type_2 BHK_Service Apartment,building type_3 BHK_Independent Floor,building type_3 BHK_Service Apartment,building type_3 BHK_Villa,building type_4 BHK_Independent Floor,building type_4 BHK_Villa,building type_5 BHK_Independent Floor,building type_5 BHK_Villa,building type_6 BHK_Villa
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,-0.252266,-0.169584,-0.105197,-0.082332,1.223499,1.020101,-0.173712,1.158423,0.553787,0.370864,...,-0.28931,-0.063888,-0.372678,-0.063888,-0.171139,-0.254824,-0.236208,-0.111111,-0.216353,-0.063888
M3M Crown,-0.252266,-0.169584,-0.105197,-0.082332,-0.893541,-0.89666,-0.283546,-0.387986,0.293086,0.472749,...,-0.28931,-0.063888,-0.372678,-0.063888,-0.171139,-0.254824,-0.236208,-0.111111,-0.216353,-0.063888
Adani Brahma Samsara Vilasa,-0.252266,-0.169584,-0.105197,-0.082332,-0.893541,-0.89666,-0.283546,-0.387986,0.500583,1.304803,...,-0.28931,-0.063888,2.683282,-0.063888,-0.171139,3.924283,-0.236208,-0.111111,-0.216353,-0.063888
Sobha City,-0.252266,-0.169584,-0.105197,-0.082332,1.240497,1.47061,-0.198425,1.680336,0.405879,0.619632,...,-0.28931,-0.063888,-0.372678,-0.063888,-0.171139,-0.254824,-0.236208,-0.111111,-0.216353,-0.063888
Signature Global City 93,-0.252266,-0.169584,-0.105197,-0.082332,0.622383,0.667529,-0.232468,0.295011,-0.100626,-0.070634,...,3.456497,-0.063888,2.683282,-0.063888,-0.171139,-0.254824,-0.236208,-0.111111,-0.216353,-0.063888


In [32]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(OHE_df_normalized)

In [33]:
cosine_sim2.shape

(246, 246)

In [34]:
def recommend_properties_with_scores(property_name, top_n = 247):

    # Get the similarity scores for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim2[OHE_df_normalized.index.get_loc(property_name)]))

    # Sort properties based on the similarity scores
    sorted_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)

    # Get the indices and scores of thee top_n most similar properties 
    top_indices = [i[0] for i in sorted_scores[1: top_n + 1]]
    top_scores = [i[1] for i in sorted_scores[1: top_n + 1]]

    # Retrieve the names of the top properties using the indices 
    top_properties = OHE_df_normalized.index[top_indices].tolist()

    # Create a dataframe with the result 
    recommendations_df = pd.DataFrame({
        'PropertyName' : top_properties,
        'SimilarityScore' : top_scores
    })

    return recommendations_df


recommend_properties_with_scores('M3M Golf Hills')

Unnamed: 0,PropertyName,SimilarityScore
0,AIPL The Peaceful Homes,0.955462
1,Smartworld One DXP,0.954670
2,Unitech Escape,0.953092
3,M3M Capital,0.951156
4,BPTP Terra,0.943128
...,...,...
240,Golden Park,-0.522391
241,Satya Merano Greens,-0.523660
242,ROF Normanton Park,-0.525129
243,BPTP Green Oaks,-0.525286


## LocationAdvantage recommender system code 

In [35]:
import pandas as pd
import ast
from fuzzywuzzy import fuzz
from sklearn.preprocessing import StandardScaler



In [36]:
# Convert meter to kilometer 
def distance_to_meter(distance_str):
    try:
        if 'Km' in distance_str or 'KM' in distance_str:
            return float(distance_str.split()[0]) * 1000
        elif 'Meter' in distance_str or 'meter' in distance_str:
            return float(distance_str.split()[0])
        else:
            return None

    except:
        return None
        

In [37]:
# Define a mapping for standardizing location names
location_mapping = {
    'Dwarka Expy': 'Dwarka Expressway',
    'Dwarka Express Way': 'Dwarka Expressway',
    'Northern Peripheral Road': 'Dwarka Expressway',
    'Indira Gandhi Intl Airport': 'IGI Airport',
    'Indira Gandhi International Airport': 'IGI Airport',
    'Delhi International Airport': 'IGI Airport',
    'Golf Course Ext Rd': 'Golf Course Extension Road',
    'Golf Course Ext Road': 'Golf Course Extension Road',
    'Badshahpur Sohna Rd Hwy': 'Sohna Road',
    'Sohna Rd': 'Sohna Road',
    'NH 48': 'Delhi Jaipur Expressway',
    'Delhi - Jaipur Expressway': 'Delhi Jaipur Expressway',
    'NH8': 'Delhi Jaipur Expressway',
    'Gurgaon Railway Station': 'Gurugram Railway Station',
    'Sector 55-56 Metro Station': 'Sector 55-56 Metro Station',
    'Sector 55/56 Metro Station': 'Sector 55-56 Metro Station',
    'Pacific D21 Mall': 'Pacific D21 Mall',
    'Pacific Outlet Mall': 'Pacific D21 Mall',
    'The NorthCap University': 'NorthCap University',
    'Northcap University': 'NorthCap University',
}

In [38]:
# Function to standardize location names
def standardize_location_name(location, mapping, threshold=85):
    if location in mapping:
        return mapping[location]
    # Fuzzy match to find the closest standardized name
    for standard_name, mapped_name in mapping.items():
        if fuzz.ratio(location.lower(), standard_name.lower()) > threshold:
            return mapped_name
    return location

In [39]:
# Collect all unique location names 
all_locations = set()
for index, row in df.iterrows():
    location_dict = ast.literal_eval(row['LocationAdvantages'])
    all_locations.update(location_dict.keys())

In [40]:
# Extract distance for each location with standardized names
location_matrix = {}
for index, row in df.iterrows():
    distances = {}
    location_dict = ast.literal_eval(row['LocationAdvantages'])
    for location, distance in location_dict.items():
        standardized_location = standardize_location_name(location, location_mapping)
        distances[standardized_location] = distance_to_meter(distance)
    location_matrix[index] = distances


# Convert dictionary to DataFrame
location_df = pd.DataFrame.from_dict(location_matrix, orient='index')

In [41]:
location_df.index = df.PropertyName

In [42]:
# Fill NaN values with the maximum value for each column
location_df.fillna(25440, inplace = True)

In [43]:
# Replace 25440.0  with random values between 1000m and 25000m
def fill_random_distances(val):
    if val == 25440.0:
        return np.round(np.random.uniform(10550, 25000), 1)
    return val

# Apply to entire DataFrame (excluding index)
location_df = location_df.applymap(fill_random_distances)

  location_df = location_df.applymap(fill_random_distances)


In [44]:
location_df

Unnamed: 0_level_0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurugram Railway Station,NorthCap University,Dwarka Expressway,Hyatt Place Gurgaon Udyog Vihar,"Dwarka Sector 21, Metro Station",Pacific D21 Mall,...,MCC Cricket Ground Dhankot,The Shri Ram School Aravali,Taj City Centre Gurugram,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,800.0,2500.0,3100.0,3100.0,4900.0,5400.0,1200.0,7700.0,7200.0,7400.0,...,20427.1,24488.6,18621.0,12666.4,11022.5,16540.4,16180.0,23952.2,19266.1,11990.1
M3M Crown,550.0,11466.2,14937.4,17890.0,21305.9,6700.0,3800.0,15396.7,15624.3,7500.0,...,18173.5,16092.7,22228.6,24974.4,19464.7,21413.3,11373.2,17748.4,17047.5,13214.4
Adani Brahma Samsara Vilasa,5300.0,20602.8,15061.7,24005.0,2500.0,8800.0,700.0,22365.5,12090.8,12437.5,...,15360.8,12780.9,12449.1,11501.1,13535.0,14440.7,12876.0,17965.1,14864.0,23246.8
Sobha City,1500.0,19546.3,23675.3,13116.7,6500.0,6700.0,5100.0,15219.0,18910.0,8200.0,...,22363.6,22075.7,20559.6,24159.0,11287.1,19151.0,15260.4,10905.6,23123.4,19731.1
Signature Global City 93,19518.1,22592.3,14020.1,5500.0,18842.3,15382.2,20437.9,18704.2,12395.1,19485.9,...,11572.7,15632.3,16347.2,13602.1,16979.2,20541.7,23499.4,14364.7,16724.4,23316.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,13615.6,22885.8,18597.0,22304.2,15808.7,16684.5,16271.3,11181.7,21284.7,20480.8,...,11227.2,20559.2,15721.4,22449.8,20709.5,13068.4,19832.2,20912.5,23829.8,13621.4
Pyramid Urban Homes 2,16809.3,13027.5,18382.7,11988.3,18679.9,20906.1,21792.6,16458.9,23489.4,13501.1,...,10766.4,14543.2,12967.5,21107.4,23707.6,17524.5,21098.3,12490.2,15960.1,12690.9
Satya The Hermitage,18521.8,24813.7,13207.5,20318.8,16629.9,14379.7,17090.2,15693.2,14598.7,15873.0,...,17047.3,19595.4,16282.8,13720.1,15384.7,13568.7,20813.2,16095.2,23391.8,20531.1
BPTP Spacio,24813.9,12174.0,11894.7,15020.5,10564.2,21493.3,17376.8,15968.9,15541.1,23039.1,...,12993.7,11213.7,14417.1,13693.5,23284.9,15645.7,22097.5,18715.8,21732.2,21578.9


In [45]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

location_df_normalized = pd.DataFrame(scaler.fit_transform(location_df), columns = location_df.columns, index = location_df.index)

In [46]:
location_df_normalized.head()

Unnamed: 0_level_0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurugram Railway Station,NorthCap University,Dwarka Expressway,Hyatt Place Gurgaon Udyog Vihar,"Dwarka Sector 21, Metro Station",Pacific D21 Mall,...,MCC Cricket Ground Dhankot,The Shri Ram School Aravali,Taj City Centre Gurugram,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,-3.702162,-3.490526,-3.346868,-2.425671,-2.156429,-2.281414,-1.707223,-2.160322,-2.42308,-2.390939,...,0.605222,1.517591,0.233413,-1.186159,-1.624721,-0.27576,-0.3484,1.55191,0.258513,-1.379573
M3M Crown,-3.756371,-1.430131,-0.654992,0.229898,0.882727,-2.022801,-1.358371,-0.456049,-0.475594,-2.367406,...,0.057257,-0.376492,1.1127,1.642631,0.388239,0.858898,-1.491441,0.012468,-0.240288,-1.092189
Adani Brahma Samsara Vilasa,-2.726406,0.669421,-0.626726,1.327856,-2.601023,-1.605041,-1.77431,1.087047,-1.29245,-1.205496,...,-0.626653,-1.123622,-1.270876,-1.453984,-1.02564,-0.764677,-1.13408,0.066241,-0.731196,1.262752
Sobha City,-3.550378,0.426642,1.332044,-0.627156,-1.860032,-2.022801,-1.183945,-0.495396,0.283977,-2.20268,...,1.076084,0.97325,0.705911,1.455225,-1.56163,0.33212,-0.567078,-1.685539,1.125736,0.437499
Signature Global City 93,0.356572,1.1266,-0.86359,-1.994747,0.426351,-0.295622,0.874,0.276328,-1.222104,0.453158,...,-1.547733,-0.480357,-0.320785,-0.971104,-0.204404,0.655946,1.39213,-0.827181,-0.312929,1.278996


In [47]:
cosine_sim3 = cosine_similarity(location_df_normalized)

In [48]:
cosine_sim3.shape

(246, 246)

In [54]:
def recommend_properties_with_scores(property_name, top_n = 247):

    cosine_sim_matrix = 0.5 * cosine_sim1 + 0.8 * cosine_sim2 + 1 * cosine_sim3
    
    # Get the similarity scores for the property using its name as the index
    sim_scores = list(enumerate(cosine_sim_matrix[location_df_normalized.index.get_loc(property_name)]))

    # Sort properties based on the similarity scores
    sorted_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)

    # Get the indices and scores of thee top_n most similar properties 
    top_indices = [i[0] for i in sorted_scores[1: top_n + 1]]
    top_scores = [i[1] for i in sorted_scores[1: top_n + 1]]

    # Retrieve the names of the top properties using the indices 
    top_properties = location_df_normalized.index[top_indices].tolist()

    # Create a dataframe with the result 
    recommendations_df = pd.DataFrame({
        'PropertyName' : top_properties,
        'SimilarityScore' : top_scores
    })

    return recommendations_df


recommend_properties_with_scores('DLF The Camellias')

Unnamed: 0,PropertyName,SimilarityScore
0,Salcon The Verandas,0.821292
1,DLF The Magnolias,0.613285
2,DLF The Aralias,0.498701
3,Parsvnath Exotica,0.435815
4,Pioneer Urban Presidia,0.343247
...,...,...
240,DLF Princeton Estate,-0.290171
241,M3M Skycity,-0.292590
242,M3M Sierra 68,-0.293934
243,Godrej Nature Plus Serenity,-0.315705


In [50]:
x = location_df[location_df['Bajghera Road'] < 2000]['Bajghera Road'].sort_values().to_dict()
for key, value in x.items():
    print(key,value)

M3M Crown 550.0
Smartworld One DXP 800.0
Sobha City 1500.0


In [53]:
import pickle 

# pickle.dump(location_df, open('location_distance.pkl','wb'))
pickle.dump(cosine_sim1, open('cosine_sim1.pkl','wb'))
pickle.dump(cosine_sim2, open('cosine_sim2.pkl','wb'))
pickle.dump(cosine_sim3, open('cosine_sim3.pkl','wb'))

In [52]:
location_df

Unnamed: 0_level_0,Bajghera Road,Palam Vihar Halt,DPSG Palam Vihar,Park Hospital,Gurugram Railway Station,NorthCap University,Dwarka Expressway,Hyatt Place Gurgaon Udyog Vihar,"Dwarka Sector 21, Metro Station",Pacific D21 Mall,...,MCC Cricket Ground Dhankot,The Shri Ram School Aravali,Taj City Centre Gurugram,Minda Industries Corporate Office,"Rampura Flyover, Naurangpur Rd",Manesar toll plaza - Kherki Daula,"Imt Manesar, Gurugram",Holiday Inn,Sector 84 Road,Skyview Corporate Park
PropertyName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Smartworld One DXP,800.0,2500.0,3100.0,3100.0,4900.0,5400.0,1200.0,7700.0,7200.0,7400.0,...,20427.1,24488.6,18621.0,12666.4,11022.5,16540.4,16180.0,23952.2,19266.1,11990.1
M3M Crown,550.0,11466.2,14937.4,17890.0,21305.9,6700.0,3800.0,15396.7,15624.3,7500.0,...,18173.5,16092.7,22228.6,24974.4,19464.7,21413.3,11373.2,17748.4,17047.5,13214.4
Adani Brahma Samsara Vilasa,5300.0,20602.8,15061.7,24005.0,2500.0,8800.0,700.0,22365.5,12090.8,12437.5,...,15360.8,12780.9,12449.1,11501.1,13535.0,14440.7,12876.0,17965.1,14864.0,23246.8
Sobha City,1500.0,19546.3,23675.3,13116.7,6500.0,6700.0,5100.0,15219.0,18910.0,8200.0,...,22363.6,22075.7,20559.6,24159.0,11287.1,19151.0,15260.4,10905.6,23123.4,19731.1
Signature Global City 93,19518.1,22592.3,14020.1,5500.0,18842.3,15382.2,20437.9,18704.2,12395.1,19485.9,...,11572.7,15632.3,16347.2,13602.1,16979.2,20541.7,23499.4,14364.7,16724.4,23316.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DLF Princeton Estate,13615.6,22885.8,18597.0,22304.2,15808.7,16684.5,16271.3,11181.7,21284.7,20480.8,...,11227.2,20559.2,15721.4,22449.8,20709.5,13068.4,19832.2,20912.5,23829.8,13621.4
Pyramid Urban Homes 2,16809.3,13027.5,18382.7,11988.3,18679.9,20906.1,21792.6,16458.9,23489.4,13501.1,...,10766.4,14543.2,12967.5,21107.4,23707.6,17524.5,21098.3,12490.2,15960.1,12690.9
Satya The Hermitage,18521.8,24813.7,13207.5,20318.8,16629.9,14379.7,17090.2,15693.2,14598.7,15873.0,...,17047.3,19595.4,16282.8,13720.1,15384.7,13568.7,20813.2,16095.2,23391.8,20531.1
BPTP Spacio,24813.9,12174.0,11894.7,15020.5,10564.2,21493.3,17376.8,15968.9,15541.1,23039.1,...,12993.7,11213.7,14417.1,13693.5,23284.9,15645.7,22097.5,18715.8,21732.2,21578.9
