In [42]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from IPython.display import display

In [43]:
# Load the raw listings and discard the columns that none of the later cells
# in this notebook read (identifiers, dates, coordinates, view/waterfront flags).
df_data = pd.read_csv('data/House Price India.csv').drop(
    columns=[
        'Built Year', 'Renovation Year', 'id', 'Date', 'Lattitude', 'Longitude',
        'Postal Code', 'waterfront present', 'number of views',
    ]
)
df_data

Unnamed: 0,number of bedrooms,number of bathrooms,living area,lot area,number of floors,condition of the house,grade of the house,Area of the house(excluding basement),Area of the basement,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price
0,5,2.50,3650,9050,2.0,5,10,3370,280,2880,5400,2,58,2380000
1,4,2.50,2920,4000,1.5,5,8,1910,1010,2470,4000,2,51,1400000
2,5,2.75,2910,9480,1.5,3,8,2910,0,2940,6600,1,53,1200000
3,4,2.50,3310,42998,2.0,3,9,3310,0,3350,42847,3,76,838000
4,3,2.00,2710,4500,1.5,4,8,1880,830,2060,4500,1,51,805000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14615,2,1.50,1556,20000,1.0,4,7,1556,0,2250,17286,3,76,221700
14616,3,2.00,1680,7000,1.5,4,7,1680,0,1540,7480,3,59,219200
14617,2,1.00,1070,6120,1.0,3,6,1070,0,1130,6120,2,64,209000
14618,4,1.00,1030,6621,1.0,4,6,1030,0,1420,6631,3,54,205000


In [44]:
# Tukey fences: a value is an outlier when it lies more than 1.5 * IQR outside
# the inter-quartile range of its own column.
Q1, Q3 = df_data.quantile(0.25), df_data.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Drop every row that has an outlier in at least one column (Price included),
# then renumber the surviving rows 0..n-1.
# NOTE: this rebinds df_data, so re-running the cell filters the data again.
df_data = (
    df_data
    .loc[~(df_data.lt(lower_bound) | df_data.gt(upper_bound)).any(axis=1)]
    .reset_index(drop=True)
)
df_data

Unnamed: 0,number of bedrooms,number of bathrooms,living area,lot area,number of floors,condition of the house,grade of the house,Area of the house(excluding basement),Area of the basement,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price
0,3,2.00,2710,4500,1.5,4,8,1880,830,2060,4500,1,51,805000
1,3,2.50,2600,4750,1.0,4,9,1700,900,2380,4750,1,67,790000
2,3,1.75,2240,10578,2.0,5,8,1550,690,1570,10578,3,71,750000
3,3,2.50,2390,6550,1.0,4,8,1440,950,2010,6550,1,73,750000
4,4,2.25,2200,11250,1.5,5,7,1300,900,2320,10814,2,53,698000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11219,3,1.75,1590,7931,1.0,3,7,1190,400,1680,7931,1,80,240000
11220,3,2.00,1680,7000,1.5,4,7,1680,0,1540,7480,3,59,219200
11221,2,1.00,1070,6120,1.0,3,6,1070,0,1130,6120,2,64,209000
11222,4,1.00,1030,6621,1.0,4,6,1030,0,1420,6631,3,54,205000


In [45]:
# How often each distinct basement area occurs, listed by ascending area.
(
    df_data['Area of the basement']
    .value_counts()
    .sort_index()
)

Area of the basement
0       6861
10         2
20         1
40         1
50        10
        ... 
1410       4
1420       6
1430       1
1440       4
1450       7
Name: count, Length: 174, dtype: int64

In [46]:
# Columns graded on three levels (low / medium / high); every other column in
# _RULE_COLUMNS is graded on two levels (low / medium).
_THREE_LEVEL_COLUMNS = {
    'lot area',
    'Area of the house(excluding basement)',
    'Area of the basement',
    'living_area_renov',
    'lot_area_renov',
    'Price',
}

# The order here fixes the column order of df_rule. `aggregate` pairs rule
# columns with membership columns positionally, so the order must not change.
_RULE_COLUMNS = [
    'number of bedrooms',
    'number of bathrooms',
    'living area',
    'lot area',
    'number of floors',
    'condition of the house',
    'grade of the house',
    'Area of the house(excluding basement)',
    'Area of the basement',
    'living_area_renov',
    'lot_area_renov',
    'Number of schools nearby',
    'Distance from the airport',
    'Price',
]


def _rule_label(x, column_max, three_levels):
    """Return the linguistic label of ``x`` relative to its column maximum.

    Two-level columns split at ``column_max // 2`` into 'low' / 'medium'.
    Three-level columns split at ``column_max // 3`` and
    ``column_max * 2 // 3`` into 'low' / 'medium' / 'high'.
    """
    if three_levels:
        if x < column_max // 3:
            return 'low'
        return 'medium' if x < column_max * 2 // 3 else 'high'
    return 'low' if x < column_max // 2 else 'medium'


df_rule = pd.DataFrame()
for _column in _RULE_COLUMNS:
    # Compute the column maximum once per column, not once per row.
    _column_max = df_data[_column].max()
    df_rule[_column] = df_data[_column].apply(
        _rule_label, args=(_column_max, _column in _THREE_LEVEL_COLUMNS)
    )

In [49]:
# Each distinct combination of labels is one fuzzy rule: keep a single copy of
# every combination and renumber the rules 0..n-1.
df_rule = df_rule.drop_duplicates().reset_index(drop=True)

In [50]:
# Inspect the de-duplicated rule base (one row per rule, labels still as text).
df_rule

Unnamed: 0,number of bedrooms,number of bathrooms,living area,lot area,number of floors,condition of the house,grade of the house,Area of the house(excluding basement),Area of the basement,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price
0,medium,medium,medium,low,medium,medium,medium,medium,medium,medium,low,medium,medium,high
1,medium,medium,medium,medium,medium,medium,medium,medium,medium,medium,medium,medium,medium,medium
2,medium,medium,low,low,medium,medium,medium,medium,low,medium,low,medium,medium,medium
3,medium,medium,low,low,medium,medium,medium,medium,low,medium,medium,medium,medium,medium
4,medium,medium,medium,medium,medium,medium,medium,high,low,medium,medium,medium,medium,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,medium,medium,medium,medium,medium,medium,medium,medium,low,high,low,medium,medium,high
436,medium,medium,low,high,medium,medium,medium,medium,low,medium,low,medium,medium,high
437,medium,low,low,medium,medium,medium,medium,medium,medium,medium,medium,medium,medium,medium
438,medium,medium,medium,low,medium,medium,medium,medium,low,high,high,medium,medium,medium


In [51]:
# Position of each linguistic label inside a membership tuple:
# (low, medium) for two-level columns, (low, medium, high) for three-level ones.
dict_rule_to_index = {
    label: position for position, label in enumerate(('low', 'medium', 'high'))
}

In [52]:
# Replace every label with its tuple position; values that are not labels
# (e.g. already-encoded integers on a re-run) pass through unchanged.
# DataFrame.applymap is deprecated in recent pandas, so map each column
# element-wise with Series.map, which works on old and new versions alike.
df_rule = df_rule.apply(
    lambda column: column.map(lambda label: dict_rule_to_index.get(label, label))
)

  df_rule = df_rule.applymap(lambda x: dict_rule_to_index[x] if x in dict_rule_to_index else x)


In [53]:
# Inspect the rule base after encoding (0 = low, 1 = medium, 2 = high).
df_rule

Unnamed: 0,number of bedrooms,number of bathrooms,living area,lot area,number of floors,condition of the house,grade of the house,Area of the house(excluding basement),Area of the basement,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price
0,1,1,1,0,1,1,1,1,1,1,0,1,1,2
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,1,1,0,0,1,1,1,1,0,1,0,1,1,1
3,1,1,0,0,1,1,1,1,0,1,1,1,1,1
4,1,1,1,1,1,1,1,2,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,1,1,1,1,1,1,1,1,0,2,0,1,1,2
436,1,1,0,2,1,1,1,1,0,1,0,1,1,2
437,1,0,0,1,1,1,1,1,1,1,1,1,1,1
438,1,1,1,0,1,1,1,1,0,2,2,1,1,1


In [54]:
def down_function(x, a, b):
    """Descending linear membership over the interval ``[a, b]``.

    For ``a < b`` the degree is 1 at ``x <= a``, falls linearly to 0 at
    ``x = b`` and stays 0 beyond it.

    Args:
        x: Value to grade.
        a: Left end of the ramp (degree 1).
        b: Right end of the ramp (degree 0).

    Returns:
        The membership degree rounded to two decimals and clamped to [0, 1].

    Note:
        ``a == b`` divides by zero; plain Python numbers raise
        ``ZeroDivisionError``.
    """
    result = round((b - x) / (b - a), 2)
    # A membership degree must stay in [0, 1]: without the upper clamp any
    # x < a would return a value above 1.
    return min(max(result, 0), 1)

def up_function(x, a, b):
    """Ascending linear membership over the interval ``[a, b]``.

    For ``a < b`` the degree is 0 at ``x <= a``, rises linearly to 1 at
    ``x = b`` and stays 1 beyond it.

    Args:
        x: Value to grade.
        a: Left end of the ramp (degree 0).
        b: Right end of the ramp (degree 1).

    Returns:
        The membership degree rounded to two decimals and clamped to [0, 1].

    Note:
        ``a == b`` divides by zero; plain Python numbers raise
        ``ZeroDivisionError``.
    """
    result = round((x - a) / (b - a), 2)
    # A membership degree must stay in [0, 1]: without the upper clamp any
    # x > b would return a value above 1.
    return min(max(result, 0), 1)

def triangle_function(x, a, b, c):
    """Triangular membership with feet at ``a`` and ``c`` and peak at ``b``.

    Returns 0 at or outside the feet, the rising ramp ``up_function(x, a, b)``
    on ``(a, b]`` and the falling ramp ``down_function(x, b, c)`` on ``(b, c)``.
    If no comparison holds (e.g. ``x`` is NaN) the function returns ``None``.
    """
    if not (x <= a or x >= c):
        # Strictly between the feet: pick the side of the peak x lies on.
        if a < x <= b:
            return up_function(x, a, b)
        if b < x < c:
            return down_function(x, b, c)
        return None
    return 0

In [63]:
def _two_level_membership(column, x):
    """Grade ``x`` as (low, medium) over the full range of ``df_data[column]``.

    ``low`` falls from the column minimum to the column maximum and ``medium``
    rises over the same interval. The bounds are read once per call.
    """
    values = df_data[column]
    lowest, highest = values.min(), values.max()
    return down_function(x, lowest, highest), up_function(x, lowest, highest)


def _three_level_membership(column, x):
    """Grade ``x`` as (low, medium, high) over the range of ``df_data[column]``.

    The breakpoint is ``max // 2``: ``low`` falls from the minimum to the
    breakpoint, ``medium`` is a triangle peaking at the breakpoint, and
    ``high`` rises from the breakpoint to the maximum.
    """
    values = df_data[column]
    lowest, highest = values.min(), values.max()
    middle = highest // 2
    return (
        down_function(x, lowest, middle),
        triangle_function(x, lowest, middle, highest),
        up_function(x, middle, highest),
    )


def bedroom_membership(x):
    """Return the (low, medium) degrees of a bedroom count."""
    return _two_level_membership('number of bedrooms', x)


def bathroom_membership(x):
    """Return the (low, medium) degrees of a bathroom count."""
    return _two_level_membership('number of bathrooms', x)


def living_area_membership(x):
    """Return the (low, medium) degrees of a living area."""
    return _two_level_membership('living area', x)


def lot_area_membership(x):
    """Return the (low, medium, high) degrees of a lot area."""
    return _three_level_membership('lot area', x)


def number_of_floors_membership(x):
    """Return the (low, medium) degrees of a floor count."""
    return _two_level_membership('number of floors', x)


def condition_of_the_house_membership(x):
    """Return the (low, medium) degrees of a house-condition score."""
    return _two_level_membership('condition of the house', x)


def grade_of_the_house_membership(x):
    """Return the (low, medium) degrees of a house-grade score."""
    return _two_level_membership('grade of the house', x)


def area_of_the_house_membership(x):
    """Return the (low, medium, high) degrees of an above-basement area."""
    return _three_level_membership('Area of the house(excluding basement)', x)


def area_of_the_basement_membership(x):
    """Return the (low, medium, high) degrees of a basement area."""
    return _three_level_membership('Area of the basement', x)


def living_area_renov_membership(x):
    """Return the (low, medium, high) degrees of a ``living_area_renov`` value."""
    return _three_level_membership('living_area_renov', x)


def lot_area_renov_membership(x):
    """Return the (low, medium, high) degrees of a ``lot_area_renov`` value."""
    return _three_level_membership('lot_area_renov', x)


def number_of_schools_nearby_membership(x):
    """Return the (low, medium) degrees of a nearby-school count."""
    return _two_level_membership('Number of schools nearby', x)


def distance_from_the_airport_membership(x):
    """Return the (low, medium) degrees of a distance from the airport."""
    return _two_level_membership('Distance from the airport', x)

In [64]:
def calculate_membership(df_data):
    """Fuzzify every feature column of ``df_data``.

    Each cell of the result holds the tuple of membership degrees returned by
    that column's membership function. The result has the same index as
    ``df_data``, and its columns follow the order of the mapping below.

    Note: the parameter shadows the notebook-level ``df_data`` only inside
    this function; the membership functions still read their bounds from the
    notebook-level frame.
    """
    fuzzifier_by_column = {
        'number of bedrooms': bedroom_membership,
        'number of bathrooms': bathroom_membership,
        'living area': living_area_membership,
        'lot area': lot_area_membership,
        'number of floors': number_of_floors_membership,
        'condition of the house': condition_of_the_house_membership,
        'grade of the house': grade_of_the_house_membership,
        'Area of the house(excluding basement)': area_of_the_house_membership,
        'Area of the basement': area_of_the_basement_membership,
        'living_area_renov': living_area_renov_membership,
        'lot_area_renov': lot_area_renov_membership,
        'Number of schools nearby': number_of_schools_nearby_membership,
        'Distance from the airport': distance_from_the_airport_membership,
    }
    return pd.DataFrame(
        {
            column: df_data[column].apply(fuzzify)
            for column, fuzzify in fuzzifier_by_column.items()
        }
    )

In [65]:
# Fuzzify the input features only; Price is the target and is handled separately.
df_membership = calculate_membership(df_data=df_data.drop(columns=['Price']))

In [66]:
# Renumber rows 0..n-1 so row labels coincide with row positions.
df_membership = df_membership.reset_index(drop=True)

In [67]:
# Show the fuzzified inputs and the rule antecedents one after the other;
# their columns must line up one-to-one for the aggregation step.
display(df_membership, df_rule.drop(columns=['Price']))

Unnamed: 0,number of bedrooms,number of bathrooms,living area,lot area,number of floors,condition of the house,grade of the house,Area of the house(excluding basement),Area of the basement,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport
0,"(0.67, 0.33)","(0.55, 0.45)","(0.41, 0.59)","(0.57, 0.43, 0)","(0.8, 0.2)","(0.33, 0.67)","(0.33, 0.67)","(0.01, 0.99, 0)","(0, 0.86, 0.14)","(0, 0.89, 0.11)","(0.53, 0.47, 0)","(1.0, 0.0)","(0.97, 0.03)"
1,"(0.67, 0.33)","(0.36, 0.64)","(0.44, 0.56)","(0.54, 0.46, 0)","(1.0, 0.0)","(0.33, 0.67)","(0.0, 1.0)","(0.14, 0.86, 0)","(0, 0.76, 0.24)","(0, 0.71, 0.29)","(0.5, 0.5, 0)","(1.0, 0.0)","(0.43, 0.57)"
2,"(0.67, 0.33)","(0.64, 0.36)","(0.54, 0.46)","(0, 0.91, 0.09)","(0.6, 0.4)","(0.0, 1.0)","(0.33, 0.67)","(0.25, 0.75, 0)","(0.05, 0.95, 0)","(0.23, 0.77, 0)","(0, 0.8, 0.2)","(0.0, 1.0)","(0.3, 0.7)"
3,"(0.67, 0.33)","(0.36, 0.64)","(0.5, 0.5)","(0.34, 0.66, 0)","(1.0, 0.0)","(0.33, 0.67)","(0.33, 0.67)","(0.32, 0.68, 0)","(0, 0.69, 0.31)","(0, 0.91, 0.09)","(0.28, 0.72, 0)","(1.0, 0.0)","(0.23, 0.77)"
4,"(0.33, 0.67)","(0.45, 0.55)","(0.55, 0.45)","(0, 0.84, 0.16)","(0.8, 0.2)","(0.0, 1.0)","(0.67, 0.33)","(0.42, 0.58, 0)","(0, 0.76, 0.24)","(0, 0.75, 0.25)","(0, 0.77, 0.23)","(0.5, 0.5)","(0.9, 0.1)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11219,"(0.67, 0.33)","(0.64, 0.36)","(0.72, 0.28)","(0.19, 0.81, 0)","(1.0, 0.0)","(0.67, 0.33)","(0.67, 0.33)","(0.5, 0.5, 0)","(0.45, 0.55, 0)","(0.14, 0.86, 0)","(0.11, 0.89, 0)","(1.0, 0.0)","(0.0, 1.0)"
11220,"(0.67, 0.33)","(0.55, 0.45)","(0.69, 0.31)","(0.3, 0.7, 0)","(0.8, 0.2)","(0.33, 0.67)","(0.67, 0.33)","(0.15, 0.85, 0)","(1.0, 0, 0)","(0.25, 0.75, 0)","(0.16, 0.84, 0)","(0.0, 1.0)","(0.7, 0.3)"
11221,"(1.0, 0.0)","(0.91, 0.09)","(0.86, 0.14)","(0.39, 0.61, 0)","(1.0, 0.0)","(0.67, 0.33)","(1.0, 0.0)","(0.58, 0.42, 0)","(1.0, 0, 0)","(0.59, 0.41, 0)","(0.33, 0.67, 0)","(0.5, 0.5)","(0.53, 0.47)"
11222,"(0.33, 0.67)","(0.91, 0.09)","(0.87, 0.13)","(0.34, 0.66, 0)","(1.0, 0.0)","(0.33, 0.67)","(1.0, 0.0)","(0.61, 0.39, 0)","(1.0, 0, 0)","(0.35, 0.65, 0)","(0.27, 0.73, 0)","(0.0, 1.0)","(0.87, 0.13)"


Unnamed: 0,number of bedrooms,number of bathrooms,living area,lot area,number of floors,condition of the house,grade of the house,Area of the house(excluding basement),Area of the basement,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport
0,1,1,1,0,1,1,1,1,1,1,0,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,1,1,0,0,1,1,1,1,0,1,0,1,1
3,1,1,0,0,1,1,1,1,0,1,1,1,1
4,1,1,1,1,1,1,1,2,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,1,1,1,1,1,1,1,1,0,2,0,1,1
436,1,1,0,2,1,1,1,1,0,1,0,1,1
437,1,0,0,1,1,1,1,1,1,1,1,1,1
438,1,1,1,0,1,1,1,1,0,2,2,1,1


In [68]:
def aggregate(df_membership, df_rule):
    """Compute the firing strength (alpha-predicate) of every rule for every row.

    Args:
        df_membership: Frame whose cells are tuples of membership degrees,
            one tuple per feature.
        df_rule: Frame whose cells are integer positions into those tuples.
            Its columns must be in the same order as ``df_membership``'s,
            because the two are paired positionally.

    Returns:
        ``numpy.ndarray`` of shape ``(len(df_membership), len(df_rule))`` where
        entry ``[i, j]`` is the minimum, over all features, of the degree that
        rule ``j`` selects from row ``i``.
    """
    # Materialise the rule rows once rather than once per membership row.
    rule_rows = list(df_rule.itertuples(index=False, name=None))
    alpha_predicate = np.zeros((df_membership.shape[0], df_rule.shape[0]))
    # enumerate() gives positional indices, so the result does not depend on
    # either frame carrying a 0..n-1 index.
    for i, row_member in enumerate(df_membership.itertuples(index=False, name=None)):
        for j, row_rule in enumerate(rule_rows):
            alpha_predicate[i, j] = min(
                cell_member[cell_rule]
                for cell_member, cell_rule in zip(row_member, row_rule)
            )
    return alpha_predicate

In [69]:
# Firing strength of every rule for every house. Despite the name this is a
# NumPy array: rows follow df_membership, columns follow df_rule.
df_aggregated = aggregate(
    df_membership=df_membership,
    df_rule=df_rule.drop(columns=['Price']),
)
df_aggregated.shape

(11224, 440)

In [70]:
def inv_up_function(alpha, a, b):
    """Invert the ascending ramp on ``[a, b]``: the value whose degree is ``alpha``.

    ``alpha = 0`` gives ``a`` and ``alpha = 1`` gives ``b``.
    """
    width = b - a
    return alpha * width + a

def inv_down_function(alpha, a, b):
    """Invert the descending ramp on ``[a, b]``: the value whose degree is ``alpha``.

    ``alpha = 0`` gives ``b`` and ``alpha = 1`` gives ``a``.
    """
    width = b - a
    return b - alpha * width

def calculate_price(df_aggregated, prices):
    """Turn rule firing strengths into crisp price candidates per house.

    For each house ``i`` and rule ``j`` the firing strength
    ``df_aggregated[i, j]`` is inverted through the membership function of the
    rule's price level:

    * level 0 (low): descending ramp from the price minimum to ``max // 2``;
    * level 1 (medium): a triangle peaking at ``max // 2``. Its inverse has one
      solution on each side of the peak, so the rule contributes two
      candidates carrying the same strength;
    * level 2 (high): ascending ramp from ``max // 2`` to the price maximum.

    Args:
        df_aggregated: 2-D array of firing strengths, houses x rules.
        prices: Iterable of price levels (0, 1 or 2), one per rule, in rule
            order.

    Returns:
        A list with one dict per house, holding the parallel lists ``'a_pred'``
        (firing strengths) and ``'z'`` (candidate prices).
    """
    # Read the price bounds once; they are the same for every house and rule.
    price_column = df_data['Price']
    price_min = price_column.min()
    price_max = price_column.max()
    price_mid = price_max // 2
    rule_levels = list(prices)

    list_price = []
    for i in range(df_aggregated.shape[0]):
        a_pred = []
        z = []
        for j, price in enumerate(rule_levels):
            alpha = df_aggregated[i, j]
            if price == 0:
                a_pred.append(alpha)
                z.append(inv_down_function(alpha, price_min, price_mid))
            elif price == 1:
                # Rising side of the triangle.
                a_pred.append(alpha)
                z.append(inv_up_function(alpha, price_min, price_mid))
                # Falling side of the triangle.
                a_pred.append(alpha)
                z.append(inv_down_function(alpha, price_mid, price_max))
            elif price == 2:
                a_pred.append(alpha)
                z.append(inv_up_function(alpha, price_mid, price_max))
        list_price.append({'a_pred': a_pred, 'z': z})
    return list_price

In [73]:
# Per-house lists of (firing strength, candidate price) produced by the rules.
levels = calculate_price(df_aggregated=df_aggregated, prices=df_rule['Price'])

In [74]:
def defuzzification(levels):
    """Collapse each house's candidate prices into one crisp value.

    Every entry of ``levels`` is a dict with parallel lists ``'a_pred'``
    (weights) and ``'z'`` (candidates). The result for an entry is the weighted
    average ``sum(a * z) / sum(a)``, or 0 when the weights sum to zero.

    Returns:
        A list with one value per entry of ``levels``, in the same order.
    """
    centroids = []
    for level in levels:
        pairs = list(zip(level['a_pred'], level['z']))
        numerator = sum(weight * value for weight, value in pairs)
        denominator = sum(weight for weight, _ in pairs)
        centroids.append(numerator / denominator if denominator else 0)
    return centroids

In [75]:
# One crisp predicted price per house, in the row order of df_data.
predicted_level = defuzzification(levels=levels)

In [77]:
# Absolute difference between the actual and the predicted price of each house,
# in the same units as the Price column.
error = [abs(y - y_hat) for y, y_hat in zip(df_data['Price'], predicted_level)]

# The mean of absolute differences is the mean *absolute* error; it is not
# divided by the true price, so it must not be reported as a relative error.
# NOTE: this is measured on the same rows the rule base was built from.
print(f'Mean absolute error: {np.mean(error)}')

Mean relative error: 356880.1041217057
