# <b> <div style='padding:25px;background-color:#9B2335;color:white;border-radius:4px;font-size:100%;text-align: center'>Singapore  Resale Flat Prices Prediction<br> </div>

## <p style="color:Khaki;"> Importing required libraries</p>

In [1]:
import pandas as pd
pd.set_option("display.precision", 4)
pd.set_option('display.float_format', '{:.4f}'.format)


import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
from scipy.stats import pearsonr,kstest, norm, skew,kurtosis, boxcox
from statsmodels.formula.api import ols
import statsmodels.api as sm
import scipy.stats as stats
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
import pickle

# Notebook-wide seaborn appearance: white style, dark palette, sans-serif font.
sns.set_theme(
    context='notebook',
    style='white',
    palette='dark',
    font='sans-serif',
    font_scale=1,
    color_codes=True,
    rc=None,
)
# Paint the axes background and the axes border white.
sns.set_style({'axes.facecolor': 'white', 'axes.edgecolor': 'white'})

# ANSI escape sequences used to colour the skewness / kurtosis print-outs.
color_good, color_okay, color_bad, color_neutral = (
    '\033[32m',  # green        - near-zero skew
    '\033[92m',  # bright green - slightly positive or slightly negative skew
    '\033[91m',  # bright red   - significant skew
    '\033[0m',   # reset all terminal attributes
)


---
title: "EDA After Handling Outlier & Skewness"
author: "Santhoh Kumar"
format:
  html:
    toc: true
    html-math-method: katex
    css: styles.css
---


## <span style="color:Khaki;">Reading Transformed Dataset

In [2]:
# Load the transformed dataset from the Feather file in the working directory.
df1 = pd.read_feather('data_trans.feather')

In [3]:
# Preview ten random rows; the fixed seed keeps the preview identical across
# Restart & Run All (42 matches the seed used for the train/test split).
df1.sample(10, random_state=42)

Unnamed: 0,month,town,flat_type,floor_area_sqm,flat_model,lease_commence_date,resale_price,floors,floor_no,year,full_address,price_per_sqm,remaining_lease_year,resale_price_boxcox,floor_area_boxcox
223621,12,Jurong West,4 Room,103.0,Model A,1988,193000.0,3,9,2006,917 Jurong West St 91 Jurong West,1873.7864,81,471.9889,11.2966
42695,3,Hougang,Executive,148.0,Maisonette,1986,280000.0,3,9,1993,712 Hougang Ave 2 Hougang,1891.8919,92,556.0573,13.144
73978,1,Choa Chu Kang,5 Room,135.0,Model A,1996,351800.0,3,12,2002,612 Choa Chu Kang St 62 Choa Chu Kang,2605.9259,93,614.8474,12.6534
23783,11,Jurong West,4 Room,104.0,Model A,1984,100000.0,3,9,1991,516 Jurong West St 52 Jurong West,961.5385,92,353.1796,11.3429
94122,6,Jurong West,3 Room,67.0,New Generation,1983,107000.0,3,3,1995,535 Jurong West St 52 Jurong West,1597.0149,87,363.885,9.385
61077,2,Bedok,5 Room,121.0,Improved,1980,282000.0,3,9,1994,715 Bedok Reservoir Rd Bedok,2330.5785,85,557.8029,12.0891
19861,8,Bukit Batok,4 Room,90.0,Simplified,1986,180000.0,3,3,2000,309 Bt Batok St 31 Bukit Batok,2000.0,85,457.7041,10.6659
14245,10,Punggol,5 Room,110.0,Improved,2003,470000.0,3,12,2015,199C Punggol Field Punggol,4272.7273,87,698.444,11.6148
12092,9,Clementi,3 Room,67.0,New Generation,1980,339000.0,3,6,2015,308 Clementi Ave 4 Clementi,5059.7015,64,604.8975,9.385
35977,1,Hougang,3 Room,73.0,New Generation,1978,148000.0,3,3,2001,21 Hougang Ave 3 Hougang,2027.3973,76,419.8637,9.7438


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 927465 entries, 0 to 181261
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   month                 927465 non-null  category
 1   town                  927465 non-null  category
 2   flat_type             927465 non-null  category
 3   floor_area_sqm        927465 non-null  float32 
 4   flat_model            927465 non-null  category
 5   lease_commence_date   927465 non-null  category
 6   resale_price          927465 non-null  float64 
 7   floors                927465 non-null  category
 8   floor_no              927465 non-null  int8    
 9   year                  927465 non-null  category
 10  full_address          927465 non-null  category
 11  price_per_sqm         927465 non-null  float64 
 12  remaining_lease_year  927465 non-null  category
 13  resale_price_boxcox   927465 non-null  float64 
 14  floor_area_boxcox     927465 non-null  fl

## <span style="color:Khaki;">Defining variable type

In [5]:
# Numeric columns analysed for distribution shape (raw and Box-Cox versions).
continuous = [
    'floor_area_sqm',
    'resale_price',
    'resale_price_boxcox',
    'floor_area_boxcox',
]

# Columns treated as categorical in the plots and relationship checks below.
categorical = [
    'year',
    'month',
    'town',
    'flat_type',
    'flat_model',
    'lease_commence_date',
    'floors',
    'floor_no',
    'remaining_lease_year',
]

## <p Style="color:Khaki"> Exploratory Data Analysis. After Handling Skewness and Outliers

In [None]:
# Report skewness and kurtosis for every continuous column.
# NOTE(review): univar_num, skewness_color and kurtosis_color are not defined in
# the visible part of this notebook - confirm they are defined or imported
# before this cell, otherwise it fails on a fresh kernel.
# NOTE(review): skewed_col receives every continuous column, not only the skewed
# ones - confirm that is what later cells expect.
skewed_col = []

for i in continuous:
    univar_num(df1, i)
    skew_val = df1[i].skew()
    kurt = df1[i].kurtosis()
    kurt_color = kurtosis_color(kurt)
    color = skewness_color(skew_val)
    skewed_col.append(i)

    print(f"\n\n{color}Skew for {i} is {skew_val:.4f}{color_neutral}")
    # Use the kurtosis-specific colour here; it was computed above but the
    # line was previously printed in the skewness colour.
    print(f"{kurt_color}Kurtosis for {i} is {kurt:.4f}{color_neutral}\n\n")

<span style="color: Chartreuse;">   &#9784; &nbsp;</span> <span style="color: Tomato;font-size: 130%"> <u>**Floor Area SQM - Observations**</u></span><br><br><span Style="color: DodgerBlue;font-size: 120%">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; &#9830; </span> <span Style="color: white;font-size: 110%">After the Box-Cox transformation we can see a significant reduction in the Skewness (almost 96%). </span><br>

<span Style="color: DodgerBlue;font-size: 120%">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; &#9830; </span> <span Style="color: white;font-size: 110%">The Box-Cox transformation slightly increased the negative kurtosis, indicating that the tails became even lighter. This might not be a significant concern. We will experiment the model's result and address this if we face any issues.  </span><br>

<span style="color: Chartreuse;">   &#9784; &nbsp;</span> <span style="color: Tomato;font-size: 130%"> <u>**Resale Price - Observations**</u></span><br><br><span Style="color: DodgerBlue;font-size: 120%">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; &#9830;  </span> <span Style="color: white;font-size: 110%">The Box-Cox transformation appears to have been highly effective for Resale Price.</span><br>

### <span style="color:Khaki;">Outliers based on the Categorical Features:

In [None]:
# One box plot of the transformed resale price per categorical feature.
for feature in categorical:
    plot_title = f'{feature.title()} vs Transformed Resale Price'

    plt.figure(figsize=(16, 5))
    sns.boxplot(
        data=df1,
        x=feature,
        y="resale_price_boxcox",
        hue=feature,
        palette='dark',
        legend=False,
    )
    plt.xticks(rotation=25)
    plt.title(plot_title)
    plt.tight_layout()
    plt.show()

In [None]:
# One box plot of the transformed floor area per categorical feature.
for i in categorical:
    plt.figure(figsize=(16, 6))
    sns.boxplot(data=df1, x=i, y="floor_area_boxcox", hue=i, palette='dark', legend=False)
    plt.xticks(rotation=25)
    # The y-axis is floor_area_boxcox, so the title names the transformed
    # variable, matching the "Transformed Resale Price" plots.
    plt.title(f'{i.title()} vs Transformed Floor Area SQM')
    plt.tight_layout()
    plt.show()

<span style="color: Chartreuse;font-size:150%">   &#9784; </span> <span style="color: Tomato;font-size: 120%;"> **After the data transformation, the distribution has shifted so that outliers now appear on both the lower and the upper side. These data points are not errors; they reflect the higher floor area (sqm), floor level, flat type and flat model that influence the resale price.**</span>

In [None]:
# Run the categorical-relationship analysis against the transformed resale price.
target_column = "resale_price_boxcox"
for feature in categorical:
    analyze_categorical_relationship(df1, target_column, feature)

In [None]:
# Run the categorical-relationship analysis against the transformed floor area.
target_column = "floor_area_boxcox"
for feature in categorical:
    analyze_categorical_relationship(df1, target_column, feature)

In [None]:
# Side-by-side scatter plots: raw columns on the left, Box-Cox columns on the right.
panels = [
    ('floor_area_sqm', "resale_price"),
    ('floor_area_boxcox', "resale_price_boxcox"),
]

plt.figure(figsize=(25, 6))
for slot, (x_col, y_col) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    sns.scatterplot(data=df1, x=x_col, y=y_col, color='maroon', s=100)

plt.tight_layout()
plt.show()

<span style="color: Chartreuse;font-size:150%">   &#9784; </span> <span style="color: Tomato;font-size: 120%;"> **We can observe that the data points at the top and bottom extremes have reduced, and the relationship now looks almost linear.**</span>

### <span style="color:Khaki;">Relationship Between Categorical Features and Target</span>

In [None]:
# Bar chart of the median transformed resale price per category, sorted ascending.
for i in categorical:
    plt.figure(figsize=(25, 5))
    data = (
        df1.groupby(i, observed=False)['resale_price_boxcox']
        .median()
        .reset_index()
        .sort_values('resale_price_boxcox')
    )
    # Column names are passed with data=...; order=data[i] keeps the bars in the
    # sorted (ascending median) order.
    sns.barplot(data=data, x=i, y='resale_price_boxcox', hue=i, order=data[i], legend=False, palette='dark')
    plt.xticks(rotation=25)
    # The aggregation above is a median, so the title says so.
    plt.title(f'Median resale_price_boxcox by {i}')
    plt.show()

<span Style="color: DodgerBlue;font-size: 130%;"> &#9830; </span> <span style="color: Tomato;font-size: 120%"> <b> Observations: </b> </span><br><span Style="color: DodgerBlue;font-size: 130%"> &nbsp;&nbsp;&nbsp;&nbsp; &#9830; </span><span style="color: white;font-size: 110%">The fluctuations appear more curved when compared to the original plots.</span>

### <p Style="color: Khaki">Correlation - Pearson's and Spearman's </p>

In [None]:
# Pearson and Spearman correlation matrices of the numeric columns, side by side.
cor_col = df1.select_dtypes(include='number')
correlation_matrix_pear = cor_col.corr()
correlation_matrix = cor_col.corr(method='spearman')

heatmaps = [
    (correlation_matrix_pear, 'Pearson Correlation Matrix'),
    (correlation_matrix, 'Spearman Correlation Matrix'),
]

plt.figure(figsize=(16, 4))
for slot, (matrix, heading) in enumerate(heatmaps, start=1):
    plt.subplot(1, 2, slot)
    sns.heatmap(matrix, annot=True, fmt='.2f', cmap='coolwarm')
    plt.title(heading)
plt.show()

<span Style="color: DodgerBlue;font-size: 130%;"> &#9830; </span> <span style="color: Tomato;font-size: 120%"> <b> Observations: </b> <br><span Style="color: DodgerBlue;font-size: 130%"> &nbsp;&nbsp;&nbsp;&nbsp; &#9830; </span><span style="color: white;font-size: 110%">We do not see visible differences.</span>

## <span style="color:Khaki;"> Encoding Categorical Features:</span> 

In [None]:
# Rank each town by its median transformed resale price (1 = lowest median).
town_en = (
    df1.groupby('town', observed=False)['resale_price_boxcox']
    .median()
    .rank(method='first')
    .reset_index()
    .sort_values('resale_price_boxcox')
)
# Lookup table: town -> integer rank.
town_ranks = town_en['resale_price_boxcox'].astype(int)
town_en_list = dict(zip(town_en['town'], town_ranks))
print(town_en_list)

In [None]:
# Add the rank-encoded town as a compact int8 column.
df1['town_en'] = df1['town'].map(town_en_list).astype('int8')

In [None]:
# Rank each year by its median transformed resale price (1 = lowest median).
year_en = (
    df1.groupby('year', observed=False)['resale_price_boxcox']
    .median()
    .rank(method='first')
    .reset_index()
    .sort_values('resale_price_boxcox')
)
# Lookup table: year -> integer rank.
year_ranks = year_en['resale_price_boxcox'].astype(int)
year_en_list = dict(zip(year_en['year'], year_ranks))
print(year_en_list)

In [None]:
# Add the rank-encoded year as a compact int8 column.
df1['year_en'] = df1['year'].map(year_en_list).astype('int8')

In [None]:
# Rank each flat type by its median transformed resale price (1 = lowest median).
flat_type_en = (
    df1.groupby('flat_type', observed=False)['resale_price_boxcox']
    .median()
    .rank(method='first')
    .reset_index()
    .sort_values('resale_price_boxcox')
)
# Lookup table: flat type -> integer rank.
flat_type_ranks = flat_type_en['resale_price_boxcox'].astype(int)
flat_type_en_list = dict(zip(flat_type_en['flat_type'], flat_type_ranks))
print(flat_type_en_list)

In [None]:
# Add the rank-encoded flat type as a compact int8 column.
df1['flat_type_en'] = df1['flat_type'].map(flat_type_en_list).astype('int8')

In [None]:
# Rank each flat model by its median transformed resale price (1 = lowest median).
flat_model_en = (
    df1.groupby('flat_model', observed=False)['resale_price_boxcox']
    .median()
    .rank(method='first')
    .reset_index()
    .sort_values('resale_price_boxcox')
)
# Lookup table: flat model -> integer rank.
flat_model_ranks = flat_model_en['resale_price_boxcox'].astype(int)
flat_model_en_list = dict(zip(flat_model_en['flat_model'], flat_model_ranks))
print(flat_model_en_list)

In [None]:
# Add the rank-encoded flat model as a compact int8 column.
df1['flat_model_en'] = df1['flat_model'].map(flat_model_en_list).astype('int8')

In [None]:
# Rank each lease commencement year by its median transformed resale price
# (1 = lowest median).
lease_commence_en = (
    df1.groupby('lease_commence_date', observed=False)['resale_price_boxcox']
    .median()
    .rank(method='first')
    .reset_index()
    .sort_values('resale_price_boxcox')
)
# The grouped frame is keyed by 'lease_commence_date'; indexing it with
# 'flat_model' (as before) raises KeyError because that column does not exist here.
lease_commence_en_list = dict(
    zip(
        lease_commence_en['lease_commence_date'],
        lease_commence_en['resale_price_boxcox'].astype(int),
    )
)
print(lease_commence_en_list)
# NOTE(review): unlike the other encodings, no visible cell maps this dict onto
# df1 (e.g. as a 'lease_commence_en' column) - confirm whether that is intended.

In [None]:
# Re-inspect columns and dtypes now that the *_en columns have been added to df1.
df1.info()

In [None]:
# Print the value range of every encoded column (name contains '_en');
# other columns are only listed by name.
for i in df1.columns:
    if '_en' in i:
        print(i)
        # Label both ends of the range; the minimum was previously printed bare.
        print(f"Max - {df1[i].max()}\nMin - {df1[i].min()}")
    else:
        print('else - ', i)

In [None]:
# Preview ten random rows including the new encoded columns; the fixed seed
# keeps the preview identical across Restart & Run All.
df1.sample(10, random_state=42)

## <span style="color:Khaki;"> Splitting train and test Data:</span> 

In [None]:
# Display the column index of df1 ahead of the train/test split below.
df1.columns

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# The target is the transformed resale price from df1. The previous target,
# df6[['transformed_status']], referenced a frame and a column that are not
# defined anywhere in the visible notebook.
# NOTE(review): train_test_split (sklearn.model_selection) is not imported in
# the visible import cell - confirm it is imported before this cell runs.
# NOTE(review): 'resale_price' is still among the features while the target is
# presumably its Box-Cox transform - confirm it is removed before modelling to
# avoid target leakage.
x_train, x_test, y_train, y_test = train_test_split(
    df1.drop(['month', 'price_per_sqm', 'remaining_lease_year', 'resale_price_boxcox'], axis=1),
    df1['resale_price_boxcox'].values,
    test_size=0.2,
    random_state=42,
)