# Import Libraries

In [5]:
import os
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Import Dataset

In [6]:
# NOTE(review): disabled loader kept for reference only. The URL names
# academic_success_dataset.csv, which is not the flat-price file loaded by the
# active cell below - presumably left over from another project; confirm and remove.
# url = "https://github.com/mazidzomader/CSE422-Project-Academic_Success_Analysis/blob/main/Dataset/academic_success_dataset.csv?raw=true"

# df = pd.read_csv(url)
# display(df.head())

In [7]:
# Source - https://stackoverflow.com/a
# Posted by John Smith
# Retrieved 2025-12-08, License - CC BY-SA 4.0

# Location of the dataset inside the GitHub repository.
owner = 'mazidzomader'
repo = 'CSE422-Project-Flat_Price_Prediction'
path = 'Dataset/flat_price_dataset.csv'

# Read the access token from the environment instead of committing it to the
# notebook. NOTE(review): the token that was previously hardcoded in this cell
# has been exposed and should be revoked on GitHub.
token = os.environ.get('GITHUB_TOKEN')

# The "raw" media type makes the contents API return the file body itself
# rather than a JSON wrapper.
headers = {'accept': 'application/vnd.github.v3.raw'}
if token:
    headers['authorization'] = 'token {}'.format(token)

# send a request
r = requests.get(
    'https://api.github.com/repos/{owner}/{repo}/contents/{path}'.format(
        owner=owner, repo=repo, path=path),
    headers=headers,
    timeout=30,  # do not let a stalled connection hang the notebook
)

# Stop on 401/403/404 so an error payload is never parsed as if it were CSV.
r.raise_for_status()

# convert string to StringIO object
string_io_obj = StringIO(r.text)

# Load data to df
df = pd.read_csv(string_io_obj, sep=",", index_col=False)

# Preview the first ten rows
display(df.head(10))


Unnamed: 0,Location,Size_sqft,Num_Bedrooms,Num_Bathrooms,Has_Balcony,Floor_Number,Building_Age_Years,Parking_Available,Nearby_Schools,Distance_to_CityCenter_km,Security_Level,Price_Category
0,Countryside,1730.0,4.0,3.0,No,3.0,24.0,No,Few,19.2,Low,High
1,City Center,2092.0,1.0,2.0,No,16.0,36.0,No,Many,2.6,Low,High
2,Countryside,,3.0,1.0,Yes,8.0,47.0,No,Many,,Medium,Medium
3,,1945.0,3.0,2.0,Yes,18.0,42.0,Yes,Few,28.8,Low,High
4,City Center,1505.0,4.0,3.0,No,4.0,37.0,No,Many,2.8,Low,High
5,City Center,2154.0,1.0,3.0,Yes,5.0,40.0,Yes,Many,16.2,High,High
6,Countryside,866.0,2.0,,No,2.0,44.0,Yes,Few,9.4,High,High
7,Suburbs,,3.0,2.0,No,,23.0,No,Few,12.1,High,Medium
8,Countryside,2234.0,4.0,2.0,Yes,19.0,,Yes,,5.8,,High
9,Countryside,1632.0,1.0,1.0,,,,No,Many,17.6,,High


# Data Preprocessing

## Data inspection

In [8]:
def show_section(title, lead_blank=True):
    """Print an indented section title followed by a rule of '=' characters."""
    prefix = "\n" if lead_blank else ""
    print(f"{prefix}   {title}\n==============================")


# Overall size of the frame as loaded, before any cleaning.
show_section("Unedited Dataset summary", lead_blank=False)
rows, cols = df.shape
print(f"No of Rows: {rows} ||  No of Columns: {cols}")

# Column dtypes.
show_section("Datatypes of Columns")
print(df.dtypes)

# Null count per column.
show_section("Missing Values in Columns")
print(df.isnull().sum())

# To list the distinct values of each column, loop over df.columns and
# print df[col].unique() (left out here to keep the output short).

# Count of rows flagged as duplicates by DataFrame.duplicated().
show_section("Duplicate Rows")
print(f"Number of duplicates: {df.duplicated().sum()}")

# describe() summary of the numeric columns.
show_section("Numerical Column Statistics")
print(df.describe())

   Unedited Dataset summary
No of Rows: 1200 ||  No of Columns: 12

   Datatypes of Columns
Location                      object
Size_sqft                    float64
Num_Bedrooms                 float64
Num_Bathrooms                float64
Has_Balcony                   object
Floor_Number                 float64
Building_Age_Years           float64
Parking_Available             object
Nearby_Schools                object
Distance_to_CityCenter_km    float64
Security_Level                object
Price_Category                object
dtype: object

   Missing Values in Columns
Location                     140
Size_sqft                    106
Num_Bedrooms                 136
Num_Bathrooms                143
Has_Balcony                  121
Floor_Number                 132
Building_Age_Years           127
Parking_Available            101
Nearby_Schools               124
Distance_to_CityCenter_km    103
Security_Level               126
Price_Category                 0
dtype: int64

   Duplica

## Removing Irrelevant Column
No column is irrelevant, so none is removed.

## Handle Missing Values

In [9]:
# Columns grouped by the statistic used to fill their missing entries.
mean_features = ['Size_sqft']
median_features = [
    'Num_Bedrooms',
    'Num_Bathrooms',
    'Floor_Number',
    'Building_Age_Years',
    'Distance_to_CityCenter_km',
]
mode_features = [
    'Has_Balcony',
    'Parking_Available',
    'Nearby_Schools',
    'Security_Level',
    'Location',
]

# One imputer per strategy; each treats NaN as the missing-value marker.
impute_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
impute_median = SimpleImputer(strategy='median', missing_values=np.nan)
impute_mode = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

# Fit each imputer on its own column group and write the filled values back.
imputation_plan = (
    (impute_mean, mean_features),
    (impute_median, median_features),
    (impute_mode, mode_features),
)
for imputer, columns in imputation_plan:
    df[columns] = imputer.fit_transform(df[columns])

In [10]:
# Re-count nulls per column to confirm that imputation left no gaps.
remaining_nulls = df.isnull().sum()
print(
    "Missing values after Imputation",
    "==============================",
    remaining_nulls,
    sep="\n",
)

Missing values after Imputation
Location                     0
Size_sqft                    0
Num_Bedrooms                 0
Num_Bathrooms                0
Has_Balcony                  0
Floor_Number                 0
Building_Age_Years           0
Parking_Available            0
Nearby_Schools               0
Distance_to_CityCenter_km    0
Security_Level               0
Price_Category               0
dtype: int64


## Handle Duplicates

In [11]:
# Drop fully identical rows, keeping the first occurrence; df is modified in place.
df.drop_duplicates(keep='first', inplace=True)

## Encoding

In [12]:
# Distinct Location values, shown as the cell output.
df.loc[:, 'Location'].unique()

array(['Countryside', 'City Center', 'Suburbs'], dtype=object)

In [13]:
# Encode the categorical feature columns as numbers. Price_Category is only
# inspected here and is left as text.
# LabelEncoder, OrdinalEncoder and pandas come from the import cell at the top
# of the notebook.
print("="*60)
print("           ENCODING CATEGORICAL FEATURES")
print("="*60)

# 1. LABEL ENCODING for binary features
print("\n1. Label Encoding (Binary Features)...")
binary_cols = ['Has_Balcony', 'Parking_Available', 'Nearby_Schools']

le = LabelEncoder()
for col in binary_cols:
    # fit_transform refits the encoder, so each column gets its own mapping.
    df[col] = le.fit_transform(df[col])
    print(f"  ✓ {col}: {dict(zip(le.classes_, range(len(le.classes_))))}")

# 2. ORDINAL ENCODING for ordered categories
print("\n2. Ordinal Encoding (Ordered Features)...")

# Security_Level: the explicit category list fixes the order, so the code
# assigned to each level is its position in this list.
security_order = [['Low', 'Medium', 'High']]
security_encoder = OrdinalEncoder(categories=security_order)
# fit_transform returns an (n_rows, 1) array; flatten it so a one-dimensional
# array is stored in the single column.
df['Security_Level'] = security_encoder.fit_transform(df[['Security_Level']]).ravel()
# Report the mapping from the fitted encoder so the message cannot drift from
# the categories actually used.
security_mapping = ", ".join(
    f"{level}={code}"
    for code, level in enumerate(security_encoder.categories_[0])
)
print(f"  ✓ Security_Level: {security_mapping}")

# Price_Category: only inspected here. If it is the target variable, encode it
# separately (or last); if it is an ordinal feature, encode it like Security_Level.
print(f"\n  Price_Category unique values: {df['Price_Category'].unique()}")

# 3. ONE-HOT ENCODING for Location
print("\n3. One-Hot Encoding (Nominal Features)...")
print(f"  Original columns: {len(df.columns)}")

# Location has no natural order, so it becomes one indicator column per value.
df = pd.get_dummies(df, columns=['Location'], prefix='Loc', drop_first=False)

print(f"  After encoding: {len(df.columns)} columns")
print(f"  New location columns: {[col for col in df.columns if col.startswith('Loc_')]}")

print("\n✅ Encoding complete!")
print("\nFinal dataset shape:", df.shape)
print("\nColumn data types:")
print(df.dtypes)

           ENCODING CATEGORICAL FEATURES

1. Label Encoding (Binary Features)...
  ✓ Has_Balcony: {'No': 0, 'Yes': 1}
  ✓ Parking_Available: {'No': 0, 'Yes': 1}
  ✓ Nearby_Schools: {'Few': 0, 'Many': 1}

2. Ordinal Encoding (Ordered Features)...
  ✓ Security_Level: Low=0, Medium=1, High=2

  Price_Category unique values: ['High' 'Medium' 'Low']

3. One-Hot Encoding (Nominal Features)...
  Original columns: 12
  After encoding: 14 columns
  New location columns: ['Loc_City Center', 'Loc_Countryside', 'Loc_Suburbs']

✅ Encoding complete!

Final dataset shape: (1200, 14)

Column data types:
Size_sqft                    float64
Num_Bedrooms                 float64
Num_Bathrooms                float64
Has_Balcony                    int64
Floor_Number                 float64
Building_Age_Years           float64
Parking_Available              int64
Nearby_Schools                 int64
Distance_to_CityCenter_km    float64
Security_Level               float64
Price_Category                objec

In [14]:
# Show the first 20 rows of the encoded frame as the cell output.
df.iloc[:20]

Unnamed: 0,Size_sqft,Num_Bedrooms,Num_Bathrooms,Has_Balcony,Floor_Number,Building_Age_Years,Parking_Available,Nearby_Schools,Distance_to_CityCenter_km,Security_Level,Price_Category,Loc_City Center,Loc_Countryside,Loc_Suburbs
0,1730.0,4.0,3.0,0,3.0,24.0,0,0,19.2,0.0,High,False,True,False
1,2092.0,1.0,2.0,0,16.0,36.0,0,1,2.6,0.0,High,True,False,False
2,1379.874771,3.0,1.0,1,8.0,47.0,0,1,14.7,1.0,Medium,False,True,False
3,1945.0,3.0,2.0,1,18.0,42.0,1,0,28.8,0.0,High,True,False,False
4,1505.0,4.0,3.0,0,4.0,37.0,0,1,2.8,0.0,High,True,False,False
5,2154.0,1.0,3.0,1,5.0,40.0,1,1,16.2,2.0,High,True,False,False
6,866.0,2.0,2.0,0,2.0,44.0,1,0,9.4,2.0,High,False,True,False
7,1379.874771,3.0,2.0,0,10.0,23.0,0,0,12.1,2.0,Medium,False,False,True
8,2234.0,4.0,2.0,1,19.0,23.0,1,0,5.8,2.0,High,False,True,False
9,1632.0,1.0,1.0,0,10.0,23.0,0,1,17.6,2.0,High,False,True,False
