Day2.ipynb - We already have 800+ unique listings (2,511 raw rows before de-duplication). This notebook cleans that data so it can be turned into useful insights.

In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw listings and work on a copy so `df` keeps the untouched data.
# NOTE(review): this is an absolute, machine-specific path - consider a configurable data directory.
raw_csv_path = "/home/kepha/nairobi_property/data/raw_listings.csv"
df = pd.read_csv(raw_csv_path)
df_clean = df.copy()
print(f"Starting rows: {len(df_clean)}")

Starting rows: 2511


REMOVING DUPLICATES

In [3]:
# Count fully identical rows first, then drop the repeats.
n_duplicates = df_clean.duplicated().sum()
print(f"Duplicates before: {n_duplicates}")
df_clean = df_clean.drop_duplicates()

Duplicates before: 1612


In [4]:
# Confirm the de-duplication: remaining row count and remaining duplicate count.
rows_left = len(df_clean)
duplicates_left = df_clean.duplicated().sum()
print(f"Rows after removing duplicates: {rows_left}")
print(f"Duplicates after: {duplicates_left}")

Rows after removing duplicates: 899
Duplicates after: 0


REMOVING PRICE OUTLIERS

In [5]:
# Keep listings priced between 1M and 500M KES (both bounds inclusive).
# Rows with a missing price fail the range check and are dropped as well.
price_in_range = df_clean['price_kes'].between(1_000_000, 500_000_000)
df_clean = df_clean[price_in_range]
print(f"After price filtering, 1M-500M: {len(df_clean)} rows remain")


After price filtering, 1M-500M: 890 rows remain


SORTING SIZE OUTLIERS

In [6]:
# Summary statistics for size, to spot implausible minimum and maximum values.
size_summary = df_clean['size_sqft'].describe()
print(size_summary)

count       799.000000
mean       2172.765144
std        5192.789039
min          10.760000
25%         764.240000
50%        1259.390000
75%        2174.330000
max      107640.000000
Name: size_sqft, dtype: float64


In [7]:
# Flag listings whose size looks implausibly small for the price:
# under 200 sqft but priced above 10M KES.
suspicious_small = df_clean['size_sqft'].lt(200) & df_clean['price_kes'].gt(10_000_000)
review_cols = ['location', 'bedrooms', 'size_sqft', 'price_kes']
print(df_clean.loc[suspicious_small, review_cols])


           location  bedrooms  size_sqft  price_kes
176        Kilimani         4     129.17   25000000
199       Westlands         2      96.88   14000000
234   Spring Valley         3      43.06   23100000
332        Kitisuru         4      32.29   60000000
339        Kitisuru         5      53.82   85000000
349     Kiambu Road         7      75.35   85000000
361       Westlands         2      96.88   14000000
364        Kilimani         4     129.17   25000000
446           Karen         5      75.35  170000000
458           Karen         4      53.82  105000000
481           Runda         4      53.82  120000000
483           Nyari         8      86.11  180000000
581       Westlands         3      21.53   28000000
588        Kitisuru         5      86.11  350000000
601         Loresho         4      43.06   75000000
650      Kileleshwa         2      96.88   90000000
733           Runda         5      64.58  350000000
763           Kyuna         4      32.29   48000000
768         

In [8]:
# Assume a misplaced decimal point in the flagged sizes and scale them up by 10.
# NOTE: the mask comes from the previous cell, so re-running this cell on its own
# multiplies the same rows by 10 again.
df_clean.loc[suspicious_small, 'size_sqft'] *= 10
print(df_clean.loc[suspicious_small, ['location', 'bedrooms', 'size_sqft', 'price_kes']])


           location  bedrooms  size_sqft  price_kes
176        Kilimani         4     1291.7   25000000
199       Westlands         2      968.8   14000000
234   Spring Valley         3      430.6   23100000
332        Kitisuru         4      322.9   60000000
339        Kitisuru         5      538.2   85000000
349     Kiambu Road         7      753.5   85000000
361       Westlands         2      968.8   14000000
364        Kilimani         4     1291.7   25000000
446           Karen         5      753.5  170000000
458           Karen         4      538.2  105000000
481           Runda         4      538.2  120000000
483           Nyari         8      861.1  180000000
581       Westlands         3      215.3   28000000
588        Kitisuru         5      861.1  350000000
601         Loresho         4      430.6   75000000
650      Kileleshwa         2      968.8   90000000
733           Runda         5      645.8  350000000
763           Kyuna         4      322.9   48000000
768         

In [9]:
# Express size in acres as well (1 acre = 43,560 sqft) to make the scale easier to judge.
df_clean['size_acres'] = df_clean['size_sqft'].div(43560)

# List every listing above 20,000 sqft, biggest first.
acre_view_cols = ['location', 'bedrooms', 'size_sqft', 'size_acres', 'price_kes']
over_20k_sqft = df_clean['size_sqft'] > 20000
large_properties = df_clean.loc[over_20k_sqft, acre_view_cols].sort_values(
    'size_sqft', ascending=False
)

print(large_properties)

       location  bedrooms  size_sqft  size_acres  price_kes
993    Kilimani         3  107640.00    2.471074   24000000
1806  Westlands         1   52722.07    1.210332    6200000
1108      Karen        11   46511.24    1.067751  210000000
1794      Runda         5   46511.24    1.067751   98000000
333     Loresho         4   43099.06    0.989418   75000000


In [10]:
# Remove the very large properties (over 20,000 sqft): they are likely outliers and may skew the analysis.
# These are not the rows that were multiplied by 10 earlier (those started below 200 sqft),
# and the filter looks at size only - price per sqft is not used here.
# Rows with a missing size are not flagged, so they stay in the data.
is_oversized = df_clean['size_sqft'] > 20000
large_properties = df_clean[is_oversized]
df_clean = df_clean[~is_oversized]

In [11]:
# Drop listings smaller than 200 sqft; rows with no size recorded are kept for now.
# (The filter keeps exactly 200 sqft too, although the message below reads ">200".)
size_col = df_clean['size_sqft']
df_clean = df_clean[size_col.ge(200) | size_col.isna()]
print(f"After size filtering (>200 sqft): {len(df_clean)} rows")


After size filtering (>200 sqft): 881 rows


In [12]:

# How many of the remaining listings have no size recorded?
size_is_missing = df_clean['size_sqft'].isna()
missing_sizes = size_is_missing.sum()
print(f"Rows with missing sizes: {missing_sizes}")

Rows with missing sizes: 91


In [13]:
# Dropping rows with missing sizes for now, as we cannot impute them without more information.
has_size = df_clean['size_sqft'].notna()
df_clean = df_clean[has_size]
print(f"After dropping missing sizes: {len(df_clean)} rows")

After dropping missing sizes: 790 rows


SORTING AMENITIES

In [14]:
# Dropping properties with no amenities listed, as they are likely incomplete listings.
has_amenities = df_clean['amenities'].notna()
df_clean = df_clean[has_amenities]
print(f"After dropping missing amenities: {len(df_clean)} rows")

After dropping missing amenities: 759 rows


FINALLY: STANDARDIZING THE TEXT

In [15]:
# Title-case the free-text columns and trim surrounding whitespace so that
# spellings such as "karen " and "Karen" end up identical.
# NOTE(review): duplicates were dropped before this normalisation, so rows that
# differed only by case/whitespace would not have been merged - confirm whether
# a second de-duplication is wanted.
for text_col in ('location', 'property_type'):
    df_clean[text_col] = df_clean[text_col].str.title().str.strip()

In [16]:
# Final tally of listings that survived every cleaning step.
n_kept = len(df_clean)
print(f"Cleaning complete, Kept {n_kept} listings")

Cleaning complete, Kept 759 listings


FEATURE ENGINEERING

In [17]:
# Price per square foot: listing price (KES) divided by size (sqft).
df_clean['price_per_sqft'] = df_clean['price_kes'].div(df_clean['size_sqft'])
print("Created: price_per_sqft")

Created: price_per_sqft


In [18]:
# Creating amenity score
# Counts the amenities each property lists, e.g. "Parking, Pool, Gym" -> 3 amenities.

def count_amenities(amenities_str):
    """Return how many amenities are named in a comma-separated string.

    Parameters
    ----------
    amenities_str : str or missing value
        Comma-separated amenity names, e.g. "Parking, Pool, Gym".

    Returns
    -------
    int
        Number of non-empty, comma-separated entries. Returns 0 when the value
        is not text (None / NaN / pd.NA), is empty or whitespace only, or is
        the placeholder 'None' (matched ignoring case and surrounding spaces).
    """
    if not isinstance(amenities_str, str):
        # Missing values arrive as None/NaN rather than text; nothing to count.
        return 0
    if amenities_str.strip().casefold() == 'none':
        return 0
    # Count actual entries instead of commas, so an empty string, a trailing
    # comma or a doubled comma does not inflate the score.
    return sum(1 for entry in amenities_str.split(',') if entry.strip())

# Score every listing by the number of amenities it names.
amenity_counts = df_clean['amenities'].apply(count_amenities)
df_clean['amenity_score'] = amenity_counts
print("Created: amenity_score")

Created: amenity_score


In [19]:
# Parse the listing date once, store it back as datetime, and derive the calendar month.
listing_dates = pd.to_datetime(df_clean['listing_date'])
df_clean['listing_date'] = listing_dates
df_clean['month'] = listing_dates.dt.month
print("Created: month")

Created: month


In [20]:
# size_acres was only a helper for inspecting the large properties; remove it before saving.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second run no longer raises KeyError once the column is gone.
df_clean = df_clean.drop(columns=['size_acres'], errors='ignore')

SAVE TO CLEAN_LISTINGS CSV

In [21]:
# Write the cleaned listings to disk, leaving the DataFrame index out of the file.
# NOTE(review): this is an absolute, machine-specific path - consider a configurable data directory.
clean_csv_path = "/home/kepha/nairobi_property/data/clean_listings.csv"
df_clean.to_csv(clean_csv_path, index=False)