In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("real-estate-india.csv")
df.head()

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes


In [3]:
df.describe()

Unnamed: 0,Total_Area,Price_per_SQFT,Baths
count,14528.0,14528.0,14528.0
mean,1297.916988,11719.456222,2.751239
std,1245.694305,49036.068632,0.898243
min,70.0,0.0,1.0
25%,650.0,4480.0,2.0
50%,1000.0,6050.0,3.0
75%,1439.0,9312.5,3.0
max,35000.0,999000.0,6.0


In [5]:
df.shape

(14528, 9)

In [6]:
df.columns

Index(['Name', 'Property Title', 'Price', 'Location', 'Total_Area',
       'Price_per_SQFT', 'Description', 'Baths', 'Balcony'],
      dtype='object')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14528 entries, 0 to 14527
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            14528 non-null  object 
 1   Property Title  14528 non-null  object 
 2   Price           14528 non-null  object 
 3   Location        14528 non-null  object 
 4   Total_Area      14528 non-null  int64  
 5   Price_per_SQFT  14528 non-null  float64
 6   Description     14528 non-null  object 
 7   Baths           14528 non-null  int64  
 8   Balcony         14528 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 1021.6+ KB


### Data cleaning

In [17]:
import re

# Updated function to convert 'Cr', 'Lakh', 'L', and handle invalid entries like 'acs'
def convert_price(price_str):
    # Remove the ₹ symbol and any commas
    price_str = price_str.replace('₹', '').replace(',', '').strip()

    # Handle the case where the price contains 'acs' or other irrelevant units
    if 'acs' in price_str.lower():
        return None  # Mark invalid entries with None, to be handled later
    
    # Convert 'Cr' (crore) to numerical value
    if 'Cr' in price_str:
        price_str = price_str.replace('Cr', '').strip()
        return float(price_str) * 1e7  # 1 crore = 10 million

    # Convert 'Lakh' or 'L' (lakh) to numerical value
    elif 'Lakh' in price_str or 'L' in price_str:
        price_str = re.sub(r'(Lakh|L)', '', price_str).strip()  # Remove 'Lakh' or 'L'
        return float(price_str) * 1e5  # 1 lakh = 100,000

    # If no 'Cr' or 'Lakh'/'L', assume it's a plain number
    try:
        return float(price_str)
    except ValueError:
        return None  # If it fails conversion, return None

# Apply the conversion function to the 'Price' column
df['Price'] = df['Price'].apply(convert_price)

# Drop rows where 'Price' is None (invalid entries)
df = df.dropna(subset=['Price'])

# Check the results
print(df['Price'].head())

0    19900000.0
1    22500000.0
2    10000000.0
3    33300000.0
4     4800000.0
Name: Price, dtype: float64


In [19]:
# Drop rows where 'Price' is None
df = df.dropna(subset=['Price'])

# Check the results
print(df['Price'].head())


0    19900000.0
1    22500000.0
2    10000000.0
3    33300000.0
4     4800000.0
Name: Price, dtype: float64


In [9]:
# Dropping irrelevant columns
df = df.drop(columns=["Name", "Property Title", "Description"])

In [10]:
df.head(2)

Unnamed: 0,Price,Location,Total_Area,Price_per_SQFT,Baths,Balcony
0,₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,4,Yes
1,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,6,Yes


In [11]:
# Check for missing values
df.isnull().sum()

Price             0
Location          0
Total_Area        0
Price_per_SQFT    0
Baths             0
Balcony           0
dtype: int64

In [12]:
# Only if there are missing values
'''
# Fill missing values in 'Baths' and 'Balcony' with the median of the column
df['Baths'].fillna(df['Baths'].median(), inplace=True)
df['Balcony'].fillna(df['Balcony'].median(), inplace=True)

# If there are any missing values in the target variable 'Price', drop those rows
df = df.dropna(subset=['Price'])
'''

"\n# Fill missing values in 'Baths' and 'Balcony' with the median of the column\ndf['Baths'].fillna(df['Baths'].median(), inplace=True)\ndf['Balcony'].fillna(df['Balcony'].median(), inplace=True)\n\n# If there are any missing values in the target variable 'Price', drop those rows\ndf = df.dropna(subset=['Price'])\n"

In [13]:
# Remove outliers in the data

# Removing outliers in Price using IQR (Interquartile Range)
Q1 = df['Price'].quantile(0.25)
Q3 = df['Price'].quantile(0.75)
IQR = Q3 - Q1
df = df[(df['Price'] >= (Q1 - 1.5 * IQR)) & (df['Price'] <= (Q3 + 1.5 * IQR))]

TypeError: unsupported operand type(s) for -: 'str' and 'str'