In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io

In [2]:
df = pd.read_csv("D:\Downloads\Real Estate Data V21.csv\Real Estate Data V21.csv")

In [3]:
df.head()

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes


# function to convert price column from string to integer 

In [4]:
def convert_price_to_rupees(price):
    """
    Converts a price string (e.g., "₹1.99 Cr", "₹48.0 L") into a numerical
    value in Rupees.

    Args:
        price (str or number): The price string or number to convert.

    Returns:
        float: The price in Rupees as a float, or NaN if conversion fails.
    """
    # If the price is not a string, it might already be a number.
    if not isinstance(price, str):
        try:
            return float(price)
        except (ValueError, TypeError):
            return np.nan

    # 1. Clean the string by removing the currency symbol and whitespace
    price_clean = price.replace('₹', '').strip()

    try:
        # 2. Check for 'Cr' (Crore) and convert
        if 'Cr' in price_clean:
            value = float(price_clean.replace('Cr', '').strip())
            return value * 10000000  # 1 Crore = 1,00,00,000 Rupees
        
        # 3. Check for 'L' (Lakh) and convert
        elif 'L' in price_clean:
            value = float(price_clean.replace('L', '').strip())
            return value * 100000  # 1 Lakh = 1,00,000 Rupees
            
        # If no unit is specified, assume it's already in Rupees
        else:
            return float(price_clean)
            
    except (ValueError, TypeError):
        # If conversion to float fails at any point, return Not a Number (NaN)
        return np.nan

# --- Demonstration of the function with your local file ---

# To use your data file, provide the file path directly to pd.read_csv().
# NOTE: This script must be run on your computer where the file is located.
# On Windows, it's best to use a "raw" string (r"...") to handle backslashes.

filepath = r"D:\Downloads\Real Estate Data V21.csv\Real Estate Data V21.csv"

try:
    # Read the data from your specified CSV file path
    df = pd.read_csv(filepath)
    print(f"Successfully loaded data from: {filepath}")
    print("\nOriginal DataFrame (first 5 rows):")
    print(df.head())
    print("\n" + "="*50 + "\n")

    # Apply the function to the entire 'Price' column
    df['Price_Rupees'] = df['Price'].apply(convert_price_to_rupees)

    print("DataFrame after applying the conversion function (first 5 rows):")
    print(df[['Price', 'Price_Rupees']].head())

    # Verify the data type of the new column
    print("\nData types of the DataFrame:")
    df.info()

except FileNotFoundError:
    print(f"Error: File not found at the specified path.")
    print(f"Please make sure the file exists at: {filepath}")
except Exception as e:
    print(f"An error occurred while reading or processing the file: {e}")

Successfully loaded data from: D:\Downloads\Real Estate Data V21.csv\Real Estate Data V21.csv

Original DataFrame (first 5 rows):
                                      Name  \
0                         Casagrand ECR 14   
1    Ramanathan Nagar, Pozhichalur,Chennai   
2                              DAC Prapthi   
3  Naveenilaya,Chepauk, Triplicane,Chennai   
4                 VGN Spring Field Phase 1   

                                      Property Title     Price  \
0  4 BHK Flat for sale in Kanathur Reddikuppam, C...  ₹1.99 Cr   
1  10 BHK Independent House for sale in Pozhichal...  ₹2.25 Cr   
2      3 BHK Flat for sale in West Tambaram, Chennai   ₹1.0 Cr   
3  7 BHK Independent House for sale in Triplicane...  ₹3.33 Cr   
4              2 BHK Flat for sale in Avadi, Chennai   ₹48.0 L   

                                   Location  Total_Area  Price_per_SQFT  \
0             Kanathur Reddikuppam, Chennai        2583          7700.0   
1     Ramanathan Nagar, Pozhichalur,Chennai   

## function for coverting yes or no data from balcony to 0 and 1

In [5]:
def process_categorical_data(df):
    """
    Handles conversion of categorical columns to a numerical format.
    - Converts 'Balcony' from 'Yes'/'No' to 1/0.
    - Fills any missing values with the mode.
    """
    print("Step 2: Handling 'Balcony' categorical data.")
    
    # Create a mapping dictionary and apply it
    balcony_map = {'Yes': 1, 'No': 0}
    df['Balcony_Numeric'] = df['Balcony'].map(balcony_map)

    # Fill any potential missing values with the mode (most frequent value)
    if not df['Balcony_Numeric'].empty:
        mode_balcony = df['Balcony_Numeric'].mode()[0]
        df['Balcony_Numeric'].fillna(mode_balcony, inplace=True)
    
    print("\nDataFrame after converting 'Balcony' to numeric (1 for Yes, 0 for No):")
    print(df[['Balcony', 'Balcony_Numeric']].head())
    return df

# --- Main Data Processing Steps ---

# Assume 'df' is a DataFrame that has already been loaded in a previous cell.
# For example:
# df = pd.read_csv(r"D:\Downloads\Real Estate Data V21.csv\Real Estate Data V21.csv")

# Make sure the DataFrame 'df' exists before running this cell.
if 'df' in locals() or 'df' in globals():
    print("DataFrame 'df' found. Starting data processing...")
    
    # --- Step 1: Price Cleaning ---
    df['Price_Rupees'] = df['Price'].apply(convert_price_to_rupees)
    print("\nStep 1: Cleaned 'Price' column and created 'Price_Rupees'.")
    print(df[['Price', 'Price_Rupees']].head())
    
    print("\n" + "="*50 + "\n")

    # --- Step 2: Handling Categorical Data (calling our new function) ---
    df = process_categorical_data(df)
    # Verify the final data types
    print("\n" + "="*50 + "\n")
    print("Final DataFrame Info:")
    df.info()

else:
    print("Error: DataFrame 'df' not found.")
    print("Please make sure you have loaded your data into a DataFrame named 'df' in a cell before this one.")

DataFrame 'df' found. Starting data processing...

Step 1: Cleaned 'Price' column and created 'Price_Rupees'.
      Price  Price_Rupees
0  ₹1.99 Cr    19900000.0
1  ₹2.25 Cr    22500000.0
2   ₹1.0 Cr    10000000.0
3  ₹3.33 Cr    33300000.0
4   ₹48.0 L     4800000.0


Step 2: Handling 'Balcony' categorical data.

DataFrame after converting 'Balcony' to numeric (1 for Yes, 0 for No):
  Balcony  Balcony_Numeric
0     Yes                1
1     Yes                1
2      No                0
3     Yes                1
4     Yes                1


Final DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14528 entries, 0 to 14527
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             14528 non-null  object 
 1   Property Title   14528 non-null  object 
 2   Price            14528 non-null  object 
 3   Location         14528 non-null  object 
 4   Total_Area       14528 non-null  int64 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Balcony_Numeric'].fillna(mode_balcony, inplace=True)


## One-Hot Encode Location Column

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
def encode_location_data(df):
    """
    Performs one-hot encoding on the 'Location' column.
    
    Args:
        df (pd.DataFrame): The DataFrame to process.

    Returns:
        pd.DataFrame: The DataFrame with new one-hot encoded location columns.
    """
    print("Step 3: 'Location' column one-hot encoded.")
    if 'Location' not in df.columns:
        print("Warning: 'Location' column not found.")
        return df
        
    location_dummies = pd.get_dummies(df['Location'], prefix='Location', dummy_na=False)
    df = pd.concat([df, location_dummies], axis=1)
    return df

# --- Main Pipeline Function ---
def run_price_prediction_pipeline(df):
    """
    Executes the full data cleaning, feature engineering, training,
    and evaluation pipeline.
    
    Args:
        df (pd.DataFrame): The raw DataFrame with housing data.

    Returns:
        None: Prints the evaluation results.
    """
    print("--- Starting Data Processing Pipeline ---")
    
    # Step 1: Clean Price Column
    df['Price_Rupees'] = df['Price'].apply(convert_price_to_rupees)
    df.dropna(subset=['Price_Rupees'], inplace=True)
    print("Step 1: Price column cleaned.")

    # Step 2: Handle Balcony Column
    balcony_map = {'Yes': 1, 'No': 0}
    df['Balcony_Numeric'] = df['Balcony'].map(balcony_map)
    if not df['Balcony_Numeric'].empty:
        mode_balcony = df['Balcony_Numeric'].mode()[0]
        df['Balcony_Numeric'].fillna(mode_balcony, inplace=True)
    print("Step 2: 'Balcony' column converted to numeric.")

    # Step 3: One-Hot Encode Location Column (calling our new function)
    df = encode_location_data(df)

    # Step 4: Feature Selection (X) and Target (y)
    numerical_features = ['Total_Area', 'Baths', 'Balcony_Numeric']
    location_features = [col for col in df.columns if col.startswith('Location_')]
    
    features = numerical_features + location_features
    X = df[features]
    y = df['Price_Rupees']
    
    X.fillna(0, inplace=True)
    print("Step 4: Features (X) and Target (y) selected.")

    # Step 5: Split Data into Training and Testing Sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Step 5: Data split into {len(X_train)} training samples and {len(X_test)} testing samples.")

    # Step 6: Train a Linear Regression Model
    model = LinearRegression()
    model.fit(X_train, y_train)
    print("\n--- Model Training Complete ---")

    # Step 7: Evaluate the Model
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    
    print("\n--- Model Evaluation Results ---")
    print(f"Root Mean Squared Error (RMSE): ₹{rmse:,.2f}")
    
    print("\nSample Predictions vs. Actual Prices:")
    results_df = pd.DataFrame({'Actual Price': y_test, 'Predicted Price': predictions}).head()
    print(results_df.to_string(formatters={'Actual Price': '{:,.2f}'.format, 'Predicted Price': '{:,.2f}'.format}))


# --- Demonstration ---
# This script assumes 'df' is a DataFrame that has already been loaded.
# For example:
# df = pd.read_csv(r"D:\Downloads\Real Estate Data V21.csv\Real Estate Data V21.csv")

if 'df' in locals() or 'df' in globals():
    chennai_df = df[df['Location'].str.contains('Chennai', na=False, case=False)].copy()
    if not chennai_df.empty:
        run_price_prediction_pipeline(chennai_df)
    else:
        print("No properties found for Chennai in the DataFrame.")
else:
    print("Error: DataFrame 'df' not found.")
    print("Please make sure you have loaded your data into a DataFrame named 'df' in a cell before this one.")



--- Starting Data Processing Pipeline ---
Step 1: Price column cleaned.
Step 2: 'Balcony' column converted to numeric.
Step 3: 'Location' column one-hot encoded.
Step 4: Features (X) and Target (y) selected.
Step 5: Data split into 1276 training samples and 319 testing samples.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Balcony_Numeric'].fillna(mode_balcony, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)



--- Model Training Complete ---

--- Model Evaluation Results ---
Root Mean Squared Error (RMSE): ₹3,667,302,995,960,731.50

Sample Predictions vs. Actual Prices:
     Actual Price        Predicted Price
917  5,500,000.00           3,540,984.57
1120 7,000,000.00 -18,811,420,797,133.72
1276 3,500,000.00           4,835,070.08
361  4,200,000.00 -18,811,423,160,574.45
1484 6,000,000.00           6,354,239.02


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# --- Helper Function for Price Conversion ---
def convert_price_to_rupees(price):
    """Converts a price string into a numerical value in Rupees."""
    if not isinstance(price, str):
        try:
            return float(price)
        except (ValueError, TypeError):
            return np.nan
    price_clean = price.replace('₹', '').strip()
    try:
        if 'Cr' in price_clean:
            return float(price_clean.replace('Cr', '').strip()) * 10000000
        elif 'L' in price_clean:
            return float(price_clean.replace('L', '').strip()) * 100000
        else:
            return float(price_clean)
    except (ValueError, TypeError):
        return np.nan

# --- Helper Function for Location Encoding ---
def encode_location_data(df):
    """Performs one-hot encoding on 'Location', dropping one category."""
    if 'Location' not in df.columns: return df
    location_dummies = pd.get_dummies(df['Location'], prefix='Location', drop_first=True)
    df = pd.concat([df, location_dummies], axis=1)
    return df

# --- Main Pipeline Function ---
def run_price_prediction_pipeline(df):
    """Executes the full data cleaning, feature engineering, training, and evaluation pipeline."""
    print("--- Starting Data Processing Pipeline ---")
    
    # Step 1: Clean Price Column
    df['Price_Rupees'] = df['Price'].apply(convert_price_to_rupees)
    df.dropna(subset=['Price_Rupees'], inplace=True)
    print(f"Step 1: Price column cleaned. {len(df)} rows remaining.")

    # Step 2: Remove Extreme Outliers
    for col in ['Price_Rupees', 'Total_Area']:
        if col in df.columns:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    print(f"Step 2: Outliers removed. {len(df)} rows remaining.")

    # Step 3: Handle Categorical and Missing Feature Data
    balcony_map = {'Yes': 1, 'No': 0}
    df['Balcony_Numeric'] = df['Balcony'].map(balcony_map).fillna(df['Balcony'].map(balcony_map).mode()[0])
    if 'Baths' in df.columns:
        df['Baths'] = df['Baths'].fillna(df['Baths'].mode()[0])
    print("Step 3: 'Balcony' and 'Baths' columns cleaned.")

    # Step 4: One-Hot Encode Location
    df = encode_location_data(df)
    print("Step 4: 'Location' column one-hot encoded.")

    # Step 5: Feature Selection and Log Transformation of Target
    numerical_features = ['Total_Area', 'Baths', 'Balcony_Numeric']
    location_features = [col for col in df.columns if col.startswith('Location_')]
    features = [f for f in numerical_features + location_features if f in df.columns]
    X = df[features].copy()
    
    # Apply log transformation to the target variable 'y' to handle skewness
    y = np.log1p(df['Price_Rupees'])
    X.fillna(0, inplace=True)
    print("Step 5: Features selected and Target log-transformed.")

    # Step 6: Split Data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Step 6: Data split into {len(X_train)} training and {len(X_test)} testing samples.")

    # Step 7: Scale Numerical Features
    scaler = StandardScaler()
    X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
    X_test[numerical_features] = scaler.transform(X_test[numerical_features])
    print("Step 7: Numerical features scaled.")

    # Step 8: Train a More Stable Model (Ridge Regression)
    # alpha=1.0 is a good default for regularization strength
    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train)
    print("\n--- Model Training Complete ---")

    # Step 9: Evaluate the Model
    # The model predicts the log of the price, so we need to convert it back
    log_predictions = model.predict(X_test)
    predictions = np.expm1(log_predictions) # Inverse transform
    
    # The actual values also need to be inverse-transformed for comparison
    actual_values = np.expm1(y_test)
    
    rmse = np.sqrt(mean_squared_error(actual_values, predictions))
    
    print("\n--- Model Evaluation Results ---")
    print(f"Root Mean Squared Error (RMSE): ₹{rmse:,.2f}")
    
    print("\nSample Predictions vs. Actual Prices:")
    results_df = pd.DataFrame({'Actual Price': actual_values, 'Predicted Price': predictions}).head()
    print(results_df.to_string(formatters={'Actual Price': '{:,.2f}'.format, 'Predicted Price': '{:,.2f}'.format}))

# --- Demonstration ---
# This script assumes 'df' is a DataFrame that has already been loaded
# in your notebook (e.g., in a previous cell).
if 'df' in locals() or 'df' in globals():
    # Filter for Chennai properties
    chennai_df = df[df['Location'].str.contains('Chennai', na=False, case=False)].copy()
    if not chennai_df.empty:
        # Run the entire pipeline
        run_price_prediction_pipeline(chennai_df)
    else:
        print("No properties found for Chennai in the DataFrame.")

--- Starting Data Processing Pipeline ---
Step 1: Price column cleaned. 1595 rows remaining.
Step 2: Outliers removed. 1342 rows remaining.
Step 3: 'Balcony' and 'Baths' columns cleaned.
Step 4: 'Location' column one-hot encoded.
Step 5: Features selected and Target log-transformed.
Step 6: Data split into 1073 training and 269 testing samples.
Step 7: Numerical features scaled.

--- Model Training Complete ---

--- Model Evaluation Results ---
Root Mean Squared Error (RMSE): ₹2,622,595.50

Sample Predictions vs. Actual Prices:
     Actual Price Predicted Price
894 13,000,000.00    8,307,518.73
614  2,700,000.00    3,691,154.22
637  5,500,000.00    4,510,720.12
477  7,200,000.00    8,645,000.21
295  7,200,000.00    7,282,958.77


In [13]:
df.head()

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony,Price_Rupees,Balcony_Numeric
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes,19900000.0,1
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes,22500000.0,1
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No,10000000.0,0
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes,33300000.0,1
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes,4800000.0,1
