In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
import re
from selenium.webdriver.common.keys import Keys


def setup_driver(driver_path=None, firefox_binary_path=None, headless=True):
    """Create a Firefox WebDriver backed by a local geckodriver.

    Args:
        driver_path: Path to the geckodriver executable. Defaults to the
            location this notebook was originally written against.
        firefox_binary_path: Path to the Firefox executable. Defaults to the
            standard Windows install location used originally.
        headless: When true (the default), start Firefox without a window.

    Returns:
        A ``selenium.webdriver.Firefox`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    if driver_path is None:
        driver_path = 'C:/Users/Lenovo/Downloads/geckodriver-v0.34.0-win32/geckodriver.exe'
    if firefox_binary_path is None:
        firefox_binary_path = 'C:/Program Files/Mozilla Firefox/firefox.exe'

    service = FirefoxService(executable_path=driver_path)
    options = FirefoxOptions()
    if headless:
        # The `options.headless = True` setter was deprecated and later
        # removed from Selenium 4; the command-line flag works on every
        # version that provides FirefoxOptions.
        options.add_argument('-headless')
    options.binary_location = firefox_binary_path

    return webdriver.Firefox(service=service, options=options)


# Title format handled: "<n> BHK Flat for Sale in <project>, <location...>".
# Group 1 is the BHK count, group 2 everything after "in".
_TITLE_PATTERN = re.compile(r'(\d+)\sBHK Flat\s+for\s+Sale\s+in\s+(.+)')
_SUMMARY_VALUE_CLASS = 'mb-srp__card__summary--value'


def _stripped_text(element):
    """Return the element's stripped text, or None when the element is missing."""
    return element.text.strip() if element is not None else None


def _summary_value(container, tag, summary_key):
    """Return the value text of a ``data-summary`` entry, or None if absent."""
    wrapper = container.find(tag, {'data-summary': summary_key})
    if wrapper is None:
        return None
    # The wrapper can exist without its value <div>; treat that as "no value"
    # instead of raising and losing the whole listing.
    return _stripped_text(wrapper.find('div', class_=_SUMMARY_VALUE_CLASS))


def _price_text(container, css_class):
    """Return the text of a price element with the rupee sign removed, or None."""
    element = container.find('div', class_=css_class)
    if element is None:
        return None
    return element.text.replace('₹', '').strip()


def _parse_card(container):
    """Build one record from a listing card, or None if its title is not a flat sale."""
    title_element = container.find('h2', class_='mb-srp__card--title')
    if title_element is None:
        return None

    match = _TITLE_PATTERN.match(title_element.text.strip())
    if match is None:
        return None

    bhk = match.group(1)
    # Text before the first ", " is the project; the rest is the location.
    project, _, location = match.group(2).strip().partition(', ')

    # Prefer the highlighted developer name, then the society link.
    developer_name = _stripped_text(
        container.find('span', class_='mb-srp__card__developer--name--highlight')
    )
    if developer_name is None:
        developer_name = _stripped_text(
            container.find('a', class_='mb-srp__card__society--name')
        )
    if developer_name is None:
        developer_name = "Unknown"

    return {
        'project': project,
        'location': location,
        'bhk': bhk,
        # NOTE(review): the floor summary is looked up as a <span> while the
        # area summaries are <div>s -- confirm against the live markup.
        'total_floors': _summary_value(container, 'span', 'floor'),
        'carpet_area': _summary_value(container, 'div', 'carpet-area'),
        'super_area': _summary_value(container, 'div', 'super-area'),
        'price': _price_text(container, 'mb-srp__card__price--amount'),
        'rate': _price_text(container, 'mb-srp__card__price--size'),
        'developer_name': developer_name,
    }


def extract_data(soup):
    """Extract one record per flat-for-sale listing card found in ``soup``.

    Args:
        soup: Parsed results page; only ``find_all``/``find``/``text`` are used.

    Returns:
        A list of dicts with the keys ``project``, ``location``, ``bhk``,
        ``total_floors``, ``carpet_area``, ``super_area``, ``price``, ``rate``
        and ``developer_name``. Fields missing from a card are None
        (``developer_name`` falls back to "Unknown"). Cards whose title does
        not match the flat-for-sale pattern are skipped.
    """
    data = []

    for container in soup.find_all('div', class_='mb-srp__list'):
        try:
            record = _parse_card(container)
        except Exception as e:
            # Best effort: one malformed card must not abort the whole page.
            print(f"Error extracting data: {e}")
            continue
        if record is not None:
            data.append(record)

    return data



def scrape_and_save_data(driver, url, target_records, output_path, max_retries=3):
    """Scrape paginated listing pages and write the records to an Excel file.

    Pages are requested as ``f"{url}&page={n}"`` starting at page 1, so ``url``
    must already contain a query string. Scraping stops once at least
    ``target_records`` records are collected, when a page yields no records,
    or when too many consecutive attempts fail.

    Args:
        driver: Selenium WebDriver used to load the pages.
        url: Base search URL (with an existing query string).
        target_records: Minimum number of records to collect before stopping.
        output_path: Destination ``.xlsx`` path.
        max_retries: Consecutive failed attempts tolerated before giving up
            and saving whatever has been collected so far.

    Returns:
        None. The resulting DataFrame is written to ``output_path`` and
        printed. Nothing is written when no record was scraped.
    """
    columns = ['location', 'bhk', 'total_floors', 'carpet_area', 'super_area',
               'price', 'rate', 'developer_name']
    data = []
    page_number = 1
    failed_attempts = 0
    wait = WebDriverWait(driver, 120)

    while len(data) < target_records:
        try:
            driver.get(f"{url}&page={page_number}")

            # Wait until at least one listing card is present in the DOM.
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'mb-srp__card__container')))

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            extracted_data = extract_data(soup)

            if not extracted_data:
                break  # Break if no more records are found on the current page

            failed_attempts = 0
            data.extend(extracted_data)

            print(f"Scraped {len(data)} records from page {page_number}")
            page_number += 1

            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)  # Add a delay to let the page load

        except Exception as e:
            print(f"Error: {e}")
            failed_attempts += 1
            if failed_attempts > max_retries:
                # Without this bound a page that never loads (for example,
                # past the last results page) would be retried forever.
                print(f"Giving up on page {page_number} after {max_retries} retries.")
                break
            print(f"Retrying page {page_number}...")
            time.sleep(5)

    if not data:
        # An empty frame has no 'project'/'location' columns to combine.
        print("No records were scraped; nothing to save.")
        return

    df = pd.DataFrame(data)

    # Recombine project and location into one column; selecting `columns`
    # below also drops the now-redundant 'project' column.
    df['location'] = df['project'] + ', ' + df['location']
    df = df[columns]

    df.to_excel(output_path, index=False)

    print(df)



# Search to scrape, how many records to collect, and where to save them.
url = "https://www.magicbricks.com/flats-in-pune-for-sale-pppfs?mbtracker=google_paid_brand_sitelink_pune&cCode=sem_brand_sitelink&gclid=Cj0KCQiAtOmsBhCnARIsAGPa5yaBq3zdZaTU7DVD_bDEeLInDGq7qKAX9CgpiR-3ZKE8dijMEZV7VeIaAsChEALw_wcB"
target_records = 3500
output_path = 'C:/Users/Lenovo/Downloads/MohitGanvir_99acres.xlsx'

driver = setup_driver()
try:
    scrape_and_save_data(driver, url, target_records, output_path)
finally:
    # Always shut the browser down, even if scraping or saving fails;
    # otherwise the Firefox/geckodriver processes are left running.
    driver.quit()


Scraped 30 records from page 1
Scraped 60 records from page 2
Scraped 90 records from page 3
Scraped 120 records from page 4
Scraped 150 records from page 5
Scraped 180 records from page 6
Scraped 210 records from page 7
Scraped 240 records from page 8
Scraped 270 records from page 9
Scraped 300 records from page 10
Scraped 330 records from page 11
Scraped 360 records from page 12
Scraped 390 records from page 13
Scraped 420 records from page 14
Scraped 449 records from page 15
Scraped 479 records from page 16
Scraped 508 records from page 17
Scraped 538 records from page 18
Scraped 568 records from page 19
Scraped 598 records from page 20
Scraped 628 records from page 21
Scraped 658 records from page 22
Scraped 687 records from page 23
Scraped 717 records from page 24
Scraped 747 records from page 25
Scraped 777 records from page 26
Scraped 807 records from page 27
Scraped 837 records from page 28
Scraped 867 records from page 29
Scraped 897 records from page 30
Scraped 927 records fr

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer


def _price_to_rupees(text):
    """Convert a price label such as '1.2 Cr' or '75 Lac' to rupees.

    Returns NaN for missing or unparseable values so they can be imputed
    later instead of aborting the whole run.
    """
    if not isinstance(text, str):
        return np.nan
    tokens = text.replace(',', '').split()
    if not tokens:
        return np.nan
    try:
        amount = float(tokens[0])
    except ValueError:
        return np.nan
    # As before: 'Cr' means crore (1e7); everything else is treated as lakh (1e5).
    return amount * 10000000 if 'Cr' in text else amount * 100000


def _first_number(series):
    """Return the first number found in each value as float (NaN if none).

    Thousands separators are removed first so '1,200 sqft' yields 1200 rather
    than 1, and decimals are kept. Casting to str lets this work on an
    all-empty column, where the ``.str`` accessor would otherwise fail.
    """
    text = series.astype(str).str.replace(',', '', regex=False)
    return pd.to_numeric(text.str.extract(r'(\d+(?:\.\d+)?)', expand=False), errors='coerce')


# Load the listings produced by the scraping step.
data = pd.read_excel('C:/Users/Lenovo/Downloads/MohitGanvir_99acres.xlsx')

# --- Cleaning ---------------------------------------------------------------
data['price'] = data['price'].apply(_price_to_rupees)

# Coerce to numeric first so a text column cannot break .mean().
data['total_floors'] = pd.to_numeric(data['total_floors'], errors='coerce')
data['total_floors'] = data['total_floors'].fillna(data['total_floors'].mean())

data['carpet_area'] = _first_number(data['carpet_area'])
data['super_area'] = _first_number(data['super_area'])
# Fall back to the carpet area when no super area is listed.
data['super_area'] = data['super_area'].fillna(data['carpet_area'])
data['price'] = data['price'].fillna(data['price'].mean())

# Keep only digits and dots of the rate text; unparseable values become NaN.
data['rate'] = pd.to_numeric(
    data['rate'].astype(str).str.replace(r'[^\d.]', '', regex=True),
    errors='coerce',
)

data['bhk'] = pd.to_numeric(data['bhk'], errors='coerce')

# LabelEncoder cannot sort a mix of NaN and str, so fill blanks first.
le = LabelEncoder()
data['project_encoded'] = le.fit_transform(data['developer_name'].fillna('Unknown').astype(str))

# --- Features and target ----------------------------------------------------
# NOTE(review): 'rate' looks like the listing's per-area price; if so, together
# with the area columns it nearly determines 'price' (target leakage) -- confirm.
feature_columns = ['total_floors', 'carpet_area', 'super_area', 'rate', 'bhk', 'project_encoded']
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(data[feature_columns])
target = data['price']

# Imputation means come from all rows, so the held-out score below is
# slightly optimistic.
X_train, X_test, y_train, y_test = train_test_split(features_imputed, target, test_size=0.2, random_state=42)


linear_reg_model = LinearRegression()
random_forest_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
gradient_boosting_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

models = [linear_reg_model, random_forest_model, gradient_boosting_model]

predictions_columns = []

for model in models:
    model_name = model.__class__.__name__

    # Score on rows the model has not seen; previously the split was computed
    # but never used, so there was no out-of-sample check at all.
    model.fit(X_train, y_train)
    rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    print(f"{model_name} test RMSE: {rmse:,.0f}")

    # Refit on every row for the predictions attached to the output file
    # (same fitting data as before).
    model.fit(features_imputed, target)
    predictions = model.predict(features_imputed)

    col_name = f'{model_name}_Prediction'
    data[col_name] = predictions
    predictions_columns.append(col_name)


output_file_path = 'C:/Users/Lenovo/Downloads/MohitGanvir_99acres_with_predictions.xlsx'
data.to_excel(output_file_path, index=False)

print(f"Predictions attached to the input file. Saved to {output_file_path}")
