### page 1 url , https://www.century21.com/real-estate/chicago-il/LCILCHICAGO/?beds=1&baths=1&minsqft=200
### page 2 url , https://www.century21.com/real-estate/chicago-il/LCILCHICAGO/?beds=1&baths=1&minsqft=200&s=24
### page 3 url , https://www.century21.com/real-estate/chicago-il/LCILCHICAGO/?beds=1&baths=1&minsqft=200&s=48
### last page (139) https://www.century21.com/real-estate/chicago-il/LCILCHICAGO/?beds=1&baths=1&minsqft=200&s=3312

### and so on 
### To scrape all pages, notice the pattern from page 1 to page 139: the first page has no `s` parameter (equivalent to s = 0), and the value of `s` increases by 24 for each subsequent page, reaching 3312 on page 139.

# import libraries and setup

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL for the first results page; later pages append an "s" offset parameter.
base_url = "https://www.century21.com/real-estate/chicago-il/LCILCHICAGO/?beds=1&baths=1&minsqft=200"

# Pagination settings: "s" advances by PAGE_STEP per page and the last page
# (page 139) uses s=3312.
PAGE_STEP = 24
LAST_OFFSET = 3312

# Seconds to wait for the server; without a timeout a stalled connection
# would block the loop indefinitely.
REQUEST_TIMEOUT = 30

# Class attribute of the container that wraps the listings on a page.
LISTINGS_CLASS = "main-listings row row-cols-1 row-cols-md-2 row-cols-lg-3 row-cols-xxl-4 g-3 px-3 pe-sm-0 pb-3"

# Lists accumulating one entry per listing across all pages.
all_beds = []
all_baths = []
all_sqft = []
all_adrs = []
all_prices = []

# Loop through all the pages (from page 1 to 139).
for offset in range(0, LAST_OFFSET + 1, PAGE_STEP):
    # The first page is the bare base URL; every other page adds its offset.
    url = base_url if offset == 0 else f"{base_url}&s={offset}"

    print(f"Scraping page with offset: {offset}")

    # A network error on one page must not discard the pages already scraped.
    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data for offset {offset}: {exc}")
        continue

    # Check if the request was successful.
    if r.status_code != 200:
        print(f"Failed to retrieve data for offset {offset}")
        continue

    # Parse the content.
    soup = BeautifulSoup(r.content, "html.parser")

    # Find the main listings container; a 200 response can still lack it,
    # so guard against an empty result before indexing.
    total = soup.find_all("div", {"class": LISTINGS_CLASS})
    if not total:
        print(f"No listings container found for offset {offset}; skipping page")
        continue
    container = total[0]

    # Extract data for beds, baths, square footage, address, and price.
    beds = container.find_all("div", {"class": "property-spec beds"})
    baths = container.find_all("span", {"class": "specs-number full-baths"})
    sqft = container.find_all("div", {"class": "property-spec square-footage"})
    adrs = container.find_all("p", {"class": "property-address"})
    prices = container.find_all("div", {"class": "font-family-taglines property-price"})

    # The fields are matched up by position, so differing counts would pair
    # values from different listings. Skip the page rather than store
    # misaligned rows.
    field_counts = {len(beds), len(baths), len(sqft), len(adrs), len(prices)}
    if len(field_counts) != 1:
        print(f"Field counts differ for offset {offset}; skipping page")
        continue

    # Append data from the current page to the lists.
    for bed, bath, area, adr, price in zip(beds, baths, sqft, adrs, prices):
        # Parse every field before appending so a bad value never leaves the
        # lists with unequal lengths.
        try:
            bed_count = int(bed.text.split()[0])
            bath_count = int(bath.text)
            square_feet = int(area.text.split()[0].replace(",", ""))
        except (ValueError, IndexError):
            print(f"Skipping listing with unparsable specs at offset {offset}")
            continue

        address = adr.text.replace("\n", "")

        # Handle non-numeric price values like "Price On Request".
        price_text = price.text.strip().replace(",", "").replace("$", "")
        price_value = int(price_text) if price_text.isdigit() else None

        all_beds.append(bed_count)
        all_baths.append(bath_count)
        all_sqft.append(square_feet)
        all_adrs.append(address)
        all_prices.append(price_value)

# Create a DataFrame to store the collected data.
data = {
    "bed rooms": all_beds,
    "baths": all_baths,
    "square feet": all_sqft,
    "address": all_adrs,
    "price": all_prices
}

df = pd.DataFrame(data)


Scraping page with offset: 0
Scraping page with offset: 24
Scraping page with offset: 48
Scraping page with offset: 72
Scraping page with offset: 96
Scraping page with offset: 120
Scraping page with offset: 144
Scraping page with offset: 168
Scraping page with offset: 192
Scraping page with offset: 216
Scraping page with offset: 240
Scraping page with offset: 264
Scraping page with offset: 288
Scraping page with offset: 312
Scraping page with offset: 336
Scraping page with offset: 360
Scraping page with offset: 384
Scraping page with offset: 408
Scraping page with offset: 432
Scraping page with offset: 456
Scraping page with offset: 480
Scraping page with offset: 504
Scraping page with offset: 528
Scraping page with offset: 552
Scraping page with offset: 576
Scraping page with offset: 600
Scraping page with offset: 624
Scraping page with offset: 648
Scraping page with offset: 672
Scraping page with offset: 696
Scraping page with offset: 720
Scraping page with offset: 744
Scraping page 

In [2]:
# Preview the first rows of the scraped listings.
df.head()

Unnamed: 0,bed rooms,baths,square feet,address,price
0,2,2,1260,"1310 N Cleaver Street , 2 Chicago, IL 60642",429555.0
1,2,1,1600,"2422 W 83rd Street Chicago, IL 60652",239900.0
2,3,2,912,"7115 S Dobson Avenue Chicago, IL 60619",265000.0
3,2,1,1302,"9858 S Sawyer Avenue Evergreen Park, IL 60805",254900.0
4,4,2,3000,"819 N Paulina Street Unit 1N Chicago, IL 60622",899900.0


In [3]:
# Show column dtypes and non-null counts; "price" can have missing values
# because non-numeric prices are stored as None during scraping.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3336 entries, 0 to 3335
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   bed rooms    3336 non-null   int64  
 1   baths        3336 non-null   int64  
 2   square feet  3336 non-null   int64  
 3   address      3336 non-null   object 
 4   price        3322 non-null   float64
dtypes: float64(1), int64(3), object(1)
memory usage: 130.4+ KB


In [4]:
# Write the scraped listings to disk, leaving out the DataFrame index.
output_path = "AllPages.csv"
df.to_csv(output_path, index=False)

# Confirm that the export step ran.
print("Scraping complete. Data saved to AllPages.csv.")

Scraping complete. Data saved to AllPages.csv.
