In [1]:
import numpy as np
import pandas as pd
import requests
import threading
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')
from concurrent.futures import ThreadPoolExecutor, as_completed
import os

In [2]:
# Headers to mimic a browser
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    # 'br' (Brotli) is deliberately not advertised: requests can only decode it
    # when an optional Brotli package is installed, and an undecoded body would
    # make every page parse to zero products.
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Referer': 'https://www.flipkart.com/',
    'DNT': '1',  # Do Not Track Request Header
}

# Last search-results page to request (used as the default `end` of the scrape).
END = 50

# Default number of worker threads used to fetch pages concurrently.
MAX_THREADS = 5

In [3]:
# Parse one product card from a search-results page
def _extract_product(item):
    """Turn a single result card into a product dict.

    Args:
        item: BeautifulSoup tag for one result card.

    Returns:
        A dict with the keys 'Title', 'Description', 'Price', 'Rating' and
        'Review', or None when any of those fields is missing or empty.
    """
    title_tag = item.find('div', class_='KzDlHZ')  # Title
    price_tag = item.find('div', class_='Nx9bqj _4b5DiR')  # Price
    rating_tag = item.find('div', class_='XQDdHH')  # Rating

    # Description: join the text of every bullet in the spec list
    description = None
    description_outer = item.find('ul', class_='G4BRas')
    if description_outer:
        description = ', '.join(el.text for el in description_outer.find_all('li', class_='J+igdf'))

    # Review: the text is taken from the third <span> nested under the first
    # inner <span> of the ratings/reviews element
    review = None
    review_outer = item.find("span", class_="Wphh3N")
    if review_outer:
        review_inner = review_outer.find("span")
        if review_inner:
            review_span = review_inner.find_all("span")
            if len(review_span) > 2:
                review = review_span[2].text.strip()

    # Text extraction
    title = title_tag.text.strip() if title_tag else None
    price = price_tag.text.strip() if price_tag else None
    rating = rating_tag.text.strip() if rating_tag else None

    # Only complete records are kept; a card missing any field is dropped
    if not (title and description and price and rating and review):
        return None

    return {
        'Title': title,
        'Description': description,
        'Price': price,
        'Rating': rating,
        'Review': review
    }


# Scrape a single page
def scrape_page(page_num):
    """Fetch one search-results page and return the complete products on it.

    Args:
        page_num: Page number substituted into the search URL.

    Returns:
        A list of product dicts (keys 'Title', 'Description', 'Price',
        'Rating', 'Review'). An empty list is returned when the request fails,
        the server answers with an HTTP error status, or parsing raises; the
        error is printed rather than propagated so one bad page does not abort
        the whole run.
    """
    URL = f'https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page={page_num}'
    try:
        response = requests.get(URL, headers=HEADERS, timeout=10)
        print(f"Scraped {URL} with response code: {response.status_code}")

        # Treat 4xx/5xx answers (rate limiting, block pages, ...) as failures so
        # they are reported below instead of silently parsing to zero products.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'lxml')

        products = []
        for item in soup.find_all('div', class_='tUxRFH'):
            product = _extract_product(item)
            if product is not None:
                products.append(product)

        return products

    except Exception as e:
        print(f"[Error on page {page_num}] {e}")
        return []


In [4]:
# Scrape all pages with threads
def scrape_all_pages(start=1, end=END, max_threads=MAX_THREADS):
    """Scrape pages `start` through `end` (inclusive) concurrently.

    Args:
        start: First page number to fetch.
        end: Last page number to fetch (inclusive).
        max_threads: Maximum number of worker threads in the pool.

    Returns:
        A single flat list with the products of every page, ordered by page
        number so that repeated runs produce rows in the same order.
    """
    all_products = []

    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        # executor.map runs the calls concurrently but yields their results in
        # the order of the input pages, unlike as_completed, whose order depends
        # on which request happens to finish first.
        for page_data in executor.map(scrape_page, range(start, end + 1)):
            all_products.extend(page_data)

    return all_products


In [5]:
# Save results to CSV
def save_to_csv(data, filename='flipkart_iphones.csv', folder='flipkart_data'):
    """Write `data` as a UTF-8 CSV file at `folder`/`filename`.

    Args:
        data: Records accepted by `pd.DataFrame` (e.g. a list of dicts).
        filename: Name of the CSV file to create.
        folder: Directory to write into; created if it does not exist.

    The row index is not written. The destination path is printed once the
    file has been saved.
    """
    # Create the target directory up front; an existing one is fine
    os.makedirs(folder, exist_ok=True)

    destination = os.path.join(folder, filename)
    pd.DataFrame(data).to_csv(destination, index=False, encoding='utf-8')

    print(f"Saved to {destination}")


In [6]:
# Run the scraper
if __name__ == '__main__':
    print("Scraping Flipkart...")
    results = scrape_all_pages(start=1, end=END, max_threads=MAX_THREADS)
    print(f"Scraped {len(results)} products.")

    # save_to_csv prints the actual destination path itself, so no extra
    # "Saved to ..." message is printed here (the bare filename would be
    # misleading because the file is written inside a folder).
    save_to_csv(results)


Scraping Flipkart...
Scraped https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=1 with response code: 200
Scraped https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=2 with response code: 200
Scraped https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=3 with response code: 200
Scraped https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=5 with response code: 200
Scraped https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=4 with response code: 200
Scraped https://www.flipkart.com/search?q=iphone&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=6 with response code: 200
Scraped https://www.flipkart.com/search?q=iphone&otracker=search&ot