## Web Scraping

In [None]:
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
import pandas as pd
import time
import random

# --------- SETUP CHROME DRIVER ---------
# Cell-local imports so this cell still runs on its own in a fresh kernel.
import os
from urllib.parse import quote

options = Options()
# Comment this line if you want to see the browser for debugging
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
chrome_driver_path = r"C:\Users\dhana\Downloads\chromedriver-win64\chromedriver.exe"
# Prefer the locally downloaded chromedriver; when that file is absent (i.e. on
# any other machine) fall back to Selenium's own driver lookup.
service = Service(chrome_driver_path) if os.path.isfile(chrome_driver_path) else Service()
driver = webdriver.Chrome(service=service, options=options)
driver.set_page_load_timeout(30)

# --------- CONFIG ---------
# One selector for both "wait until results appear" and "collect result links",
# so a search that only returns /otc/ products is not reported as empty.
PRODUCT_LINK_SELECTOR = "a[href^='/drugs/'], a[href^='/otc/']"
OUTPUT_CSV = "1mg_medicine_data_full.csv"


def element_text_or_na(browser, by, locator):
    """Return the text of the first element matching (by, locator), or "N/A".

    Any WebDriver failure (element missing, stale, ...) yields "N/A" so one
    missing field never aborts the whole product row.
    """
    try:
        return browser.find_element(by, locator).text
    except WebDriverException:
        return "N/A"


def list_text_or_na(browser, css_selector):
    """Return the texts of all elements matching css_selector joined by ", ".

    Returns "N/A" when nothing matches or when reading the elements fails.
    """
    try:
        elements = browser.find_elements(By.CSS_SELECTOR, css_selector)
        return ', '.join(el.text for el in elements) if elements else "N/A"
    except WebDriverException:
        return "N/A"


# --------- GENERIC SEARCH TERMS ---------
generics = [
    "montair lc",
]

results = []

# --------- START SCRAPING ---------
# try/finally guarantees the Chrome process is closed even if scraping raises.
try:
    for generic in generics:
        print(f"\n🔍 Searching for: {generic}")
        # Percent-encode the term so spaces, '&', '#', ... cannot corrupt the query string.
        search_url = f"https://www.1mg.com/search/all?name={quote(generic, safe='')}"

        # TimeoutException is a subclass of WebDriverException, so it must be caught first.
        try:
            driver.get(search_url)
        except TimeoutException:
            print(f"⏱️ Timeout on search page for {generic}")
            continue
        except WebDriverException as e:
            print(f"❌ WebDriver error: {e}")
            continue

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, PRODUCT_LINK_SELECTOR))
            )
        except TimeoutException:
            print(f"⚠️ No results found quickly for: {generic}")
            continue

        # Randomised pause between page actions.
        time.sleep(random.uniform(2, 4))

        product_links = []
        items = driver.find_elements(By.CSS_SELECTOR, PRODUCT_LINK_SELECTOR)

        for item in items:
            try:
                href = item.get_attribute("href")
            except WebDriverException:
                # Element went stale or is unreadable; try the next anchor.
                continue
            if href:
                product_links.append(href)
                break  # Stop after first valid link

        print(f"🔗 Found {len(product_links)} product links for {generic}")
        # Optional: print(product_links)

        for url in product_links:
            try:
                driver.get(url)
            except TimeoutException:
                print(f"⏱️ Timeout for product URL: {url}")
                continue
            except WebDriverException as e:
                print(f"❌ WebDriver error: {e}")
                continue

            time.sleep(random.uniform(2, 4))

            # Each field falls back to "N/A" independently of the others.
            brand_name = element_text_or_na(
                driver, By.CLASS_NAME, "DrugHeader__title-content___2ZaPo")
            manufacturer = element_text_or_na(
                driver, By.XPATH, "//div[contains(text(),'Marketer')]/following-sibling::div")
            salt = element_text_or_na(
                driver, By.XPATH, "//div[contains(text(),'SALT COMPOSITION')]/following-sibling::div")
            price = element_text_or_na(
                driver, By.CSS_SELECTOR, "span.PriceBoxPlanOption__offer-price___3v9x8")
            uses = list_text_or_na(driver, "#uses_and_benefits li")
            side_effects = list_text_or_na(driver, "#side_effects ul li")

            results.append({
                "Generic Name": generic,
                "Brand Name": brand_name,
                "Price": price,
                "Manufacturer": manufacturer,
                "Salt": salt,
                "Uses": uses,
                "Side Effects": side_effects,
                "URL": url
            })

            print(f"✅ Collected: {brand_name}")
            time.sleep(random.uniform(2, 4))
finally:
    driver.quit()

# --------- SAVE TO CSV ---------
df = pd.DataFrame(results)
if results:
    df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8-sig')
    # Report the file that was actually written.
    print(f"\n✅ Data saved to {OUTPUT_CSV}")
else:
    # Nothing scraped: do not write an empty CSV or claim success.
    print("⚠️ No data collected.")



🔍 Searching for: montair lc
🔗 Found 1 product links for montair lc
✅ Collected: Montair-LC Tablet

✅ Data saved to 1mg_medicine_data_Set1.csv


In [3]:
pip install selenium

Collecting selenium
  Downloading selenium-4.34.2-py3-none-any.whl (9.4 MB)
     ---------------------------------------- 0.0/9.4 MB ? eta -:--:--
     ---------------------------------------- 0.0/9.4 MB ? eta -:--:--
     ---------------------------------------- 0.1/9.4 MB 1.7 MB/s eta 0:00:06
     ---------------------------------------- 0.1/9.4 MB 1.7 MB/s eta 0:00:06
     ---------------------------------------- 0.1/9.4 MB 1.7 MB/s eta 0:00:06
      --------------------------------------- 0.2/9.4 MB 807.1 kB/s eta 0:00:12
     - -------------------------------------- 0.2/9.4 MB 942.1 kB/s eta 0:00:10
     - -------------------------------------- 0.2/9.4 MB 942.1 kB/s eta 0:00:10
     - -------------------------------------- 0.3/9.4 MB 884.2 kB/s eta 0:00:11
     - -------------------------------------- 0.3/9.4 MB 884.2 kB/s eta 0:00:11
     - -------------------------------------- 0.4/9.4 MB 859.0 kB/s eta 0:00:11
     - -------------------------------------- 0.4/9.4 MB 859.0 kB/s 


[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
import pandas as pd
import time
import random

# --------- SETUP CHROME DRIVER ---------
# Cell-local imports so this cell still runs on its own in a fresh kernel.
import os
import re
from urllib.parse import quote

options = Options()
options.add_argument('--headless')  # Comment this out to see the browser
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
chrome_driver_path = r"C:\Users\dhana\Downloads\chromedriver-win64\chromedriver.exe"

# --------- PAGE LOCATORS ---------
PRODUCT_LINK_SELECTOR = "a[href^='/drugs/'], a[href^='/otc/']"

# Column name -> (locator strategy, locator) for fields read from a single element.
# Dict order is the CSV column order.
SINGLE_VALUE_FIELDS = {
    "Brand Name": (By.CLASS_NAME, "DrugHeader__title-content___2ZaPo"),
    "Price": (By.CSS_SELECTOR, "span.PriceBoxPlanOption__offer-price___3v9x8"),
    "Manufacturer": (By.XPATH, "//div[contains(text(),'Marketer')]/following-sibling::div"),
    "Salt": (By.XPATH, "//div[contains(text(),'SALT COMPOSITION')]/following-sibling::div"),
}

# Column name -> CSS selector for fields built by joining several list items.
LIST_FIELDS = {
    "Uses": "#uses_and_benefits li",
    "Side Effects": "#side_effects ul li",
}


def scrape_medicine(browser, medicine_name):
    """Search 1mg for ``medicine_name`` and scrape the first product page found.

    Args:
        browser: a Selenium WebDriver that is already running.
        medicine_name: the search term exactly as typed by the user.

    Returns:
        A list with one dict per scraped product (at most one, because only the
        first result link is followed). The list is empty when the search page
        times out, shows no product links, or the product page fails to load.
        Fields that cannot be read are stored as "N/A".
    """
    collected = []

    print(f"\n🔍 Searching for: {medicine_name}")
    # Percent-encode the user's text so spaces, '&', '#', ... cannot corrupt the query string.
    search_url = f"https://www.1mg.com/search/all?name={quote(medicine_name, safe='')}"

    # Early returns replace exit(), which would shut down the notebook kernel.
    # TimeoutException is a subclass of WebDriverException, so it must be caught first.
    try:
        browser.get(search_url)
    except TimeoutException:
        print(f"⏱️ Timeout on search page for {medicine_name}")
        return collected
    except WebDriverException as e:
        print(f"❌ WebDriver error: {e}")
        return collected

    try:
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, PRODUCT_LINK_SELECTOR))
        )
    except TimeoutException:
        print(f"⚠️ No results found quickly for: {medicine_name}")
        return collected

    # Randomised pause between page actions.
    time.sleep(random.uniform(2, 4))

    product_links = []
    for anchor in browser.find_elements(By.CSS_SELECTOR, PRODUCT_LINK_SELECTOR):
        try:
            href = anchor.get_attribute("href")
        except WebDriverException:
            # Element went stale or is unreadable; try the next anchor.
            continue
        if href:
            product_links.append(href)
            break  # Limit to just one product

    print(f"🔗 Found {len(product_links)} product link(s) for {medicine_name}")

    for url in product_links:
        try:
            browser.get(url)
        except TimeoutException:
            print(f"⏱️ Timeout for product URL: {url}")
            continue
        except WebDriverException as e:
            print(f"❌ WebDriver error: {e}")
            continue

        time.sleep(random.uniform(2, 4))

        record = {"Medicine Name": medicine_name}

        # Each field falls back to "N/A" on its own so one miss does not drop the row.
        for column, (by, locator) in SINGLE_VALUE_FIELDS.items():
            try:
                record[column] = browser.find_element(by, locator).text
            except WebDriverException:
                record[column] = "N/A"

        for column, css_selector in LIST_FIELDS.items():
            try:
                entries = browser.find_elements(By.CSS_SELECTOR, css_selector)
                record[column] = ', '.join(li.text for li in entries) if entries else "N/A"
            except WebDriverException:
                record[column] = "N/A"

        record["URL"] = url
        collected.append(record)

        print(f"✅ Collected data for: {record['Brand Name']}")
        time.sleep(random.uniform(2, 4))

    return collected


# --------- USER INPUT ---------
medicine_name = input("🔍 Enter the medicine name to search: ").strip()

results = []

if not medicine_name:
    # Nothing to search for; do not start a browser at all.
    print("⚠️ No medicine name entered.")
else:
    # Prefer the locally downloaded chromedriver; when that file is absent fall
    # back to Selenium's own driver lookup.
    service = Service(chrome_driver_path) if os.path.isfile(chrome_driver_path) else Service()
    driver = webdriver.Chrome(service=service, options=options)
    driver.set_page_load_timeout(30)
    # try/finally guarantees the Chrome process is closed even if scraping raises.
    try:
        results = scrape_medicine(driver, medicine_name)
    finally:
        driver.quit()

# --------- SAVE TO CSV ---------
if results:
    df = pd.DataFrame(results)
    # Spaces become underscores as before; characters that are illegal in file
    # names (or would change the target directory) are replaced as well.
    safe_name = re.sub(r'[<>:"/\\|?*]', '_', medicine_name.replace(' ', '_'))
    output_file = f"{safe_name}_info.csv"
    df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"\n✅ Data saved to {output_file}")
else:
    print("⚠️ No data collected.")



🔍 Searching for: Dolo 650
🔗 Found 1 product link(s) for Dolo 650
✅ Collected data for: Dolo 650 Tablet

✅ Data saved to Dolo_650_info.csv


In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time

# --------- SETUP DRIVER ---------
# Cell-local imports: this cell's import list does not include them, and keeping
# them here lets the cell run on its own in a fresh kernel.
import os
from selenium.common.exceptions import WebDriverException

options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")

chrome_driver_path = r"C:\Users\dhana\Downloads\chromedriver-win64\chromedriver.exe"
# Prefer the locally downloaded chromedriver; when that file is absent fall back
# to Selenium's own driver lookup.
service = Service(chrome_driver_path) if os.path.isfile(chrome_driver_path) else Service()
driver = webdriver.Chrome(service=service, options=options)

# --------- INPUT DRUG URL ---------
url = "https://www.1mg.com/drugs/montair-lc-tablet-565306"

# try/finally guarantees the Chrome process is closed even if loading or
# scraping the page raises.
try:
    driver.get(url)
    # Fixed pause before reading the page sections.
    time.sleep(3)

    # --------- SCRAPE USES ---------
    # Joined list-item texts, or "N/A" when the section is absent or unreadable.
    try:
        uses_list = driver.find_elements(By.CSS_SELECTOR, "#uses_and_benefits li")
        uses = ', '.join(li.text for li in uses_list) if uses_list else "N/A"
    except WebDriverException:
        uses = "N/A"

    # --------- SCRAPE SIDE EFFECTS ---------
    try:
        side_effects_list = driver.find_elements(By.CSS_SELECTOR, "#side_effects ul li")
        side_effects = ', '.join(li.text for li in side_effects_list) if side_effects_list else "N/A"
    except WebDriverException:
        side_effects = "N/A"
finally:
    driver.quit()

# --------- SAVE TO CSV ---------
df = pd.DataFrame([{
    "Medicine Name": "Montair LC",
    "Uses": uses,
    "Side Effects": side_effects,
    "URL": url
}])

df.to_csv("montair_lc_uses_side_effects.csv", index=False, encoding='utf-8-sig')
print("✅ Data saved to montair_lc_uses_side_effects.csv")


✅ Data saved to montair_lc_uses_side_effects.csv
