# For easily scrapable company websites

In [6]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import time

# Read the company list. Malformed rows are skipped on read, and the stray
# 'Unnamed: 2' column is dropped when present (no error if it is absent).
dataset = pd.read_csv(
    "dataset.csv",
    encoding="ISO-8859-1",
    on_bad_lines='skip',
).drop(columns=['Unnamed: 2'], errors='ignore')

# Function to ensure URLs carry an explicit scheme (defaults to 'https://')
def ensure_url_scheme(df, url_column):
    """Prefix scheme-less URLs in ``df[url_column]`` with ``https://``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the URL column; the column is replaced on this object.
    url_column : str
        Name of the column that contains the URLs.

    Returns
    -------
    None
        The frame is modified in place.

    Notes
    -----
    * Values that already start with ``http://`` or ``https://`` (any case)
      are kept unchanged.
    * Surrounding whitespace is stripped from string values.
    * Non-string cells (e.g. NaN) and empty strings are passed through
      untouched instead of raising.
    """
    def _with_scheme(url):
        # Leave missing / non-text cells alone rather than crashing on them.
        if not isinstance(url, str):
            return url
        url = url.strip()
        if not url or url.lower().startswith(('http://', 'https://')):
            return url
        return 'https://' + url

    # Assign the whole column back: writing into the rows yielded by
    # DataFrame.iterrows() only changes a copy and never reaches the frame.
    df[url_column] = df[url_column].map(_with_scheme)

# Run the scheme helper over the dataset's 'url' column
ensure_url_scheme(df=dataset, url_column='url')

# Function to scrape data from a company website
def scrape_company_data(company, attempts=3, timeout=10):
    """Fetch a company's website and derive keyword-based Yes/No flags.

    Parameters
    ----------
    company : mapping (e.g. a DataFrame row)
        Must provide the keys ``"url"`` and ``"company_name"``.
    attempts : int, optional
        Maximum number of fetch attempts when the server answers with an
        HTTP error status (default 3).
    timeout : float, optional
        Seconds to wait for the server on each request (default 10).

    Returns
    -------
    dict or None
        One record with the company name, URL, category and the Yes/No
        keyword flags, or ``None`` when the page could not be fetched
        (all attempts returned an HTTP error, or a non-HTTP request error
        such as a connection failure or timeout occurred).
    """
    def flag(text, keyword):
        """Return "Yes" when ``keyword`` occurs in ``text``, else "No"."""
        return "Yes" if keyword in text else "No"

    url = company["url"]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for attempt in range(attempts):
        try:
            print(f"Accessing URL: {url} (Attempt {attempt + 1})")  # Debug print for URLs
            # Always pass a timeout: without one a stalled server blocks the
            # whole notebook run indefinitely.
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Raises an HTTPError for bad responses (4xx, 5xx)

        except requests.exceptions.HTTPError as e:
            # Take the status from the exception itself rather than relying on
            # a variable bound earlier in the try block.
            status = e.response.status_code if e.response is not None else None
            if status == 403:
                print(f"Access denied for {company['company_name']}: {e}. Retrying...")
            else:
                print(f"Error processing {company['company_name']}: {e}")
            # Pause only when another attempt will actually follow.
            if attempt < attempts - 1:
                time.sleep(1)
            continue

        except requests.exceptions.RequestException as e:
            print(f"Error processing {company['company_name']}: {e}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract and lowercase the page text once; every flag reuses it.
        page_text = soup.text.lower()

        # Placeholder for the actual scraping logic: plain substring checks
        manufacturer = flag(page_text, "manufacturer")
        brand = flag(page_text, "brand")
        distributor = flag(page_text, "distributor")

        # Manufacturer takes precedence over distributor; "Brand" is the fallback.
        if manufacturer == "Yes":
            category = "Bulk (Manufacturer)"
        elif distributor == "Yes":
            category = "Bulk (Distributor)"
        else:
            category = "Brand"

        return {
            "Company": company["company_name"],
            "Website": url,
            "Relevant": "Yes",
            "Category": category,
            "Manufacturer": manufacturer,
            "Brand": brand,
            "Distributor": distributor,
            "F&B": "Yes",  # Assuming these are all F&B
            # Health segments based on specific keywords found on the website
            "Probiotics": flag(page_text, "probiotic"),
            "Fortification": flag(page_text, "fortified"),
            "Gut Health": flag(page_text, "gut health"),
            "Womens Health": flag(page_text, "women's health"),
            "Cognitive Health": flag(page_text, "cognitive health"),
        }

    return None  # Return None if all attempts fail

# Accumulates one record per successfully scraped company
results = []

# Visit every dataset row in order; rows whose scrape yielded nothing are left out
for index, company in dataset.iterrows():
    print(f"Processing {company['company_name']}...")  # Progress trace
    data = scrape_company_data(company)
    if not data:
        continue
    results.append(data)

    # Optional politeness delay between requests (currently disabled):
    # time.sleep(2)

# Persist the collected records as CSV, without the index column
csv_file = 'company_data.csv'
df = pd.DataFrame(results)
df.to_csv(csv_file, index=False)

print(f"Data scraping completed. Results saved to '{csv_file}'.")


Processing Nestle...
Accessing URL: https://www.nestle.com (Attempt 1)
Processing Dr. Reddy's Laboratories...
Accessing URL: https://www.drreddys.com (Attempt 1)
Processing Coca...
Accessing URL: https://www.coca-colacompany.com (Attempt 1)
Processing Pfizer...
Accessing URL: https://www.pfizer.com (Attempt 1)
Processing PepsiCo...
Accessing URL: https://www.pepsico.com (Attempt 1)
Processing Johnson & Johnson...
Accessing URL: https://www.jnj.com (Attempt 1)
Processing Danone...
Accessing URL: https://www.danone.com (Attempt 1)
Processing General Mills...
Accessing URL: https://www.generalmills.com (Attempt 1)
Processing GlaxoSmithKline (GSK)...
Accessing URL: https://www.gsk.com (Attempt 1)
Processing Merck & Co....
Accessing URL: https://www.merck.com (Attempt 1)
Processing Unilever...
Accessing URL: https://www.unilever.com (Attempt 1)
Processing Roche...
Accessing URL: https://www.roche.com (Attempt 1)
Processing Nestle Waters...
Accessing URL: https://www.nestlewaters.com (Attemp

# Selenium for 403 Error sites

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Set up the options for the Chrome browser
HEADLESS = False  # Set to True to run Chrome without a visible window
options = Options()
if HEADLESS:
    # The `Options.headless` attribute was deprecated and then removed in
    # Selenium 4, so assigning to it no longer has any effect. The browser
    # command-line switch is the supported way to request headless mode.
    options.add_argument("--headless=new")
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

# Column-oriented accumulator: every output column starts as its own empty list
data = {
    column: []
    for column in (
        "Company",
        "Website",
        "Relevant",
        "Category",
        "Manufacturer",
        "Brand",
        "Distributor",
        "F&B",
        "Probiotics",
        "Fortification",
        "Gut Health",
        "Womens Health",
        "Cognitive Health",
    )
}

# Sites to visit with the browser driver, as (name, url) pairs
companies = [
    {'name': name, 'url': url}
    for name, url in (
        ('bayer', 'https://www.bayer.com'),
        ('lilly', 'https://www.lilly.com'),
        ('abbvie', 'https://www.abbvie.com'),
        ('medline', 'https://www.medline.com'),
    )
]

# Function to scrape website data
def scrape_website(company):
    """Load a company's site in the shared Selenium driver and record keyword flags.

    Parameters
    ----------
    company : dict
        Must provide the keys ``"name"`` and ``"url"``.

    Returns
    -------
    None
        On success one value is appended to each of the thirteen column lists
        of the module-level ``data`` dict. On any exception the error is
        printed and the function returns without raising.
    """
    try:
        driver.get(company["url"])

        # Wait (up to 10 s) until a <body> element is present before reading the page
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'body'))
        )

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Extract and lowercase the page text once; every flag below reuses it.
        page_text = soup.text.lower()

        def flag(*keywords):
            """Return "Yes" when every keyword occurs in the page text, else "No"."""
            return "Yes" if all(keyword in page_text for keyword in keywords) else "No"

        manufacturer = flag("manufacturer")
        brand = flag("brand")
        distributor = flag("distributor")

        # Determine category: manufacturer > distributor > brand, "F&B" as fallback
        if manufacturer == "Yes":
            category = "Bulk (Manufacturer)"
        elif distributor == "Yes":
            category = "Bulk (Distributor)"
        elif brand == "Yes":
            category = "Brand"
        else:
            category = "F&B"

        # Build the full record first, then append column by column, so the
        # values and their target columns are listed in exactly one place.
        record = {
            "Company": company["name"],
            "Website": company["url"],
            "Relevant": "Yes",
            "Category": category,
            "Manufacturer": manufacturer,
            "Brand": brand,
            "Distributor": distributor,
            "F&B": flag("food", "beverage"),  # both words must appear
            "Probiotics": flag("probiotic"),
            "Fortification": flag("fortified"),
            "Gut Health": flag("gut health"),
            "Womens Health": flag("women's health"),
            "Cognitive Health": flag("cognitive health"),
        }
        for column, value in record.items():
            data[column].append(value)

    except Exception as e:
        print(f"Error processing {company['name']}: {e}")

# Scrape all company websites. The browser is closed in `finally` so that an
# interrupt or unexpected error mid-loop does not leave a Chrome process behind.
try:
    for company in companies:
        scrape_website(company)
        time.sleep(random.uniform(2, 5))  # Random sleep between requests
finally:
    driver.quit()

# Create a DataFrame and save to CSV
df = pd.DataFrame(data)
df.to_csv("test.csv", index=False)
print("Data scraping completed. Results saved to 'test.csv'.")


Data scraping completed. Results saved to 'test.csv'.


# Merging the two result CSVs

In [13]:
import pandas as pd

# Read both scraper outputs, in the order they will be stacked
company_data, test = (
    pd.read_csv(path) for path in ("company_data.csv", "test.csv")
)

# Stack the two frames row-wise and renumber the index from zero
merged_data = pd.concat(objs=[company_data, test], ignore_index=True)

# Write the combined table to an Excel workbook, without the index column
merged_data.to_excel("result.xlsx", index=False)

print("Data merging completed. Results saved to 'result.xlsx'.")


Data merging completed. Results saved to 'result.xlsx'.
