<a href="https://colab.research.google.com/github/martin254/OOP-Project/blob/master/Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Kenyan Wall Street scraper using BeautifulSoup: collects headlines, URLs, publication dates, and article text for a list of companies.

In [None]:
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_article_content(article_url, timeout=30):
    """
    Fetches the text content of an article from its URL.

    The text of every <p> tag inside the page's ``div.content-inner``
    element is joined with single spaces. Any failure (bad URL, network
    error, non-200 status, missing content container) is reported on stdout
    and yields an empty string, so one bad article never aborts a scrape.

    Parameters:
        article_url (str): The URL of the article to fetch.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        str: The extracted content of the article, or an empty string if content is unavailable.
    """
    # Nothing to fetch; skip the request instead of letting it fail.
    if not article_url:
        print("Error fetching article content: no article URL provided")
        return ""

    try:
        response = requests.get(article_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch article content from {article_url}. HTTP Status: {response.status_code}")
            return ""

        # Parse the article page
        soup = BeautifulSoup(response.content, "html.parser")

        # The article body is expected inside the "content-inner" container
        content_div = soup.find("div", class_="content-inner")
        if content_div is None:
            print(f"Article content not found for URL: {article_url}")
            return ""

        paragraphs = content_div.find_all("p")
        return " ".join(para.get_text(strip=True) for para in paragraphs)
    except Exception as e:
        # Deliberate best-effort boundary: report and carry on with the
        # remaining articles rather than crash the whole run.
        print(f"Error fetching article content: {e}")
        return ""

def _get_response(url, timeout):
    """Issue a GET request; return the response, or None if the request itself failed."""
    try:
        return requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


def _parse_total_pages(soup):
    """Read the page count from the last token of the ``span.page_info`` text; default to 1."""
    page_info = soup.find("span", class_="page_info")
    if page_info is None:
        return 1
    try:
        # IndexError covers an empty span, where split() returns [].
        return max(1, int(page_info.get_text(strip=True).split()[-1]))
    except (ValueError, IndexError):
        return 1


def scrape_kenyan_wall_street(company_name, timeout=30):
    """
    Scrapes headlines, URLs, publication dates, and article content for a given company from Kenyan Wall Street.

    The first search results page is fetched once and reused both for
    reading the page count and for extracting its articles. Later pages are
    fetched one by one; the loop stops at the first page that cannot be
    retrieved, and whatever was collected up to that point is returned.

    Parameters:
        company_name (str): Name of the company to search for.
        timeout (float): Seconds to wait for each search page request.

    Returns:
        list: A list of dictionaries with the keys "Company", "Title",
            "URL", "Date" and "Content". Empty if the name is blank or the
            first search page cannot be fetched.
    """
    # A blank name would produce a bare "?s=" query with no search term.
    if not company_name or not company_name.strip():
        print("Skipping empty company name.")
        return []

    base_url = "https://kenyanwallstreet.com"
    # quote_plus keeps spaces as '+' and also escapes '&', '#', '(' etc.,
    # which a plain replace(' ', '+') would leave to corrupt the query.
    query = quote_plus(company_name)
    search_url = f"{base_url}/?s={query}"
    all_data = []

    # Get the total number of pages dynamically
    print(f"Fetching total page count for {company_name}...")
    response = _get_response(search_url, timeout)
    if response is None:
        return []
    if response.status_code != 200:
        print(f"Failed to fetch the search page for {company_name}. HTTP Status: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    total_pages = _parse_total_pages(soup)
    print(f"Total pages found: {total_pages}")

    # Iterate through all pages
    for page in range(1, total_pages + 1):
        if page == 1:
            # Page 1 is the search URL that was already downloaded above.
            url = search_url
            print(f"Scraping Page {page}: {url}")
        else:
            url = f"{base_url}/page/{page}/?s={query}"
            print(f"Scraping Page {page}: {url}")
            response = _get_response(url, timeout)
            if response is None:
                break
            if response.status_code != 200:
                print(f"Failed to fetch data from {url}. HTTP Status: {response.status_code}")
                break
            soup = BeautifulSoup(response.content, "html.parser")

        # Each result headline is an <h3 class="jeg_post_title"> wrapping a link
        for article in soup.find_all("h3", class_="jeg_post_title"):
            anchor = article.find("a")
            # Skip headlines without a usable link instead of raising.
            if anchor is None or not anchor.get("href"):
                continue

            link = anchor["href"]
            # The date is taken from the first "jeg_meta_date" div that
            # follows the headline in the document.
            date_tag = article.find_next("div", class_="jeg_meta_date")
            date = date_tag.get_text(strip=True) if date_tag else "Date not found"

            all_data.append({
                "Company": company_name,
                "Title": anchor.get_text(strip=True),
                "URL": link,
                "Date": date,
                "Content": get_article_content(link),
            })

    return all_data

# List of companies to scrape.
# The original list ended with an empty string; it is removed because a blank
# name becomes a "?s=" search with no term, and any results would be stored
# under an empty company name.
companies = [
    "Absa Bank Kenya Plc",
    "Bamburi Cement Ltd",
    "Britam Holdings Plc",
    "British American Tobacco Kenya Plc",
    "Centum Investment Company",
    "Cooperative Bank of Kenya",
    "BK Group Plc",
    "BOC Kenya Plc",
    "Car and General (K) Ltd",
]

# Scrape data for all companies
all_results = []

for company in companies:
    # Defensive: ignore blank entries if the list is edited later.
    if not company.strip():
        continue
    print(f"Starting scrape for: {company}")
    results = scrape_kenyan_wall_street(company)
    all_results.extend(results)

# Save the data to a CSV file. Naming the columns explicitly keeps the header
# row in the output even when nothing was scraped.
df = pd.DataFrame(all_results, columns=["Company", "Title", "URL", "Date", "Content"])
df.to_csv("kenyan_wall_street_with_content.csv", index=False)

print("Scraping complete. Data saved to 'kenyan_wall_street_with_content.csv'.")
