In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [18]:
def fetch_article_body(url, timeout=30):
    """Download a page and return the text of its article body.

    The HTML is parsed with BeautifulSoup's 'html.parser' and the body is
    looked up as the first <div> matching class_='td-post-content tagdiv-type'.
    Inside it, every h1/h2/h3/p contributes its text as one entry, and every
    <li> of an <ol> contributes one entry prefixed with "- ".

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to requests.get as its timeout. Without a
            timeout a single unresponsive server would block the caller
            indefinitely; a timeout surfaces as a RequestException and is
            handled like any other request failure.

    Returns:
        A list of strings in document order. An empty list is returned when
        the request fails (the error is printed) or when the content div is
        not present (a notice is printed).
    """
    try:
        # Fetch the page; non-2xx responses are turned into exceptions so they
        # take the same error path as connection problems and timeouts.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Locate the container that holds the article body.
        content_div = soup.find('div', class_='td-post-content tagdiv-type')
        if not content_div:
            print("Content div not found")
            return []

        # NOTE(review): find_all searches all descendants, so a <p> or a
        # nested <ol> that sits inside a list item is visited again after its
        # enclosing list and its text is emitted a second time. Left as-is to
        # keep the output unchanged.
        elements = content_div.find_all(['h1', 'h2', 'h3', 'p', 'ol'])
        article_content = []

        for elem in elements:
            if elem.name == 'ol':
                # Ordered lists are flattened: one "- ..." entry per item.
                for li in elem.find_all('li'):
                    article_content.append(f"- {li.get_text()}")
            else:
                article_content.append(elem.get_text())

        return article_content

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

In [19]:
# Load the input workbook; later cells read the URLs from its 'URL' column.
df = pd.read_excel(io='input.xlsx')

In [20]:
# Plain Python list of the addresses to scrape, in the same order as the rows of df.
urls = df.loc[:, 'URL'].to_list()

In [21]:
# Peek at the first row to confirm the columns that were loaded.
df.head(n=1)

Unnamed: 0,URL_ID,URL
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...


In [22]:
# Accumulator: one entry per URL, each entry being that article's list of text fragments.
articles_data = list()

In [23]:
# Scrape every URL in order, producing one list of text fragments per URL.
# The list is rebuilt from scratch rather than appended to, so running this
# cell again cannot leave duplicate entries behind (which would give the
# paragraph table more rows than `df` when the two are joined side by side).
articles_data = [fetch_article_body(url) for url in urls]


In [24]:
# The longest article decides how many 'Paragraph N' columns are needed.
# default=0 avoids a ValueError from max() when no articles were collected.
max_paragraphs = max((len(article) for article in articles_data), default=0)
column_names = [f'Paragraph {i+1}' for i in range(max_paragraphs)]
# Empty frame that carries only the column headers.
articles_df = pd.DataFrame(columns=column_names)

In [25]:
# One dict per article with a key for every 'Paragraph N' slot up to
# max_paragraphs; slots beyond the article's own length are filled with 0.
rows = [
    {
        f'Paragraph {pos+1}': (fragments[pos] if pos < len(fragments) else 0)
        for pos in range(max_paragraphs)
    }
    for fragments in articles_data
]


In [26]:
# Turn the per-article dicts into a table, with columns ordered as in column_names.
articles_df = pd.DataFrame(data=rows, columns=column_names)

In [27]:
# Place the paragraph columns to the right of the original input columns
# (rows are matched up by index label).
final_df = pd.concat(objs=[df, articles_df], axis='columns')

In [28]:
# Write the combined table to disk, leaving out the DataFrame index.
final_df.to_excel(excel_writer='output_with_paragraphs.xlsx', index=False)