# In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from textblob import TextBlob
def extract_article_title_and_text(url, timeout=10):
    """Download *url* and return the article's title and body text.

    The title is taken from the first ``<h1>`` element on the page. The body
    is the text of the ``<div>`` found with
    ``class_='td-post-content tagdiv-type'``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a single unresponsive host would block the
            caller indefinitely.

    Returns:
        A ``(title, article_text)`` tuple. If the page has no ``<h1>`` the
        title is ``'No Title Found'``; if the content ``<div>`` is missing
        the text is ``'No Content Found'``. If anything raises while fetching
        or parsing, the error is printed and ``(None, None)`` is returned.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn HTTP 4xx/5xx responses into exceptions so they are reported
        # below instead of being parsed as if they were articles.
        response.raise_for_status()

        # Parse the content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the title
        title_element = soup.find('h1')
        title = title_element.get_text().strip() if title_element else 'No Title Found'

        # Extract the main article text
        content_div = soup.find('div', class_='td-post-content tagdiv-type')
        article_text = content_div.get_text() if content_div else 'No Content Found'

        return title, article_text

    except Exception as e:
        # Deliberately broad: this is a best-effort scraper and callers rely
        # on the (None, None) sentinel rather than on an exception.
        print(f"Error extracting article from {url}: {e}")
        return None, None
    
def remove_url_gmails_no(text):
    """Strip e-mail addresses, URLs and digit runs from *text*.

    The substitutions are applied in that order, each replacing its matches
    with the empty string; surrounding whitespace is left as it was.
    """
    patterns = (
        r'\S+@\S+',                   # e-mail addresses
        r'http\S+|www\S+|https\S+',   # URLs (http, https, www)
        r'\d+',                       # numbers
    )
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    return text

def spelling_correction(text):
    """Return *text* after running it through TextBlob's ``correct()``."""
    blob = TextBlob(text)
    corrected_blob = blob.correct()
    return corrected_blob.string
# Load the input data: one row per article, read via the URL_ID and URL columns.
url_data = pd.read_excel('/Users/mnu/Desktop/NLP_task/Provided_data/Input.xlsx')
dataset = []

# Process each URL
for index, row in url_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    try:
        title, article_text = extract_article_title_and_text(url)
        # The extractor reports failure as (None, None). Check for that before
        # touching the text, otherwise the failure surfaces as an unrelated
        # TypeError and this skip message is never shown.
        if title is None or article_text is None:
            print(f"Skipping URL ID {url_id} due to extraction issues.")
            continue
        # Process the text
        # NOTE(review): only the article body is cleaned and stored; the title
        # is not part of the output. Confirm that this is intended.
        cleaned_text = remove_url_gmails_no(article_text)
        corrected_text = spelling_correction(cleaned_text).lower()
        dataset.append({'URL_ID': url_id, 'article_text': corrected_text})
    except Exception as e:
        # Keep going with the remaining URLs if one article fails to process.
        print(f"Error processing URL ID {url_id}: {e}")

# Persist the successfully processed articles.
Dataset = pd.DataFrame(dataset)
Dataset.to_csv('/Users/mnu/Desktop/NLP_task/Web_scraping/web_scraped_data.csv')