## **Import Libraries**

In [1]:
import requests
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VaderSentimentIntensityAnalyzer  # Import VADER sentiment analyzer
import matplotlib.pyplot as plt

In [5]:
import nltk

nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

## **Web Scraping and Text Preprocessing**

In [6]:
# Define the URL of the news website to scrape
url = "https://www.bbc.com/news/world"

# Send an HTTP GET request to the specified URL
response = requests.get(url)

# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Find all the title elements on the webpage
titles = soup.find_all('h3')

# Extract the text from the title elements and remove duplicates
title_texts = list(set(title.get_text() for title in titles))

# Data Preprocessing Function
def preprocess_text(text):
    # Remove special characters and extra whitespace, convert to lowercase
    processed_text = re.sub(r'[^a-zA-Z\s]', '', text).strip().lower()
    # Remove common English stopwords
    stop_words = set(stopwords.words('english'))
    processed_text = ' '.join(word for word in processed_text.split() if word not in stop_words)
    return processed_text

# Apply preprocessing to the titles, excluding the last three irrelevant ones
preprocessed_titles = [preprocess_text(title) for title in title_texts[:-3]]

# Topic Modeling Function
def perform_topic_modeling(texts, num_topics=5):
    vectorizer = CountVectorizer()
    text_matrix = vectorizer.fit_transform(texts)
    # Apply Latent Dirichlet Allocation, a topic modeling algorithm, to the text matrix
    lda_model = LatentDirichletAllocation(n_components=num_topics)
    lda_output = lda_model.fit_transform(text_matrix)
    # Retrieve the feature names from the vectorizer and generate a list of topic words for each topic
    words = vectorizer.get_feature_names_out()
    topic_word_list = []
    for topic in lda_model.components_:
        topic_words = [words[i] for i in topic.argsort()[-20:]]
        topic_word_list.append(topic_words)
    return vectorizer, lda_model, topic_word_list, lda_output

# Specify the number of topics for the LDA model
num_topics = 3

# Perform topic modeling on the preprocessed titles
vectorizer, lda_model, topic_word_list, lda_output = perform_topic_modeling(preprocessed_titles, num_topics)

# Sentiment Classification
def classify_sentiment(text):
    # Use SentimentIntensityAnalyzer from NLTK to analyze the sentiment of the text
    analyzer = SentimentIntensityAnalyzer()
    sentiment_scores = analyzer.polarity_scores(text)
    # Classify the sentiment as Positive if the compound score is >= 0, otherwise as Negative
    sentiment_label = 'Positive' if sentiment_scores['compound'] >= 0 else 'Negative'
    return sentiment_label

# Political Sentiment Classification using VADER
def classify_political_sentiment(text):
    analyzer = VaderSentimentIntensityAnalyzer()
    sentiment_scores = analyzer.polarity_scores(text)
    # Classify political sentiment as "Left" if compound score is negative, "Right" if positive, "Neutral" if zero
    if sentiment_scores['compound'] < 0:
        return 'Left'
    elif sentiment_scores['compound'] > 0:
        return 'Right'
    else:
        return 'Neutral'

# Initialize lists to store the results
title_list = []
preprocessed_text_list = []
topic_list = []
sentiment_list = []
topic_percent_list = []
political_sentiment_list = []

# Process the data and append the results to the lists
for i, title in enumerate(title_texts[:-3]):  # Exclude the last three titles
    if title not in ["Get in touch", "Mobile app", "News daily newsletter"]:
        title_list.append(title)
        preprocessed_text = preprocess_text(title)
        preprocessed_text_list.append(preprocessed_text)
        # Use topic model to assign a topic to the title and calculate the highest topic score
        topic = lda_model.transform(vectorizer.transform([preprocessed_text]))
        highest_topic = np.argmax(topic) + 1
        highest_score = np.max(topic)
        topic_str = f"{highest_score:.3f}% Topic:{highest_topic}"
        # Append the topic information to the topic_list
        topic_list.append(topic_str)
        sentiment = classify_sentiment(title)
        sentiment_list.append(sentiment)
        # Append the topic percentages to the topic_percent_list
        topic_percent_list.append([round(p, 3) for p in topic[0]])
        # Classify political sentiment
        political_sentiment = classify_political_sentiment(title)
        political_sentiment_list.append(political_sentiment)

# Create a dictionary to store the data
data = {
    'Title': title_list,
    'Preprocessed Text': preprocessed_text_list,
    'Top Topic': topic_list,
    'Sentiment Label': sentiment_list,
    'Sentiment Scores': topic_percent_list,
    'Political Sentiment': political_sentiment_list
}

# Create a DataFrame from the dictionary
df = pd.DataFrame(data)

# Display the DataFrame
pd.set_option('display.max_colwidth', None)
print(df.to_string(index=False))


                                                    Title                                       Preprocessed Text      Top Topic Sentiment Label      Sentiment Scores Political Sentiment
         French supermarket puts up 'shrinkflation' signs             french supermarket puts shrinkflation signs 0.888% Topic:3        Positive [0.056, 0.056, 0.888]             Neutral
                    He ended the Bongo dynasty. Now what?                                     ended bongo dynasty 0.832% Topic:2        Positive [0.084, 0.832, 0.084]             Neutral
         Kim Jong Un extends Russia visit by several days           kim jong un extends russia visit several days 0.925% Topic:2        Positive [0.037, 0.925, 0.037]               Right
        Watch: Rescuers save missing dog trapped in river           watch rescuers save missing dog trapped river 0.916% Topic:2        Negative [0.042, 0.916, 0.042]                Left
  Taliban welcomes first new Chinese envoy since takeover taliban