In [None]:

# Install dependencies (if running for the first time)
!pip install requests html5lib beautifulsoup4 pandas numpy nltk seaborn matplotlib transformers gensim pyLDAvis spacy
!python -m nltk.downloader punkt stopwords averaged_perceptron_tagger wordnet
!python -m spacy download en_core_web_sm


In [None]:

import re
from urllib.parse import parse_qs, urlsplit

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
import requests
import seaborn as sns
import spacy
from bs4 import BeautifulSoup
from gensim import corpora
from transformers import pipeline

# NLTK resources used later in the notebook (tokenizer, stop-word list,
# POS tagger, WordNet for lemmatisation).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# Recent NLTK releases (3.9+) look these two identifiers up for
# word_tokenize / pos_tag instead of the pickled packages above. On an older
# release the unknown id is only reported as an error message, not raised.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')


In [None]:

def get_news_links(query):
    """Collect the article URLs linked from a Google News search for `query`.

    Google wraps each outbound result in a redirect of the form
    ``/url?q=<percent-encoded target>&sa=...``; the target is read from the
    ``q`` parameter of every such anchor.

    Args:
        query: Free-text search phrase. It is sent through ``params`` so that
            spaces, ``&``, ``#`` and non-ASCII characters are encoded properly
            instead of corrupting the request URL.

    Returns:
        list[str]: Decoded target URLs in page order, without duplicates.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(
        'https://www.google.com/search',
        params={'q': query, 'tbm': 'nws'},
        headers=headers,
        timeout=10,  # never hang the notebook on a stalled connection
    )
    # Fail loudly on e.g. HTTP 429 instead of silently returning no links.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    seen = set()
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if 'url?q=' not in href:
            continue
        # parse_qs both isolates the `q` value and percent-decodes it, so the
        # result is a usable URL regardless of what precedes "url?q=".
        targets = parse_qs(urlsplit(href).query).get('q')
        if not targets:
            continue
        target = targets[0]
        # The same article is usually linked several times on the page
        # (thumbnail, headline); keep only its first occurrence.
        if target not in seen:
            seen.add(target)
            links.append(target)
    return links

# Run the search once and show the first few links that were found.
query = "modi"
links = get_news_links(query)
df_links = pd.DataFrame({"Links": links})
df_links.head()


In [None]:

def fetch_article_text(url):
    """Download a page and return the text of its <p> elements.

    This is a best-effort helper: any failure (network error, timeout, HTTP
    error status, unparsable markup) is reported on stdout and produces an
    empty string, so one bad link does not abort a batch of downloads.

    Args:
        url: Address of the article to download.

    Returns:
        str: The paragraph texts joined by single spaces, or "" on failure.
    """
    # Same User-Agent as get_news_links; it replaces the default requests one.
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Without this check the body of a 403/404/5xx error page would be
        # returned as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return ' '.join(para.get_text() for para in soup.find_all('p'))
    except Exception as exc:
        print(f"Skipping {url}: {exc}")
        return ""

# Use top 5 links for speed
texts = list(map(fetch_article_text, links[:5]))
df = pd.DataFrame({'url': links[:5], 'text': texts})
df.head()


In [None]:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Shared resources for preprocess(): English stop words and a WordNet lemmatizer.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Turn raw text into a list of lemmatized content tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and tokenized, and tokens that are stop words or
    have two characters or fewer are dropped before lemmatization.
    """
    letters_only = re.sub(r"[^a-zA-Z]", " ", text)
    kept = []
    for word in word_tokenize(letters_only.lower()):
        if word in stop_words or len(word) <= 2:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return kept

# One token list per article, stored next to the raw text.
df['tokens'] = df['text'].map(preprocess)
df.head()


In [None]:

# spaCy pipeline used by extract_entities(); loaded once for the whole notebook.
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    """Run `nlp` on `text` and list the named entities it finds.

    Returns:
        A list of (entity text, entity label) tuples, in the order in which
        `doc.ents` yields them.
    """
    entities = []
    for ent in nlp(text).ents:
        entities.append((ent.text, ent.label_))
    return entities

# Entities come from the raw text; POS tags are computed on the cleaned tokens.
# NOTE(review): tagging stop-word-free, lemmatized tokens presumably lowers
# tagger accuracy compared with tagging the original sentences - confirm intent.
df['entities'] = df['text'].map(extract_entities)
df['pos_tags'] = df['tokens'].map(nltk.pos_tag)
df[['entities', 'pos_tags']].head()


In [None]:

# Build the sentiment classifier once; get_sentiment() reuses it for every
# article. No model name is passed, so whichever checkpoint the installed
# transformers release uses by default for this task is loaded.
sentiment_pipeline = pipeline("sentiment-analysis")

def get_sentiment(text):
    """Classify `text` with the module-level `sentiment_pipeline`.

    Args:
        text: Article text to score.

    Returns:
        tuple: ``(label, score)`` taken from the first pipeline result, or
        ``(None, None)`` when `text` is not a non-blank string or the
        pipeline raises.
    """
    # Articles whose download failed are stored as ""; scoring an empty
    # string would attach a meaningless label to them.
    if not isinstance(text, str) or not text.strip():
        return None, None
    try:
        # Let the tokenizer clip the input at the model's token limit. Slicing
        # to 512 *characters* keeps only roughly the first hundred words.
        result = sentiment_pipeline(text, truncation=True)[0]
        return result['label'], result['score']
    except Exception:
        # Best-effort: a model failure on one article must not stop the batch,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return None, None

# Expand each (label, score) pair into the two result columns.
df[['sentiment', 'score']] = df['text'].apply(
    lambda article: pd.Series(get_sentiment(article))
)
df[['text', 'sentiment', 'score']].head()


In [None]:

# Map every token to an integer id and convert each article to bag-of-words.
dictionary = corpora.Dictionary(df['tokens'])
corpus = [dictionary.doc2bow(tokens) for tokens in df['tokens']]

# LDA training is randomised; fixing random_state makes the topics identical
# on every "Restart & Run All" instead of changing from run to run.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

topics = lda_model.print_topics()
for topic in topics:
    print(topic)


In [None]:

# Prepare the pyLDAvis view of the fitted LDA model from the same corpus and
# dictionary it was trained on. The bare `vis` expression on the last line
# makes the prepared object the output of this cell.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis


In [None]:

# Bar chart of how many articles received each sentiment label.
ax = sns.countplot(x='sentiment', data=df)
ax.set_title("Sentiment Distribution of Articles")
plt.show()
