<a href="https://colab.research.google.com/github/moinul-alam/BanglaFonts/blob/main/NLP_Assignment_Keyword_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
# Import Libraries
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter
import spacy
import os

In [None]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

In [58]:
import requests
from bs4 import BeautifulSoup

def fetch_content_from_url(url):
    # Fetch the HTML content from the URL
    response = requests.get(url)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the text content from the HTML
    text = soup.get_text()

    # Limit the text to the first 1000 words
    words = text.split()
    limited_words = words[:2000]

    # Join the words back into a single string
    document_text = ' '.join(limited_words)

    return document_text

# Example usage: Replace with the URL you want to scrape
url = 'https://www.bbc.com/news/articles/c390mrmxndyo'
document_text = fetch_content_from_url(url)

In [59]:
print(document_text)

Trump to be sentenced in hush money case on 10 JanuarySkip to contentBritish Broadcasting CorporationWatch LiveHomeNewsSportBusinessInnovationCultureArtsTravelEarthVideoLiveHomeNewsIsrael-Gaza WarWar in UkraineUS & CanadaUKUK PoliticsEnglandN. IrelandN. Ireland PoliticsScotlandScotland PoliticsWalesWales PoliticsAfricaAsiaChinaIndiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifySportBusinessExecutive LoungeTechnology of BusinessFuture of BusinessInnovationTechnologyScience & HealthArtificial IntelligenceAI v the MindCultureFilm & TVMusicArt & DesignStyleBooksEntertainment NewsArtsArts in MotionTravelDestinationsAfricaAntarcticaAsiaAustralia and PacificCaribbean & BermudaCentral AmericaEuropeMiddle EastNorth AmericaSouth AmericaWorld’s TableCulture & ExperiencesAdventuresThe SpeciaListEarthNatural WondersWeather & ScienceClimate SolutionsSustainable BusinessGreen LivingVideoLiveLive NewsLive SportHomeNewsSportBusinessInnovationCultureArtsTravelEarthVideoLiveAudio

In [60]:
# Segmentation
sentences = sent_tokenize(document_text)

In [61]:
print(sentences[:5])

['Trump to be sentenced in hush money case on 10 JanuarySkip to contentBritish Broadcasting CorporationWatch LiveHomeNewsSportBusinessInnovationCultureArtsTravelEarthVideoLiveHomeNewsIsrael-Gaza WarWar in UkraineUS & CanadaUKUK PoliticsEnglandN.', 'IrelandN.', 'Ireland PoliticsScotlandScotland PoliticsWalesWales PoliticsAfricaAsiaChinaIndiaAustraliaEuropeLatin AmericaMiddle EastIn PicturesBBC InDepthBBC VerifySportBusinessExecutive LoungeTechnology of BusinessFuture of BusinessInnovationTechnologyScience & HealthArtificial IntelligenceAI v the MindCultureFilm & TVMusicArt & DesignStyleBooksEntertainment NewsArtsArts in MotionTravelDestinationsAfricaAntarcticaAsiaAustralia and PacificCaribbean & BermudaCentral AmericaEuropeMiddle EastNorth AmericaSouth AmericaWorld’s TableCulture & ExperiencesAdventuresThe SpeciaListEarthNatural WondersWeather & ScienceClimate SolutionsSustainable BusinessGreen LivingVideoLiveLive NewsLive SportHomeNewsSportBusinessInnovationCultureArtsTravelEarthVideoL

In [62]:
# Tokenization
# words = word_tokenize(document_text)
words = []
for sentence in sentences:
    words.extend(word_tokenize(sentence))

In [63]:
print(words[:10])

['Trump', 'to', 'be', 'sentenced', 'in', 'hush', 'money', 'case', 'on', '10']


In [64]:
# Remove Stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words and word.isalpha()]

In [65]:
print(filtered_words[:10])

['Trump', 'sentenced', 'hush', 'money', 'case', 'JanuarySkip', 'contentBritish', 'Broadcasting', 'CorporationWatch', 'WarWar']


In [66]:
#Stemming
#stemmer = PorterStemmer()
#stemmed_words = [stemmer.stem(word) for word in filtered_words]

In [67]:
#print(stemmed_words[:10])

In [68]:
# #Lemmatization
lemmatizer = WordNetLemmatizer()
# lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
# Function to get POS tag for lemmatization
def get_wordnet_pos(word):
    from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
    tag = nltk.pos_tag([word])[0][1][0].upper()
    return {
        'J': ADJ,  # Adjective
        'V': VERB,  # Verb
        'N': NOUN,  # Noun
        'R': ADV,   # Adverb
    }.get(tag, NOUN)  # Default to noun

lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in filtered_words]


In [69]:
print(lemmatized_words[:10])

['Trump', 'sentence', 'hush', 'money', 'case', 'JanuarySkip', 'contentBritish', 'Broadcasting', 'CorporationWatch', 'WarWar']


In [70]:
# Parsing for Syntactic Analysis
nlp = spacy.load('en_core_web_sm')

def parse_sentences(sentences):
    parsed_data = []
    for sentence in sentences:
        doc = nlp(sentence)
        parsed_sentence = [
            {
                "text": token.text,
                "lemma": token.lemma_,
                "pos": token.pos_,
                "tag": token.tag_,
                "dependency": token.dep_,
            }
            for token in doc
        ]
        parsed_data.append(parsed_sentence)
    return parsed_data

parsed_sentences = parse_sentences(sentences)


In [71]:
# Display Parsing Results for the First Sentence
print("Parsed Data for the First Sentence:")
for token in parsed_sentences[0]:
    print(token)

Parsed Data for the First Sentence:
{'text': 'Trump', 'lemma': 'trump', 'pos': 'VERB', 'tag': 'VB', 'dependency': 'ROOT'}
{'text': 'to', 'lemma': 'to', 'pos': 'PART', 'tag': 'TO', 'dependency': 'aux'}
{'text': 'be', 'lemma': 'be', 'pos': 'AUX', 'tag': 'VB', 'dependency': 'auxpass'}
{'text': 'sentenced', 'lemma': 'sentence', 'pos': 'VERB', 'tag': 'VBN', 'dependency': 'xcomp'}
{'text': 'in', 'lemma': 'in', 'pos': 'ADP', 'tag': 'IN', 'dependency': 'prep'}
{'text': 'hush', 'lemma': 'hush', 'pos': 'ADJ', 'tag': 'JJ', 'dependency': 'amod'}
{'text': 'money', 'lemma': 'money', 'pos': 'NOUN', 'tag': 'NN', 'dependency': 'compound'}
{'text': 'case', 'lemma': 'case', 'pos': 'NOUN', 'tag': 'NN', 'dependency': 'pobj'}
{'text': 'on', 'lemma': 'on', 'pos': 'ADP', 'tag': 'IN', 'dependency': 'prep'}
{'text': '10', 'lemma': '10', 'pos': 'NUM', 'tag': 'CD', 'dependency': 'nummod'}
{'text': 'JanuarySkip', 'lemma': 'JanuarySkip', 'pos': 'PROPN', 'tag': 'NNP', 'dependency': 'pobj'}
{'text': 'to', 'lemma': 't

In [72]:
#Count Word Frequencies (Using Lemmatized Words)
word_freq = Counter(lemmatized_words)

In [73]:
# Display Top Keywords
top_keywords = word_freq.most_common(10)
print("\nTop Keywords and Frequencies:")
for keyword, freq in top_keywords:
    print(f"{keyword}: {freq}")


Top Keywords and Frequencies:
Trump: 16
hr: 13
case: 12
sentence: 10
agoUS: 6
order: 5
would: 5
presidential: 5
judge: 4
Justice: 4
