<a href="https://colab.research.google.com/github/p-tech/BDAI-Files/blob/main/Page_Topic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
# prompt: add a function to suggest a topic for the page using the keywords even if using ChatGPT to do this

import requests
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from nltk.probability import FreqDist
import pandas as pd


def scrape_article(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    text = " ".join(p.get_text(strip=True) for p in soup.find_all('p'))  # Combine all paragraph text
    return text


nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [
        token.lower() for token in tokens
        if token.lower() not in stop_words and token not in string.punctuation
    ]

    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    stemmed_tokens =  filtered_tokens
    return stemmed_tokens


def get_main_keywords(tokens, n=20):
    freq_dist = FreqDist(tokens)
    main_keywords = freq_dist.most_common(n)
    return main_keywords


def keywords_to_dataframe(keywords):
    df = pd.DataFrame(keywords, columns=['keyword', 'frequency'])
    return df

def suggest_topic(keywords):
    # Basic topic suggestion based on keywords (can be improved)
    topic = ", ".join([keyword for keyword, freq in keywords[:3]])
    return f"Suggested Topic: {topic}"

def analyze_article(url):
    text = scrape_article(url)
    preprocessed_text = preprocess_text(text)
    main_keywords = get_main_keywords(preprocessed_text)
    keywords_df = keywords_to_dataframe(main_keywords)
    suggested_topic = suggest_topic(main_keywords) #Generate topic suggestion
    return keywords_df, suggested_topic

url = 'https://warwick.ac.uk/fac/sci/wmg/news-and-events/news/wmgnews/celebrating_excellence_wmg'
keywords_df, suggested_topic = analyze_article(url)
print(keywords_df)
suggested_topic

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


          keyword  frequency
0             wmg         10
1               ’          8
2        scholars          7
3    scholarships          6
4     scholarship          5
5        students          4
6         funding          4
7           opens          3
8             new          3
9           event          3
10      celebrate          3
11  organisations          3
12           also          3
13      education          3
14              “          3
15              ”          3
16          study          3
17       thursday          2
18        january          2
19           2025          2


'Suggested Topic: wmg, ’, scholars'