In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import string
from spacy.lang.en.stop_words import STOP_WORDS
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import spacy
import re

In [3]:
from warnings import simplefilter
simplefilter("ignore")

In [4]:
data = pd.read_csv('bbc_news_text_complexity_summarization.csv')

data.head()

Unnamed: 0,text,labels,no_sentences,Flesch Reading Ease Score,Dale-Chall Readability Score,text_rank_summary,lsa_summary
0,Ad sales boost Time Warner profit\n\nQuarterly...,business,26,62.17,9.72,It hopes to increase subscribers by offering t...,Its profits were buoyed by one-off gains which...
1,Dollar gains on Greenspan speech\n\nThe dollar...,business,17,65.56,9.09,The dollar has hit its highest level against t...,"""I think the chairman's taking a much more san..."
2,Yukos unit buyer faces loan claim\n\nThe owner...,business,14,69.21,9.66,The owners of embattled Russian oil giant Yuko...,Yukos' owner Menatep Group says it will ask Ro...
3,High fuel prices hit BA's profits\n\nBritish A...,business,24,62.98,9.86,Looking ahead to its full year results to Marc...,"Rod Eddington, BA's chief executive, said the ..."
4,Pernod takeover talk lifts Domecq\n\nShares in...,business,17,70.63,10.23,Reports in the Wall Street Journal and the Fin...,Shares in UK drinks and food firm Allied Domec...


In [5]:
nlp = spacy.load("en_core_web_sm")


In [6]:
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans(' ',' ',string.punctuation))
    text = re.sub('\[.*?\]','',text)
    text = re.sub('https?://\S+|www\.\S+','',text)
    text = re.sub('<.*?>+','',text)
    text = re.sub('\n','',text)
    text = re.sub('\w*\d\w*','',text)
    text = re.sub(r'\d+',' ',text)    
    text = re.sub(r'\s+',' ',text,flags=re.I)

    stop = stopwords.words('english')
    text_split = text.split()
    result = [word for word in text_split if word not in stop]
    text = ' '.join(result) 
        
    # Remove words shorter than 2 characters
    text_split = text.split()
    result = [word for word in text_split if len(word)>2]
    text = ' '.join(result) 
    # Using SpaCy for further processing and helps in recognising named entities
    doc = nlp(text)

    named_entities = set(ent.text.lower() for ent in doc.ents)

    cleaned_tokens = []
    for token in doc:
        if (
            token.text not in STOP_WORDS and 
            not token.is_punct and 
            not token.is_space and
            token.text.lower() not in named_entities
        ):
            cleaned_tokens.append(token.lemma_)

    cleaned_tokens = ' '.join(cleaned_tokens)
    
    return text

data['clean text']  = data['text'].apply(clean_text)



In [7]:
data.head()

Unnamed: 0,text,labels,no_sentences,Flesch Reading Ease Score,Dale-Chall Readability Score,text_rank_summary,lsa_summary,clean text
0,Ad sales boost Time Warner profit\n\nQuarterly...,business,26,62.17,9.72,It hopes to increase subscribers by offering t...,Its profits were buoyed by one-off gains which...,sales boost time warner profitquarterly profit...
1,Dollar gains on Greenspan speech\n\nThe dollar...,business,17,65.56,9.09,The dollar has hit its highest level against t...,"""I think the chairman's taking a much more san...",dollar gains greenspan speechthe dollar hit hi...
2,Yukos unit buyer faces loan claim\n\nThe owner...,business,14,69.21,9.66,The owners of embattled Russian oil giant Yuko...,Yukos' owner Menatep Group says it will ask Ro...,yukos unit buyer faces loan claimthe owners em...
3,High fuel prices hit BA's profits\n\nBritish A...,business,24,62.98,9.86,Looking ahead to its full year results to Marc...,"Rod Eddington, BA's chief executive, said the ...",high fuel prices hit bas profitsbritish airway...
4,Pernod takeover talk lifts Domecq\n\nShares in...,business,17,70.63,10.23,Reports in the Wall Street Journal and the Fin...,Shares in UK drinks and food firm Allied Domec...,pernod takeover talk lifts domecqshares drinks...


In [8]:
common_words = ['said']

def text_cleaning(data):
    return ' '.join(i for i in data.split() if i not in common_words)

data["clean text"] = data["clean text"].apply(text_cleaning)

data.head(5)

Unnamed: 0,text,labels,no_sentences,Flesch Reading Ease Score,Dale-Chall Readability Score,text_rank_summary,lsa_summary,clean text
0,Ad sales boost Time Warner profit\n\nQuarterly...,business,26,62.17,9.72,It hopes to increase subscribers by offering t...,Its profits were buoyed by one-off gains which...,sales boost time warner profitquarterly profit...
1,Dollar gains on Greenspan speech\n\nThe dollar...,business,17,65.56,9.09,The dollar has hit its highest level against t...,"""I think the chairman's taking a much more san...",dollar gains greenspan speechthe dollar hit hi...
2,Yukos unit buyer faces loan claim\n\nThe owner...,business,14,69.21,9.66,The owners of embattled Russian oil giant Yuko...,Yukos' owner Menatep Group says it will ask Ro...,yukos unit buyer faces loan claimthe owners em...
3,High fuel prices hit BA's profits\n\nBritish A...,business,24,62.98,9.86,Looking ahead to its full year results to Marc...,"Rod Eddington, BA's chief executive, said the ...",high fuel prices hit bas profitsbritish airway...
4,Pernod takeover talk lifts Domecq\n\nShares in...,business,17,70.63,10.23,Reports in the Wall Street Journal and the Fin...,Shares in UK drinks and food firm Allied Domec...,pernod takeover talk lifts domecqshares drinks...


TFIDF Vectorization

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(data['clean text'])
feature_names = tfidf_vectorizer.get_feature_names_out()


Tokenizer for Transformer based models

In [10]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encodings = tokenizer(
    list(data['clean text']),
    truncation=True,
    padding=True,
    max_length=512,  
    return_tensors='pt' 
)

input_ids = encodings['input_ids']
attention_masks = encodings['attention_mask']

Traditional ML Classification (Logistic Regression / Random Forest)

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [12]:
X = X_tfidf
y = data['labels']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
#LOGISTIC REGRESSION

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

In [14]:
#RANDOM FOREST

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [None]:

import pickle

# Save Logistic Regression model
with open('models/logistic_model.pkl', 'wb') as f:
    pickle.dump(lr_model, f)

# Save Random Forest model
with open('models/random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)


# import pickle

# # Load Logistic Regression model
# with open('logistic_model.pkl', 'rb') as f:
#     lr_model = pickle.load(f)

# # Load Random Forest model
# with open('random_forest_model.pkl', 'rb') as f:
#     rf_model = pickle.load(f)

# # Example usage:
# # prediction = lr_model.predict(X_test)


In [26]:
import torch
from transformers import pipeline, PegasusForConditionalGeneration, AutoTokenizer

In [None]:
#pegasus_summarize (Abstractive Summarizattion)
def pegasus_summarize(text):
    model_name = 'google/pegasus-cnn_dailymail'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
    batch = tokenizer(text, truncation=True, padding='longest', return_tensors="pt").to(device)
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
    tgt_text = tgt_text.replace('', '\n')
    return tgt_text

In [29]:
from bertopic import BERTopic





In [30]:

def topic_modeling(self, texts):
        """Topic modeling using BERTopic"""
        try:
            self.topic_model = BERTopic(nr_topics=5, verbose=False)
            topics, probs = self.topic_model.fit_transform(texts)
            return topics, probs
        except Exception as e:
            st.error(f"Error in topic modeling: {str(e)}")
            return None, None
    
def get_topic_keywords(self, text):
    """Get keywords for a text using topic model"""
    if self.topic_model is None:
        return []
        
    try:
        topic_info = self.topic_model.transform([text])
        topic_id = topic_info[0][0]
            
        if topic_id != -1:
            keywords = self.topic_model.get_topic(topic_id)
            return [word for word, _ in keywords[:10]]
        else:
            return []
    except:
        return []