In [2]:
import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from transformers import BertTokenizer, BertModel
import torch

# Download stopwords from NLTK
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
column_names = [
    "News ID",
    "Category",
    "SubCategory",
    "Title",
    "Abstract",
    "URL",
    "Title Entities",
    "Abstract Entities"
]

data = pd.read_csv('news.tsv', sep='\t', header=None, names=column_names)
titles = data['Title']
data['Abstract'].fillna('', inplace=True)
abstracts = data['Abstract']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Abstract'].fillna('', inplace=True)


In [26]:
# Load the spaCy language model
nlp = spacy.load("en_core_web_sm")

# Define the preprocessing function
def preprocess_text(doc):
    """
    Preprocesses the input text by removing stopwords, punctuation, and whitespace.

    Args:
        doc (str): The input text document.

    Returns:
        list: A list of tokens after preprocessing.
    """
    if not isinstance(doc, str):  # Ensure input is a string
        return []

    tokenized_doc = nlp(doc)
    return [token.text for token in tokenized_doc
            if not token.is_space and
               not token.is_stop and
               not token.is_punct]

# Define a function to process a column of text data in a pandas DataFrame
def preprocess_pipeline(dataframe, text_column, output_column):
    """
    Processes a column of text data in a pandas DataFrame.

    Args:
        dataframe (pd.DataFrame): The input DataFrame.
        text_column (str): The name of the column containing text data to preprocess.
        output_column (str): The name of the column to store the processed tokens.

    Returns:
        pd.DataFrame: The DataFrame with the added processed column.
    """
    dataframe[output_column] = dataframe[text_column].apply(preprocess_text)
    return dataframe

# Example usage with a DataFrame
# Load your data here (make sure to adjust the file path)
column_names = [
    "News ID",
    "Category",
    "SubCategory",
    "Title",
    "Abstract",
    "URL",
    "Title Entities",
    "Abstract Entities"
]

data = pd.read_csv('news.tsv', sep='\t', header=None, names=column_names)

data['Abstract'].fillna('', inplace=True)
# Apply the preprocessing pipeline
processed_data = preprocess_pipeline(data, text_column='Abstract', output_column='processed_abstract')

# Display the results
print(processed_data[['Abstract', 'processed_abstract']])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Abstract'].fillna('', inplace=True)


                                                 Abstract  \
0       Shop the notebooks, jackets, and more that the...   
1       Apple's new iPad releases bring big deals on l...   
2       These seemingly harmless habits are holding yo...   
3                                                           
4       Lt. Ivan Molchanets peeked over a parapet of s...   
...                                                   ...   
101522  Sometimes, what happens on the sidelines can b...   
101523  Among the perks of this unique Des Moines home...   
101524  Mark, Jeremiah and Casey were so excited they ...   
101525                                                      
101526                                                      

                                       processed_abstract  
0                [Shop, notebooks, jackets, royals, live]  
1       [Apple, new, iPad, releases, bring, big, deals...  
2       [seemingly, harmless, habits, holding, keeping...  
3                          

In [27]:
data['processed_abstract_str'] = data['processed_abstract'].apply(lambda tokens: ' '.join(tokens) if isinstance(tokens, list) else '')
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['processed_abstract_str'])

# Inspect the TF-IDF matrix shape
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (101527, 68900)


In [33]:
# Here you would implement PAM or use an existing library if available.
n_topics = 50  # Define the number of topics
nmf_model = NMF(n_components=n_topics, random_state=1)
nmf_model.fit(tfidf_matrix)

In [34]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

display_topics(nmf_model, tfidf_vectorizer.get_feature_names_out(), 5)


Topic 0:
said officials sign newsletters authorities
Topic 1:
weather forecast today week latest
Topic 2:
apartments curious dollar goes amenities
Topic 3:
articles trending readers links headlines
Topic 4:
nfl coach quarterback play head
Topic 5:
trump president donald ukraine vice
Topic 6:
ratings businesses photos rundown hoodline
Topic 7:
new york orleans jersey jets
Topic 8:
school high students district schools
Topic 9:
fire firefighters department crews officials
Topic 10:
year old girl boy years
Topic 11:
world series astros nationals washington
Topic 12:
state michigan penn ohio saturday
Topic 13:
week bye steelers 10 watch
Topic 14:
like end days low rental
Topic 15:
season games regular start nba
Topic 16:
adoption listings pet centers browse
Topic 17:
read neighborhood details opened spot
Topic 18:
morning early tuesday start wednesday
Topic 19:
crash road car highway vehicle
Topic 20:
county sheriff office deputies according
Topic 21:
looking check fun picks entertaining
T

In [35]:
def predict_topic_word(new_abstract):
    """
    Predicts the most representative word for the topic of a new abstract.

    Args:
        new_abstract (str): The new abstract to classify.

    Returns:
        str: The most representative word for the predicted topic.
    """
    # Preprocess the new abstract
    new_processed = preprocess_text(new_abstract)
    # Join tokens into a single string
    new_processed_str = ' '.join(new_processed)
    # Transform using the TF-IDF vectorizer
    new_tfidf = tfidf_vectorizer.transform([new_processed_str])
    # Transform using the NMF model to get topic distribution
    new_topic_distribution = nmf_model.transform(new_tfidf)
    # Get the topic index with the highest probability
    predicted_topic_idx = new_topic_distribution.argmax(axis=1)[0]
    # Get the most representative word for the predicted topic
    topic_word_idx = nmf_model.components_[predicted_topic_idx].argmax()
    most_representative_word = tfidf_vectorizer.get_feature_names_out()[topic_word_idx]
    return most_representative_word

# Example of predicting the most representative word for a new article
new_article = "New advancements in AI are changing the landscape of technology."
predicted_word = predict_topic_word(new_article)
print(f"The most representative word for the topic of the new article is: {predicted_word}")

The most representative word for the topic of the new article is: new


In [36]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Example of getting BERT embeddings for an abstract
bert_embedding_example = get_bert_embeddings(data['processed_abstract'][0])
print(f"BERT embedding shape: {bert_embedding_example.shape}")


BERT embedding shape: (5, 768)


In [24]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def generate_topic_wordclouds(nmf_model, tfidf_vectorizer, num_topics):
    """
    Generates word clouds for all topics from the trained NMF model.

    Args:
        nmf_model: The trained NMF model.
        tfidf_vectorizer: The fitted TF-IDF vectorizer.
        num_topics: The number of topics in the model.
    """
    # Get the feature names (vocabulary) from the TF-IDF vectorizer
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Loop through each topic
    for topic_idx, topic in enumerate(nmf_model.components_):
        # Create a dictionary of words and their weights
        topic_words = {feature_names[i]: topic[i] for i in topic.argsort()[:-21:-1]}  # Top 20 words
        # Generate a word cloud
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(topic_words)

        # Plot the word cloud
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis('off')
        plt.title(f"Topic {topic_idx}", fontsize=16)
        plt.show()

# Example usage
num_topics = nmf_model.components_.shape[0]
generate_topic_wordclouds(nmf_model, tfidf_vectorizer, num_topics)


ModuleNotFoundError: No module named 'wordcloud'