In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import warnings
warnings.filterwarnings('ignore')

# Plot styling: matplotlib's 'ggplot' sheet first, then seaborn's 'whitegrid'
# theme (same order as before, so the seaborn settings are applied last).
plt.style.use('ggplot')
sns.set(style='whitegrid')

# NLTK resources used by this notebook (tokenizer, stopwords, WordNet, POS tagger, VADER).
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names that
# recent NLTK releases look up for word_tokenize / pos_tag; the legacy names are
# kept so older installations keep working.
# NOTE(review): an NLTK version that does not know one of these names is expected
# to report it and carry on rather than raise - confirm on the pinned version.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'vader_lexicon',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)


In [None]:
def load_data(file_path, max_rows=10000):
    """
    Load a JSON-lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Path of the UTF-8 encoded JSON-lines file.
    max_rows : int or None, default 10000
        Maximum number of records to load. ``None`` loads the whole file;
        a value of 0 or less loads nothing.

    Returns
    -------
    pandas.DataFrame
        One row per parsed record, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Stream line by line so the file is never held in memory as a whole
        for line in f:
            # Check the limit *before* appending so exactly max_rows records
            # are kept (checking afterwards loaded one record too many)
            if max_rows is not None and len(data) >= max_rows:
                break
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record
                continue
            data.append(json.loads(line))

    # Convert to DataFrame
    return pd.DataFrame(data)

# Path of the news-category dataset that load_data reads line by line
file_path = "archive/News_Category_Dataset_v3.json"

# Read the records into the working DataFrame
print("Loading data...")
df = load_data(file_path)

# Quick overview: shape, column names, then the first rows as rich output
print(
    f"Dataset Shape: {df.shape}",
    "\nColumns in the dataset:",
    df.columns.tolist(),
    "\nSample data:",
    sep="\n",
)
df.head()


In [None]:
# Number of articles per category
category_counts = df['category'].value_counts()

# Bar chart of the first ten entries of the category counts
top_categories = category_counts.iloc[:10]
plt.figure(figsize=(12, 6))
sns.barplot(x=top_categories.values, y=top_categories.index)
plt.title('Top 10 News Categories')
plt.xlabel('Count')
plt.ylabel('Category')
plt.tight_layout()
plt.show()


In [None]:
# Shared, lazily-filled holder for the stopword set and the lemmatizer so they
# are built once instead of once per headline.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop_words, lemmatizer) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """
    Normalize a piece of text for vectorization.

    Steps, in order: lowercase, delete every character that is not an ASCII
    letter or whitespace, tokenize, drop English stopwords, lemmatize each
    remaining token with the lemmatizer's default settings.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and NaN are treated as missing and yield ``''``;
        any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    # Missing values (None / float NaN) have no tokens
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercase first so the stopword comparison below is case-insensitive
    text = str(text).lower()

    # Remove special characters and numbers. Characters are deleted, not
    # replaced by a space, so "well-known" becomes "wellknown".
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    stop_words, lemmatizer = _get_preprocess_resources()

    # Stopword removal followed by lemmatization
    kept = [token for token in tokens if token not in stop_words]
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)

def apply_pos_tagging(text):
    """
    Tokenize ``text`` and tag each token with its part of speech.

    Returns the result of ``pos_tag`` on the token list, which callers
    iterate as (word, tag) pairs.
    """
    return pos_tag(word_tokenize(text))

# Run every headline through the preprocessing pipeline and keep the result
# next to the original text
print("Preprocessing headlines...")
df['processed_headline'] = df['headline'].map(preprocess_text)

# First five rows, raw headline beside its processed form
sample_headlines = df.loc[:, ['headline', 'processed_headline']].iloc[:5]
sample_headlines


In [None]:
# Tag the very first (unprocessed) headline as a worked example
sample_headline = df['headline'].iat[0]
tagged_words = apply_pos_tagging(sample_headline)

print(f"Original headline: {sample_headline}", "\nPOS tags:", sep="\n")
# One "token: tag" line per tagged token
for token, pos in tagged_words:
    print(f"{token}: {pos}")


In [None]:
def vectorize_bow(texts, max_features=5000):
    """
    Build a Bag-of-Words representation of ``texts``.

    A ``CountVectorizer`` limited to ``max_features`` terms is fitted on the
    texts. Returns ``(matrix, fitted_vectorizer)``.
    """
    count_vectorizer = CountVectorizer(max_features=max_features)
    counts = count_vectorizer.fit_transform(texts)
    return counts, count_vectorizer

def vectorize_tfidf(texts, max_features=5000):
    """
    Build a TF-IDF representation of ``texts``.

    A ``TfidfVectorizer`` limited to ``max_features`` terms is fitted on the
    texts. Returns ``(matrix, fitted_vectorizer)``.
    """
    tfidf = TfidfVectorizer(max_features=max_features)
    weights = tfidf.fit_transform(texts)
    return weights, tfidf

print("Applying vectorization methods...")

# Fit both vectorizers on the same processed headlines
bow_matrix, bow_vectorizer = vectorize_bow(df['processed_headline'])
tfidf_matrix, tfidf_vectorizer = vectorize_tfidf(df['processed_headline'])

# Report the (documents, features) dimensions of each matrix
print(
    f"Bag of Words matrix shape: {bow_matrix.shape}",
    f"TF-IDF matrix shape: {tfidf_matrix.shape}",
    sep="\n",
)


In [None]:
# Pick one headline and look at how each vectorizer encodes it
sample_idx = 0
original_headline = df['headline'].iloc[sample_idx]
sample_headline_processed = df['processed_headline'].iloc[sample_idx]

print(f"Original headline: {original_headline}")
print(f"Processed headline: {sample_headline_processed}")

# Bag of Words: dense row of term counts plus the vocabulary
sample_bow = bow_vectorizer.transform([sample_headline_processed])
sample_bow_array = sample_bow.toarray()[0]
sample_bow_features = bow_vectorizer.get_feature_names_out()

# TF-IDF: dense row of weights plus the vocabulary
sample_tfidf = tfidf_vectorizer.transform([sample_headline_processed])
sample_tfidf_array = sample_tfidf.toarray()[0]
sample_tfidf_features = tfidf_vectorizer.get_feature_names_out()

# List at most the first ten terms that are actually present in the headline
print("\nBag of Words representation (non-zero elements):")
for idx in np.flatnonzero(sample_bow_array)[:10]:
    print(f"{sample_bow_features[idx]}: {sample_bow_array[idx]}")

print("\nTF-IDF representation (non-zero elements):")
for idx in np.flatnonzero(sample_tfidf_array)[:10]:
    print(f"{sample_tfidf_features[idx]}: {sample_tfidf_array[idx]:.4f}")


In [None]:
def analyze_sentiment_textblob(text):
    """
    Label ``text`` as 'Positive', 'Negative' or 'Neutral' using TextBlob.

    The TextBlob polarity must be above 0.1 for 'Positive' or below -0.1 for
    'Negative'; everything in between is 'Neutral'.
    """
    score = TextBlob(text).sentiment.polarity

    if score > 0.1:
        return 'Positive'
    if score < -0.1:
        return 'Negative'
    return 'Neutral'

# Holder for the single VADER analyzer shared by all analyze_sentiment_vader calls
_VADER_ANALYZER = {}


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    if 'sid' not in _VADER_ANALYZER:
        _VADER_ANALYZER['sid'] = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER['sid']


def analyze_sentiment_vader(text):
    """
    Label ``text`` as 'Positive', 'Negative' or 'Neutral' using VADER.

    The label is derived from the 'compound' score: >= 0.05 is 'Positive',
    <= -0.05 is 'Negative', anything in between is 'Neutral'.

    The analyzer is created once and reused, instead of being constructed
    again for every text that is scored.
    """
    compound = _get_vader_analyzer().polarity_scores(text)['compound']

    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'

print("Performing sentiment analysis...")

# Score a reproducible random sample (at most 1000 headlines) with both
# analyzers; each label column is derived from the raw headline
sample_size = min(1000, len(df))
sample_df = df.sample(n=sample_size, random_state=42).assign(
    sentiment_textblob=lambda frame: frame['headline'].map(analyze_sentiment_textblob),
    sentiment_vader=lambda frame: frame['headline'].map(analyze_sentiment_vader),
)

# Label frequencies per analyzer
textblob_sentiments = sample_df['sentiment_textblob'].value_counts()
vader_sentiments = sample_df['sentiment_vader'].value_counts()

print(
    "\nTextBlob Sentiment Distribution:",
    textblob_sentiments,
    "\nVADER Sentiment Distribution:",
    vader_sentiments,
    sep="\n",
)


In [None]:
# Side-by-side bar charts of the label counts from each analyzer
fig, (ax_textblob, ax_vader) = plt.subplots(1, 2, figsize=(12, 5))

sns.barplot(x=textblob_sentiments.index, y=textblob_sentiments.values, ax=ax_textblob)
ax_textblob.set_title('TextBlob Sentiment Distribution')
ax_textblob.set_ylabel('Count')

sns.barplot(x=vader_sentiments.index, y=vader_sentiments.values, ax=ax_vader)
ax_vader.set_title('VADER Sentiment Distribution')
ax_vader.set_ylabel('Count')

fig.tight_layout()
plt.show()


In [None]:
# Compare sentiment analysis methods with examples
print("Examples of headlines with different sentiment classifications:")

# First five sampled headlines on which the two analyzers assign different labels
disagreement = sample_df[sample_df['sentiment_textblob'] != sample_df['sentiment_vader']].head(5)
print("\nDisagreement between TextBlob and VADER:")

# One analyzer serves every row; it does not need to be rebuilt per headline
sid = SentimentIntensityAnalyzer()
for _row_label, row in disagreement.iterrows():
    print(f"Headline: {row['headline']}")
    print(f"TextBlob: {row['sentiment_textblob']}, VADER: {row['sentiment_vader']}")

    # Show the raw scores behind the two labels
    tb = TextBlob(row['headline'])
    vader_scores = sid.polarity_scores(row['headline'])

    print(f"TextBlob polarity: {tb.sentiment.polarity:.4f}")
    print(f"VADER scores: {vader_scores}")
    print("-" * 80)


In [None]:
def extract_top_words(components, feature_names, num_words=10):
    """
    List the highest-weighted words of every topic.

    Parameters
    ----------
    components : iterable of 1-D arrays
        One weight vector per topic (e.g. a fitted model's ``components_``);
        position ``i`` in a vector is the weight of ``feature_names[i]``.
    feature_names : sequence
        Vocabulary, indexable by feature position.
    num_words : int, default 10
        How many words to keep per topic.

    Returns
    -------
    list of list
        For each topic, its words ordered from highest to lowest weight.
    """
    return [
        # argsort is ascending, so reverse it before taking the leading entries
        [feature_names[i] for i in weights.argsort()[::-1][:num_words]]
        for weights in components
    ]


def perform_lda(vector_matrix, vectorizer, num_topics=10, num_words=10):
    """
    Fit an LDA topic model and list the top words of each topic.

    Parameters
    ----------
    vector_matrix : document-term matrix to fit on.
    vectorizer : fitted vectorizer that produced ``vector_matrix``; only its
        ``get_feature_names_out()`` is used.
    num_topics : int, default 10
        Number of topics (``n_components``).
    num_words : int, default 10
        Number of top words reported per topic.

    Returns
    -------
    (model, topics) : the fitted ``LatentDirichletAllocation`` and a list with
        one list of top words per topic.
    """
    # Fixed random_state keeps the topics reproducible between runs
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(vector_matrix)

    topics = extract_top_words(lda.components_, vectorizer.get_feature_names_out(), num_words)
    return lda, topics


def perform_nmf(vector_matrix, vectorizer, num_topics=10, num_words=10):
    """
    Fit an NMF topic model and list the top words of each topic.

    Parameters
    ----------
    vector_matrix : document-term matrix to fit on.
    vectorizer : fitted vectorizer that produced ``vector_matrix``; only its
        ``get_feature_names_out()`` is used.
    num_topics : int, default 10
        Number of topics (``n_components``).
    num_words : int, default 10
        Number of top words reported per topic.

    Returns
    -------
    (model, topics) : the fitted ``NMF`` and a list with one list of top
        words per topic.
    """
    # Fixed random_state keeps the topics reproducible between runs
    nmf = NMF(n_components=num_topics, random_state=42)
    nmf.fit(vector_matrix)

    topics = extract_top_words(nmf.components_, vectorizer.get_feature_names_out(), num_words)
    return nmf, topics

# Perform topic modeling with a smaller number of topics for faster execution
num_topics = 5

# LDA is a probabilistic model of word counts, so it is fitted on the
# Bag-of-Words matrix rather than on TF-IDF weights.
print("\nRunning LDA with Bag of Words counts...")
lda_model, lda_topics = perform_lda(bow_matrix, bow_vectorizer, num_topics=num_topics)

# NMF factorizes the weighted document-term matrix, so it keeps the TF-IDF input
print("\nRunning NMF with TF-IDF...")
nmf_model, nmf_topics = perform_nmf(tfidf_matrix, tfidf_vectorizer, num_topics=num_topics)


In [None]:
# Print the top words of every topic, LDA first and NMF second
for heading, model_topics in (("\nLDA Topics:", lda_topics), ("\nNMF Topics:", nmf_topics)):
    print(heading)
    for topic_no, topic_words in enumerate(model_topics, start=1):
        print(f"Topic {topic_no}: {', '.join(topic_words)}")


In [None]:
# Optional visualization: draw one word cloud per topic when the wordcloud
# package is available, otherwise print installation instructions
try:
    from wordcloud import WordCloud

    def display_wordcloud(topic_words, title):
        """Draw the given topic words as a word cloud under ``title``."""
        cloud_image = WordCloud(
            background_color='white',
            width=800,
            height=400,
            max_words=100
        ).generate(' '.join(topic_words))

        plt.figure(figsize=(10, 6))
        plt.imshow(cloud_image, interpolation='bilinear')
        plt.axis('off')
        plt.title(title)
        plt.show()

    # All LDA topics first, then all NMF topics
    for model_label, model_topics in (('LDA', lda_topics), ('NMF', nmf_topics)):
        for topic_no, words in enumerate(model_topics, start=1):
            display_wordcloud(words, f'{model_label} Topic {topic_no}')

except ImportError:
    print("WordCloud package not installed, skipping topic visualization")
    print("You can install it using: pip install wordcloud")


In [None]:
# List each topic's words with a placeholder for the name; the actual naming
# is left to a human reader
print("\nTopic Interpretation and Suggested Names:")
for heading, model_topics in (("\nLDA Topics:", lda_topics), ("\nNMF Topics:", nmf_topics)):
    print(heading)
    for topic_no, topic_words in enumerate(model_topics, start=1):
        print(f"Topic {topic_no}: {', '.join(topic_words)}")
        print("Suggested name: Requires human interpretation based on word patterns\n")
