### Nickhil Tekwani || CS 6220 || Hw 5a

### Problem 1

In [2]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

# Load the 20NG dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Preprocess the text data
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
news_groups_tf = vectorizer.fit_transform(newsgroups.data)

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}:")
        print(" ".join([feature_names[i] + f"({topic[i]:.2f})"
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 20

# Run LDA and NMF
for K in [10, 20, 50]:
    # LDA
    lda_news_groups = LatentDirichletAllocation(n_components=K, max_iter=10, learning_method='online')
    lda_news_groups.fit(news_groups_tf)
    
    # NMF
    nmf_news_groups = NMF(n_components=K, random_state=1)
    nmf_news_groups.fit(news_groups_tf)

    # Print the top 20 words for each topic
    print(f"\nLDA Topics for 20 NewsGroups with K={K}:")
    display_topics(lda_news_groups, vectorizer.get_feature_names_out(), no_top_words)

    print(f"\nNMF Topics for 20 NewsGroups with K={K}:")
    display_topics(nmf_news_groups, vectorizer.get_feature_names_out(), no_top_words)

# Visually inspect the results for the 20NG dataset
# After the topics are printed, you'd need to visually inspect them and see if they align with the 20 Newsgroups categories.
print("\nOriginal 20 NewsGroups Labels:")
print(np.unique(newsgroups.target_names))



LDA Topics for 20 NewsGroups with K=10:
Topic 1:
bye(4.41) jimmy(2.89) motorcycling(2.70) straights(2.31) subscrive(2.19) mcelwaine(2.11) lars(2.03) pneumonia(1.85) gel(1.77) jokerit(1.54) acne(1.44) i486(1.43) vid(1.37) erythromycin(1.32) localhost(1.29) tappara(1.28) timucin(1.23) tps(1.18) mithras(1.17) vasomotor(1.07)
Topic 2:
_i_(2.19) sutcliffe(1.79) nutek(1.74) refrigerator(1.50) f550i(1.48) matchups(1.06) ists(0.92) stpl(0.65) dchhabra(0.65) deformed(0.61) magnifique(0.52) forecasting(0.48) jstmp(0.39) tco(0.33) internships(0.29) brener(0.29) 2650(0.28) murawski(0.28) mieux(0.28) soixante(0.28)
Topic 3:
nanao(5.23) hepatitis(2.97) sore(2.95) asap(2.85) suv(1.82) congruent(1.64) cph(1.56) doubledisk(1.54) maharishi(1.41) boiler(1.26) prognosis(1.26) 550i(1.24) frode(1.20) beamers(1.17) ornaments(1.12) acheive(1.11) er1(1.05) inguiry(1.05) eridan(1.05) chuvashia(1.05)
Topic 4:
ax(10.23) povray(4.38) yo(4.32) jb(4.03) netland(3.34) autodesk(3.21) s4(2.93) mn(2.79) lp(2.14) p2(2.1




LDA Topics for 20 NewsGroups with K=20:
Topic 1:
pmetzger(5.15) sorta(3.40) unsubstantiated(2.07) fud(1.50) divya(1.49) 1776(1.29) untrustworthy(1.14) cave(0.95) behavoir(0.92) _think_(0.84) unalienable(0.64) flattered(0.50) 1787(0.30) techie(0.05) cation(0.05) munching(0.05) vis(0.05) misinformations(0.05) kiwis(0.05) peddling(0.05)
Topic 2:
leaf(7.38) nanao(5.18) oort(3.69) sore(2.90) grbs(2.18) lars(1.99) semite(1.92) crd(1.91) oplinger(1.68) acetaminophen(1.57) cph(1.51) w4w(1.49) crts(1.39) isotropic(1.32) vid(1.32) tappara(1.23) prognosis(1.21) 550i(1.19) touchy(1.18) tps(1.13)
Topic 3:
iff(3.91) subscrive(2.14) rows(1.95) iges(1.82) folded(1.81) proteins(1.76) bitzm(1.67) dsu(1.67) informations(1.57) hijaak(1.57) nanoseconds(1.24) haston(0.96) newpaper(0.90) gorgeous(0.83) resurrecting(0.81) magitronic(0.60) abiogenesis(0.54) jamesc(0.45) 1024kb(0.39) disgustingly(0.23)
Topic 4:
compaq(4.18) sandberg(3.62) tournament(2.46) hypercard(2.27) straights(2.26) tasking(2.25) pagemaker




LDA Topics for 20 NewsGroups with K=50:
Topic 1:
sabbath(6.61) gal(1.43) transgression(1.12) sola(1.10) fenholt(0.94) scriptura(0.70) adma(0.60) reblled(0.60) cooperativley(0.60) musician(0.42) constructor(0.40) covenent(0.36) endorsing(0.02) suction(0.02) ceremonial(0.02) alcoholics(0.02) covenants(0.02) law(0.02) worship(0.02) headbanger(0.02)
Topic 2:
just(232.70) don(232.33) people(215.48) like(214.24) think(206.67) know(192.42) time(162.69) good(160.77) does(148.64) god(140.11) new(132.33) right(130.59) did(129.26) ve(128.65) way(127.05) make(126.89) say(125.87) really(116.48) want(113.08) use(110.44)
Topic 3:
sub(0.02) sorta(0.02) indian(0.02) eh(0.02) yeah(0.02) right(0.02) like(0.02) clarifies(0.02) hq(0.02) ambitious(0.02) compatible(0.02) microcontroller(0.02) fs2(0.02) sundry(0.02) hardplastic(0.02) 10012(0.02) robert_reichel(0.02) lucas(0.02) gvz(0.02) armageddon(0.02)
Topic 4:
70k(1.38) hams(1.07) locomotives(0.50) detectors(0.02) radar(0.02) detector(0.02) fusi(0.02) 609

god(3.95) believe(0.33) hell(0.27) faith(0.27) sin(0.25) lord(0.25) existence(0.23) love(0.23) bible(0.23) man(0.22) eternal(0.19) belief(0.19) heaven(0.17) satan(0.17) son(0.16) life(0.14) spirit(0.14) truth(0.14) exist(0.14) created(0.13)
Topic 4:
drive(4.96) hard(1.43) disk(1.40) drives(1.14) floppy(0.89) ide(0.65) controller(0.49) boot(0.41) meg(0.39) cd(0.36) bios(0.36) hd(0.35) slave(0.31) internal(0.31) seagate(0.31) external(0.30) rom(0.28) mb(0.27) tape(0.27) dos(0.26)
Topic 5:
government(4.88) law(1.82) right(1.60) rights(1.24) encryption(1.04) state(0.81) federal(0.78) private(0.74) clinton(0.73) president(0.71) constitution(0.70) court(0.66) citizens(0.66) privacy(0.63) amendment(0.62) public(0.53) enforcement(0.53) administration(0.52) legal(0.50) security(0.50)
Topic 6:
thanks(4.41) advance(1.72) hi(1.22) looking(0.82) help(0.73) info(0.69) appreciated(0.62) email(0.51) information(0.49) hello(0.42) appreciate(0.40) wondering(0.39) anybody(0.34) tell(0.31) greatly(0.30) n

### Problem 2

Note: only did this problem on 20G dataset. DUC not available online

In [9]:
from sklearn.datasets import fetch_20newsgroups

newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
data = newsgroups.data

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

def preprocess(text):
    sentences = sent_tokenize(text)
    words = [word_tokenize(sent) for sent in sentences]
    return sentences, words

from collections import defaultdict
import numpy as np

def word_frequencies(texts):
    freqs = defaultdict(int)
    for text in texts:
        for word in word_tokenize(text):
            freqs[word.lower()] += 1
    total_words = sum(freqs.values())
    for word in freqs:
        freqs[word] /= total_words
    return freqs

def KL_divergence(P, Q):
    return sum(P[word] * np.log(P[word] / Q[word]) for word in P if word in Q)

def KL_summary(text, n_sentences=5):
    sentences, tokenized_sentences = preprocess(text)
    PD = word_frequencies(sentences)
    
    summary = []
    for _ in range(n_sentences):
        remaining_sentences = list(set(sentences) - set(summary))
        
        # Check if we've run out of sentences to summarize
        if not remaining_sentences:
            break
        
        next_sentence = min(remaining_sentences, key=lambda s: KL_divergence(PD, word_frequencies([s])))
        summary.append(next_sentence)
    
    return ' '.join(summary)

from gensim.corpora import Dictionary
from gensim.models import LdaModel

def LDA_topics(texts, num_topics=10):
    # Only get the tokenized texts (a flat list of words for each text)
    tokenized_texts = [word_tokenize(text) for text in texts]
    
    dictionary = Dictionary(tokenized_texts)
    corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
    
    lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
    
    word_probs = defaultdict(float)
    for topic_id in range(num_topics):
        for word, prob in lda.show_topic(topic_id, topn=len(dictionary)):
            word_probs[word] += prob
    total = sum(word_probs.values())
    for word in word_probs:
        word_probs[word] /= total
    return word_probs


def LDA_summary(text, n_sentences=5):
    sentences, _ = preprocess(text)
    PD = LDA_topics(sentences)
    
    summary = []
    for _ in range(n_sentences):
        remaining_sentences = list(set(sentences) - set(summary))
        
        # Check if we've run out of sentences to summarize
        if not remaining_sentences:
            break
        
        next_sentence = min(remaining_sentences, key=lambda s: KL_divergence(PD, word_frequencies([s])))
        summary.append(next_sentence)
    
    return ' '.join(summary)


In [10]:
# Load the dataset
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
documents = newsgroups.data

# just a few documents for demo purposes
num_docs_to_summarize = 5

# Summarizing using KL based on word frequencies
print("Summarizing using KL based on word frequencies:")
for i in range(num_docs_to_summarize):
    print(f"\nDocument {i + 1} Original:")
    print(documents[i][:500] + "...")  # Printing just the beginning for brevity.
    print(f"\nDocument {i + 1} KL Summary:")
    print(KL_summary(documents[i]))
    print("="*80)

# Summarizing using KL based on LDA topics
print("\nSummarizing using KL based on LDA topics:")
for i in range(num_docs_to_summarize):
    print(f"\nDocument {i + 1} Original:")
    print(documents[i][:500] + "...")  # Printing just the beginning for brevity.
    print(f"\nDocument {i + 1} LDA Summary:")
    print(LDA_summary(documents[i]))
    print("="*80)


Summarizing using KL based on word frequencies:

Document 1 Original:


I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a...

Document 1 KL Summary:


I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game. Actually,
I am  bit puzzled too and a bit re