### Exercise 1

In [34]:
from gensim import corpora, models
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import pandas as pd


In [35]:
# Toy corpus for the topic model: five short news headlines.
# The first two are about tennis, the last three about Biden and the virus.
documents = [
    "Rafael Nadal Joins Roger Federer in Missing U.S. Open",
    "Rafael Nadal Is Out of the Australian Open",
    "Biden Announces Virus Measures",
    "Biden's Virus Plans Meet Reality",
    "Where Biden's Virus Plan Stands"
]


In [36]:
# English stop words, stored as a set so membership checks in preprocess() are cheap.
# NOTE(review): assumes the required NLTK data (stopwords, WordNet, tokenizer
# models) is already downloaded -- nothing visible here fetches it; confirm.
stop_words = set(stopwords.words('english'))
# Shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

def preprocess(doc):
    """Turn one raw document string into a list of cleaned, lemmatized tokens.

    The text is lower-cased and tokenized with ``word_tokenize``. A token is
    kept only if it is purely alphanumeric and is not in ``stop_words``; any
    token containing punctuation is dropped. Surviving tokens are passed
    through ``lemmatizer.lemmatize`` with its default settings.

    Args:
        doc: The raw text of a single document.

    Returns:
        The list of processed tokens, in their original order.
    """
    cleaned = []
    for token in word_tokenize(doc.lower()):
        # Skip punctuation-bearing tokens and stop words before lemmatizing.
        if not token.isalnum() or token in stop_words:
            continue
        cleaned.append(lemmatizer.lemmatize(token))
    return cleaned

# Run every headline through preprocess(); the result keeps the order of `documents`.
pre_docs = list(map(preprocess, documents))
pre_docs

[['rafael', 'nadal', 'join', 'roger', 'federer', 'missing', 'open'],
 ['rafael', 'nadal', 'australian', 'open'],
 ['biden', 'announces', 'virus', 'measure'],
 ['biden', 'virus', 'plan', 'meet', 'reality'],
 ['biden', 'virus', 'plan', 'stand']]

In [37]:
# Vocabulary built from the tokenized documents.
dictionary = corpora.Dictionary(documents=pre_docs)
# Bag-of-words encoding of each tokenized document, in the same order as pre_docs.
corpus = [dictionary.doc2bow(tokens) for tokens in pre_docs]



In [38]:
# LDA training is stochastic: without a fixed seed the topic ids (0/1) and the
# term weights can differ on every run, so the seed is pinned to keep the
# notebook reproducible under Restart & Run All.
lda_model = models.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=10,
    random_state=42,
)


In [39]:
# Label each article with the topic that has the highest inferred probability.
# `corpus` already holds the bag-of-words vector of every document (built from
# pre_docs in the same order), so it is reused instead of calling doc2bow again.
article_labels = []

for bow in corpus:
    # Each entry in `topics` is indexed as (topic_id, probability) below.
    topics = lda_model.get_document_topics(bow)
    dominant_topic = max(topics, key=lambda pair: pair[1])[0]
    article_labels.append(dominant_topic)

In [40]:
# Pair every original headline with the id of its dominant topic.
df = pd.DataFrame(
    {
        "Article": documents,
        "Topic": article_labels,
    }
)

print("Table with Articles and Topic:")
# The extra newline reproduces the blank line printed after the table.
print(df, end="\n\n")

Table with Articles and Topic:
                                             Article  Topic
0  Rafael Nadal Joins Roger Federer in Missing U....      1
1         Rafael Nadal Is Out of the Australian Open      1
2                     Biden Announces Virus Measures      0
3                   Biden's Virus Plans Meet Reality      0
4                    Where Biden's Virus Plan Stands      0



In [41]:
print("Top Terms for Each Topic:")

# print_topics() supplies the one-line summary string for each topic.
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}: {topic}")
    # Read (word, weight) pairs from the structured API rather than splitting
    # the formatted summary string on "+" and "*", which would break on any
    # token containing those characters. topn=10 matches print_topics()'s
    # default number of words; the quoting and 3-decimal formatting mirror the
    # summary string so the printed lines are unchanged.
    for word, weight in lda_model.show_topic(idx, topn=10):
        print(f'- "{word}" (weight: {weight:.3f})')
    print()


Top Terms for Each Topic:
Topic 0: 0.166*"biden" + 0.166*"virus" + 0.119*"plan" + 0.071*"reality" + 0.071*"meet" + 0.071*"announces" + 0.071*"measure" + 0.071*"stand" + 0.024*"open" + 0.024*"nadal"
- "biden" (weight: 0.166)
- "virus" (weight: 0.166)
- "plan" (weight: 0.119)
- "reality" (weight: 0.071)
- "meet" (weight: 0.071)
- "announces" (weight: 0.071)
- "measure" (weight: 0.071)
- "stand" (weight: 0.071)
- "open" (weight: 0.024)
- "nadal" (weight: 0.024)

Topic 1: 0.131*"rafael" + 0.131*"nadal" + 0.131*"open" + 0.079*"australian" + 0.079*"missing" + 0.079*"roger" + 0.079*"federer" + 0.079*"join" + 0.027*"virus" + 0.027*"plan"
- "rafael" (weight: 0.131)
- "nadal" (weight: 0.131)
- "open" (weight: 0.131)
- "australian" (weight: 0.079)
- "missing" (weight: 0.079)
- "roger" (weight: 0.079)
- "federer" (weight: 0.079)
- "join" (weight: 0.079)
- "virus" (weight: 0.027)
- "plan" (weight: 0.027)

