### Import ###

In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import gensim
import gensim.corpora as corpora

# Pathlib bruges til at opsætte sti til fil
from pathlib import Path

### Path til filsti opsættes ###

In [4]:
# Path til filsti opsættes
cwd = Path.cwd()
data_file = Path(r'/utils/news_articles.csv')

### Load data ###

In [6]:
# Data indklæses i variabel bbc_data
data = pd.read_csv(data_file)

In [7]:
data.head()

Unnamed: 0,id,title,content
0,25626,"One Weight-Loss Approach Fits All? No, Not Eve...","Dr. Frank Sacks, a professor of nutrition at H..."
1,19551,South Carolina Stuns Baylor to Reach the Round...,South Carolina’s win over Duke was not only ...
2,25221,"U.S. Presidential Race, Apple, Gene Wilder: Yo...",(Want to get this briefing by email? Here’s th...
3,18026,"His Predecessor Gone, Gambia’s New President F...","BANJUL, Gambia — A week after he was inaugu..."
4,21063,‘Harry Potter and the Cursed Child’ Goes From ...,The biggest book of the summer isn’t a blockbu...


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       100 non-null    int64 
 1   title    100 non-null    object
 2   content  100 non-null    object
dtypes: int64(1), object(2)
memory usage: 2.5+ KB


### Clean data ###

##### Content kolonne #####

In [10]:
# Gemmeer 'Content' kolonne i artioles variabel
articles = data['content']

# 1.lowercase og punctation removal
articles = data['content'].str.lower().apply(lambda x: re.sub(r"([^\w\s])", "", x))

# 2.stop word removal
en_stopwords = stopwords.words('english')
articles = articles.apply(lambda x: ' '.join([word for word in x.split() if word not in (en_stopwords)]))

# 3.tokenize
articles = articles.apply(lambda x: word_tokenize(x))

# 4. stemming (done for speed as we have a lot of text)
ps = PorterStemmer()
articles = articles.apply(lambda tokens: [ps.stem(token) for token in tokens])

In [11]:
articles

0     [dr, frank, sack, professor, nutrit, harvard, ...
1     [south, carolina, win, duke, surpris, fan, pos...
2     [want, get, brief, email, here, good, even, he...
3     [banjul, gambia, week, inaugur, anoth, countri...
4     [biggest, book, summer, isnt, blockbust, thril...
                            ...                        
95    [want, get, brief, email, here, good, even, he...
96    [tallinn, estonia, guard, brought, ahm, abdul,...
97    [gov, scott, walker, wisconsin, activ, wiscons...
98    [social, media, shook, emot, headlin, shout, n...
99    [moment, joanna, acevedo, first, set, foot, bo...
Name: content, Length: 100, dtype: object

### Vectorization ###

In [12]:
# Opret dictonary med alle ord i artiklerne
dictionary = corpora.Dictionary(articles)
print(dictionary)

Dictionary<8693 unique tokens: ['10', '100', '108', '15', '155']...>


In [13]:
# vecotize using bag of words into a document term matrix
doc_term = [dictionary.doc2bow(text) for text in articles]

### LDA Model ###

In [15]:
# specify number of topics
num_topics = 2

In [16]:
# create LDA model
lda_model = gensim.models.LdaModel(corpus=doc_term,
                                   id2word=dictionary,
                                   num_topics=num_topics)

In [17]:
lda_model.print_topics(num_topics=num_topics, num_words=5)

[(0,
  '0.019*"mr" + 0.015*"said" + 0.006*"trump" + 0.004*"one" + 0.004*"would"'),
 (1,
  '0.015*"mr" + 0.014*"said" + 0.005*"trump" + 0.005*"year" + 0.005*"would"')]