In [4]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim import corpora
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

# pathlib is used to build the path to the data file
from pathlib import Path

### Set up the path to the data file ###

In [5]:
# Set up the path to the data file.
cwd = Path.cwd()

# Machine-specific location used originally; kept as the fallback so an
# existing setup that relies on it keeps working unchanged.
fallback_data_file = Path(r'C:\Users\mpede\source\repos\mpeder75\The Ai engineer course\utils\news_articles.csv')


def find_data_file(start, relative=Path('utils') / 'news_articles.csv', fallback=fallback_data_file):
    """Locate the data file relative to ``start`` or one of its parent directories.

    Args:
        start: Directory (``pathlib.Path``) where the search begins.
        relative: Path of the data file relative to each directory probed.
        fallback: Path returned when no candidate exists on disk.

    Returns:
        The first existing ``base / relative`` found while walking from
        ``start`` up through its parents, otherwise ``fallback``.
    """
    for base in (start, *start.parents):
        candidate = base / relative
        if candidate.is_file():
            return candidate
    return fallback


# Prefer a copy found relative to the working directory so the notebook also
# runs on machines where the absolute fallback path does not exist.
data_file = find_data_file(cwd)

### Load data ###

In [6]:
# Read the CSV at data_file into the DataFrame `data`.
data = pd.read_csv(filepath_or_buffer=data_file)

### Clean Data ###


In [7]:
# Work on the 'content' column only: lower-case it first, then drop every
# character that is neither a word character nor whitespace.
content_lower = data['content'].str.lower()
articles = content_lower.map(lambda text: re.sub(r"([^\w\s])", "", text))

In [8]:
# Stop word removal.
en_stopwords = stopwords.words('english')
# Membership tests against a set are O(1); testing against the list would scan
# it linearly for every word of every article. Results are identical.
en_stopword_set = set(en_stopwords)
# NOTE(review): if punctuation has already been stripped from the articles,
# apostrophe-containing stop words (e.g. "don't") presumably no longer match
# their stripped form ("dont") -- confirm whether that matters here.
articles = articles.apply(
    lambda text: ' '.join(word for word in text.split() if word not in en_stopword_set)
)

In [9]:
# Split each article string into a list of tokens.
articles = articles.map(word_tokenize)

In [10]:
# Reduce every token to its Porter stem (chosen for speed given the amount of text).
ps = PorterStemmer()
articles = articles.apply(lambda tokens: list(map(ps.stem, tokens)))

### Vectorization ###

In [11]:
# Build the token dictionary from all stemmed articles and show its summary.
dictionary = corpora.Dictionary(documents=articles)
print(dictionary)

Dictionary<8693 unique tokens: ['10', '100', '108', '15', '155']...>


In [13]:
# Vectorize each article as a bag of words, giving the document-term matrix.
doc_term = list(map(dictionary.doc2bow, articles))

### LSA Model ###

In [14]:
# Number of topics; passed to the LSA model and to its topic printout below.
num_topics = 2

In [15]:
# Fit the LSA (LSI) model on the document-term matrix, then print the five
# highest-weighted words of each topic.
lsamodel = LsiModel(corpus=doc_term, id2word=dictionary, num_topics=num_topics)
print(lsamodel.print_topics(num_words=5, num_topics=num_topics))

[(0, '0.615*"mr" + 0.429*"said" + 0.187*"trump" + 0.130*"state" + 0.119*"would"'), (1, '-0.537*"mr" + -0.319*"trump" + 0.286*"said" + 0.242*"saudi" + 0.142*"weight"')]
