In [2]:
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

# Indonesian variant of the stop list, kept for reference:
# sw_indo = stopwords.words('indonesian') + list(punctuation)

# English stop words followed by every character of string.punctuation,
# flattened into a single list.
sw_english = [*stopwords.words('english'), *punctuation]

# Import Dataset

In [3]:
# Load the headlines CSV; the relative path means the file must sit in the
# notebook's working directory.
df = pd.read_csv("abcnews-date-text.csv")
# Preview the first rows to confirm the columns used later (headline_text).
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


# Extract BoW

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Bag-of-words vectorizer:
#   - tokenizes with NLTK's word_tokenize,
#   - removes the English stop words / punctuation collected in sw_english,
#   - counts unigrams and bigrams,
#   - min_df=5 keeps only terms found in at least 5 documents.
bow = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=sw_english,
    ngram_range=(1, 2),
    min_df=5,
)

In [8]:
# Learn the vocabulary from the headlines and build the document-term matrix.
bow_matrix = bow.fit_transform(df["headline_text"])



# Topic Modelling

In [9]:
# Vocabulary terms in column order of bow_matrix.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on releases that predate it. The result is converted to a plain
# list so `vocab` keeps the same type on every version.
if hasattr(bow, "get_feature_names_out"):
    vocab = bow.get_feature_names_out().tolist()
else:
    vocab = bow.get_feature_names()

# LSA and LDA

## LSA (Latent Semantic Analysis)

In [10]:
# Vocabulary size: number of unigram/bigram features kept by the vectorizer.
len(vocab)

182816

In [11]:
from sklearn.decomposition import TruncatedSVD

In [12]:
# LSA model: truncated SVD with 10 components; random_state is fixed so the
# randomized solver gives repeatable results.
lsa = TruncatedSVD(
    n_components=10,
    n_iter=100,
    random_state=42,
)

In [13]:
# Fit the SVD on the document-term matrix and project every document onto the
# 10 components in a single call.
lsa_matrix = lsa.fit_transform(bow_matrix)

In [14]:
# Shapes, one per line:
#   bow_matrix      -> (documents, vocabulary terms)   input counts
#   lsa_matrix      -> (documents, components)         per-document weights / code
#   lsa.components_ -> (components, vocabulary terms)  per-topic term features
print(
    bow_matrix.shape,
    lsa_matrix.shape,
    lsa.components_.shape,
    sep="\n",
)

(1226258, 182816)
(1226258, 10)
(10, 182816)


## Function Get Topic

In [18]:
def get_topic(model, n_words=6, feature_names=None):
  """Return the top single-token terms for every topic of a fitted model.

  For each row of ``model.components_`` the ``n_words`` largest weights are
  selected (highest weight first) and mapped to their terms. Terms for which
  ``str.isalnum()`` is False (for example anything containing a space or
  punctuation) are dropped *after* that selection, so a topic can contain
  fewer than ``n_words`` entries.

  Parameters
  ----------
  model : object
      Fitted decomposition model exposing ``components_`` as a 2-D array
      with one row per topic and one column per term.
  n_words : int, default 6
      How many highest-weighted terms to consider per topic.
  feature_names : sequence of str, optional
      Term for each column of ``components_``. When omitted, the
      notebook-level ``vocab`` is used.

  Returns
  -------
  list of list of str
      One list of terms per topic, ordered by decreasing weight.

  Raises
  ------
  ValueError
      If ``n_words`` is negative.
  """
  if n_words < 0:
    raise ValueError("n_words must be non-negative")

  # Fall back to the notebook-level vocabulary only when none was supplied.
  terms = vocab if feature_names is None else feature_names

  topics = []
  for comp in model.components_:
    # Reverse the ascending argsort, then slice: same ordering as taking the
    # last n entries and reversing them, but also correct for n_words == 0.
    top_idx = comp.argsort()[::-1][:n_words]
    topics.append([terms[idx] for idx in top_idx if terms[idx].isalnum()])
  return topics

In [19]:
# Top-weighted terms for each of the LSA components.
get_topic(lsa)

[['police', 'man', 'new', 'charged', 'court', 'murder'],
 ['new', 'says', 'australia', 'council', 'govt', 'zealand'],
 ['man', 'charged', 'court', 'murder', 'accused'],
 ['says', 'us', 'govt', 'australia', 'council', 'nsw'],
 ['us', 'court', 'australia', 'govt', 'nsw', 'fire'],
 ['us', 'says', 'new', 'man', 'police', 'iraq'],
 ['court', 'accused', 'face', 'says', 'murder', 'told'],
 ['australia', 'day', 'world', 'cup', 'south'],
 ['council', 'plan', 'water', 'coast', 'gold'],
 ['fire', 'nsw', 'coast', 'crash', 'sydney', 'gold']]

## LDA (Latent Dirichlet Allocation)

In [20]:
from sklearn.decomposition import LatentDirichletAllocation

In [21]:
# LDA model with 10 topics; random_state is fixed for repeatable results.
# max_iter=100 passes over the full corpus can take a long time to fit.
lda = LatentDirichletAllocation(
    n_components=10,
    max_iter=100,
    random_state=42,
)

In [22]:
# Fit LDA on the document-term matrix and get the per-document topic weights.
# NOTE: this line previously started with "!", which makes IPython send it to
# the system shell instead of running it as Python, so the model was never
# fitted by this cell.
lda_matrix = lda.fit_transform(bow_matrix)

KeyboardInterrupt: 

In [23]:
# Top-weighted terms for each LDA topic (requires `lda` to have been fitted).
get_topic(lda)

[['australia', 'new', 'win', 'day', 'china', 'first'],
 ['police', 'coast', 'fire', 'two', 'car', 'gold'],
 ['new', 'minister', 'world', 'school', 'cup', 'change'],
 ['us', 'found', 'missing', 'attack', 'search', 'trial'],
 ['interview', 'back', 'probe', 'may', 'talks', 'fire'],
 ['government', 'port', 'country', 'test', 'final', 'tour'],
 ['water', 'child', 'call', 'sex', 'rural', 'charges'],
 ['govt', 'council', 'election', 'nsw', 'plan', 'labor'],
 ['man', 'court', 'crash', 'murder', 'charged', 'says'],
 ['police', 'year', 'new', 'centre', 'indigenous', 'road']]