In [2]:
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

# sw_indo = stopwords.words('indonesian') + list(punctuation)
sw_english = stopwords.words('english') + list(punctuation)

# Import Dataset

In [3]:
df = pd.read_csv("abcnews-date-text.csv")
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


# Extraxt BoW

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
bow = CountVectorizer(ngram_range=(1, 2), tokenizer=word_tokenize, stop_words=sw_english, min_df=5)

In [6]:
bow_matrix = bow.fit_transform(df.headline_text)



In [7]:
print(bow_matrix)

  (0, 3293)	1
  (0, 45919)	1
  (0, 36237)	1
  (0, 23530)	1
  (0, 95586)	1
  (1, 4728)	1
  (1, 62043)	1
  (1, 179552)	1
  (1, 108609)	1
  (1, 14368)	1
  (1, 46147)	1
  (1, 4793)	1
  (2, 67689)	1
  (2, 26400)	1
  (2, 83656)	1
  (2, 128102)	1
  (2, 156585)	1
  (2, 67695)	1
  (2, 26564)	1
  (3, 6953)	1
  (3, 114290)	1
  (3, 152785)	1
  (3, 12708)	1
  (3, 155374)	1
  (3, 119513)	1
  :	:
  (1226254, 40936)	1
  (1226255, 136269)	1
  (1226255, 110592)	1
  (1226255, 181999)	1
  (1226255, 102352)	1
  (1226255, 172941)	1
  (1226255, 56227)	1
  (1226255, 111771)	1
  (1226255, 182021)	1
  (1226255, 38806)	1
  (1226255, 38938)	1
  (1226255, 172946)	1
  (1226256, 49687)	1
  (1226256, 95653)	1
  (1226256, 8429)	1
  (1226256, 95934)	1
  (1226256, 177822)	1
  (1226256, 95697)	1
  (1226256, 40935)	1
  (1226257, 27146)	1
  (1226257, 180076)	1
  (1226257, 147541)	1
  (1226257, 169204)	1
  (1226257, 118187)	1
  (1226257, 180124)	1


# Topic Modelling

In [8]:
vocab = bow.get_feature_names()

# LSA dan LDA

## LSA (Latent Semantic Analysis)

In [9]:
len(vocab)

182816

In [10]:
from sklearn.decomposition import TruncatedSVD

In [11]:
lsa = TruncatedSVD(n_components=10, n_iter=100, random_state=42)

In [12]:
lsa_matrix = lsa.fit_transform(bow_matrix)

In [13]:
print(bow_matrix.shape) # hidden
print(lsa_matrix.shape) # weight / code
print(lsa.components_.shape) # fitur /topic

(1226258, 182816)
(1226258, 10)
(10, 182816)


In [15]:
print(lsa_matrix)

[[ 8.48437429e-03  9.56964221e-03  3.41831672e-04 ... -1.13983311e-03
   1.03796584e-02  3.80227737e-03]
 [ 6.67036603e-02  3.61800403e-02  1.43830034e-02 ...  3.22090553e-03
  -6.46245802e-02  4.30760903e-01]
 [ 1.79636421e-02  2.34071709e-02 -1.28003859e-03 ... -1.69362921e-03
   9.57799610e-03 -1.48603256e-03]
 ...
 [ 1.81207967e-01  1.01907853e+00 -7.12200415e-02 ...  4.88163326e-02
  -4.24886917e-02  2.60384679e-02]
 [ 1.61566892e-02  2.90549849e-02  1.02774022e-02 ...  3.28668657e-02
  -9.85355729e-03  1.34479505e-02]
 [ 1.68677241e-02  1.40403422e-02  1.32449905e-02 ...  2.12040440e-02
  -1.53930157e-03  9.84089167e-03]]


## Function Get Topic

In [16]:
def get_topic(model):
  return [[vocab[idx] for idx in reversed(comp.argsort()[-6:]) if vocab[idx].isalnum()] for comp in model.components_]

In [17]:
get_topic(lsa)

[['police', 'man', 'new', 'charged', 'court', 'murder'],
 ['new', 'says', 'australia', 'council', 'govt', 'zealand'],
 ['man', 'charged', 'court', 'murder', 'accused'],
 ['says', 'us', 'govt', 'australia', 'council', 'nsw'],
 ['us', 'court', 'australia', 'govt', 'nsw', 'fire'],
 ['us', 'says', 'new', 'man', 'police', 'iraq'],
 ['court', 'accused', 'face', 'says', 'murder', 'told'],
 ['australia', 'day', 'world', 'cup', 'south'],
 ['council', 'plan', 'water', 'coast', 'gold'],
 ['fire', 'nsw', 'coast', 'crash', 'sydney', 'gold']]

## LDA (Latent Dirichlet Allocation)

In [18]:
from sklearn.decomposition import LatentDirichletAllocation

In [19]:
lda = LatentDirichletAllocation(n_components=10, max_iter=100, random_state=42)

In [20]:
lda_matrix = lda.fit_transform(bow_matrix)

KeyboardInterrupt: 

In [21]:
get_topic(lda)

[['australia', 'day', 'new', 'win', 'china', 'first'],
 ['police', 'coast', 'fire', 'two', 'car', 'gold'],
 ['new', 'minister', 'school', 'world', 'change', 'cup'],
 ['us', 'found', 'missing', 'attack', 'search', 'iraq'],
 ['interview', 'back', 'probe', 'may', 'talks', 'fight'],
 ['government', 'open', 'port', 'country', 'final', 'test'],
 ['water', 'call', 'child', 'sex', 'rural', 'media'],
 ['govt', 'council', 'plan', 'election', 'urged', 'nsw'],
 ['man', 'court', 'crash', 'murder', 'charged', 'dies'],
 ['police', 'year', 'indigenous', 'centre', 'road', 'new']]