# Topic Modelling

## Using Latent Dirichlet Allocation

In [1]:
import pandas as pd

In [4]:
# Load the NPR articles corpus; the path is relative to the current working directory.
npr_csv_path = './UPDATED_NLP_COURSE/05-Topic-Modeling/npr.csv'
npr = pd.read_csv(npr_csv_path)

In [5]:
# Preview the first five rows of the corpus.
npr.head(n=5)

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [6]:
# Read one full article (row label 4000) to see what the raw text looks like.
npr.loc[4000, 'Article']

'The headline shocked the   world of the surface Navy: Seven sailors aboard the destroyer USS Fitzgerald were killed, and other crew members injured, when the warship collided with a cargo vessel off Japan. As the Navy family grieves, both it and the wider world are asking the same question: How did this happen? The short answer is that no one knows  —   yet. Official inquiries into what led up to the encounter could take months or more. The Navy and the U. S. Coast Guard both likely will eventually issue reports that describe what happened and could make recommendations for preventing another such accident. ”I will not speculate on how long these investigations will last,” said Vice Adm. Joseph Aucoin, commander of the Navy’s 7th Fleet. The Fitzgerald and the other ships of Destroyer Squadron 15, based outside Tokyo, fall under his authority. There are clues, however, that explain how something like the Fitzgerald’s collision could happen, including photographs of the ships involved, 

In [7]:
## Preprocessing

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Bag-of-words vectorizer for the topic model:
#   stop_words='english' -> drop scikit-learn's built-in English stop words
#   min_df=2             -> drop terms that appear in fewer than 2 documents
#   max_df=0.9           -> drop terms that appear in more than 90% of the documents
cv = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.9,
)

In [10]:
# Learn the vocabulary and build the sparse document-term matrix in one call.
dtm = cv.fit_transform(raw_documents=npr['Article'])

In [11]:
from sklearn.decomposition import LatentDirichletAllocation

In [13]:
# Seven topics, a fixed seed so the fit is repeatable, and n_jobs=-1 to use all CPU cores.
LDA = LatentDirichletAllocation(
    n_components=7,
    n_jobs=-1,
    random_state=42,
)

In [14]:
# Fit the LDA model to the document-term matrix.
LDA.fit(dtm)

LatentDirichletAllocation(n_components=7, n_jobs=-1, random_state=42)

In [15]:
# Grab the vocabulary of words

# Grab the topics

# Grab the highest probability words per topic

In [17]:
# Size of the learned vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
len(cv.get_feature_names_out())

54777

In [20]:
# Grab the vocabulary

In [21]:
import random

# Fetch the vocabulary once (get_feature_names() was removed in scikit-learn 1.2).
feature_names = cv.get_feature_names_out()

# Pick a valid vocabulary index. random.randint includes BOTH endpoints, so
# randint(0, 54777) could return 54777 -- one past the last valid index of a
# 54777-word vocabulary -- and raise IndexError. randrange excludes the upper
# bound, and using len() avoids hard-coding the vocabulary size.
random_word_id = random.randrange(len(feature_names))

feature_names[random_word_id]

'astrazeneca'

In [22]:
# Grab the topics

In [23]:
# Number of topics: components_ has one row per topic.
LDA.components_.shape[0]

7

In [24]:
# (number of topics, vocabulary size)
LDA.components_.shape

(7, 54777)

In [25]:
# Word weights for the first topic (row 0 of components_).
single_topic = LDA.components_[0]

In [27]:
# argsort returns the index positions that would sort the values from least to greatest
single_topic.argsort()

array([ 2475, 18302, 35285, ..., 36283, 42561, 42993])

In [28]:

# The last 10 entries of argsort() are the indices of the 10 largest weights
single_topic.argsort()[-10:]

array([33390, 36310, 21228,  8149, 31464, 10425, 22673, 36283, 42561,
       42993])

In [32]:
# Indices of the 20 highest-weighted words in this topic, ordered from lowest to highest weight.
top_twenty_words = single_topic.argsort()[-20:]

In [33]:
# Fetch the vocabulary once instead of rebuilding it on every loop iteration.
# (get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.)
feature_names = cv.get_feature_names_out()

# Print the top words for the topic, from lowest to highest weight.
for index in top_twenty_words:
    print(feature_names[index])

president
state
tax
trump
insurance
companies
money
year
federal
000
new
percent
government
care
million
company
health
people
said
says


In [34]:
# Grab the highest probability words per topic

In [35]:
# Fetch the vocabulary once; the original rebuilt the full ~55k-word list for
# every index of every topic. (get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement.)
feature_names = cv.get_feature_names_out()

for i, topic in enumerate(LDA.components_):
    # argsort()[-15:] gives the indices of the 15 largest weights for this topic.
    top_word_indices = topic.argsort()[-15:]
    print(f"THE TOP 15 WORDS FOR THE TOPIC #{i}")
    print([feature_names[index] for index in top_word_indices])
    print('\n')
    print('\n')

THE TOP 15 WORDS FOR THE TOPIC #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'care', 'million', 'company', 'health', 'people', 'said', 'says']




THE TOP 15 WORDS FOR THE TOPIC #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']




THE TOP 15 WORDS FOR THE TOPIC #2
['way', 'world', 'family', 'home', 'day', 'water', 'time', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']




THE TOP 15 WORDS FOR THE TOPIC #3
['care', 'don', 'years', 'new', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']




THE TOP 15 WORDS FOR THE TOPIC #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']




THE TOP 15 WORDS FOR THE TOPIC #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 

In [36]:
# Inspect the sparse document-term matrix (documents x vocabulary terms).
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [37]:
# Display the articles DataFrame (pandas truncates long frames when rendering).
npr

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."
...,...
11987,The number of law enforcement officers shot an...
11988,"Trump is busy these days with victory tours,..."
11989,It’s always interesting for the Goats and Soda...
11990,The election of Donald Trump was a surprise to...


In [38]:
# Per-document topic scores: one row per article, one column per topic.
topic_results = LDA.transform(dtm)

In [41]:
# Index of the highest-scoring topic for the first article.
topic_results[0].argmax()

1

In [42]:
# Label every article with its highest-scoring topic (argmax across each row).
# Note: this adds a column to `npr` in place.
npr['Topic'] = topic_results.argmax(axis = 1)

In [43]:
# Preview the first five articles together with their assigned topic.
npr.head(n=5)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
