In [1]:
import pandas as pd 
# Load the NPR articles from the local CSV file into a DataFrame.
npr = pd.read_csv('npr.csv')
# Preview the first five rows.
npr.head(5)

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [3]:
# Show the full text of the article stored under index label 4.
npr['Article'][4]

'From photography, illustration and video, to data visualizations and immersive experiences, visuals are an important part of our storytelling at NPR. Interwoven with the written and the spoken word, images  —   another visual language  —   can create deeper understanding and empathy for the struggles and triumphs we face together. We told a lot of stories in 2016  —   far more than we can list here. So, instead, here’s a small selection of our favorite pieces, highlighting some of the work we’re most proud of, some of the biggest stories we reported, and some of the stories we had the most fun telling. Transport yourself to Rocky Mountain National Park, with all its sights and sounds, in an immersive geology lesson with Oregon State University geology professor Eric Kirby, who discusses the geologic history of the Rockies in   video. ”Today, Indians use much less energy per person than Americans or Chinese people. Many of its 1. 2   population live on roughly $2 a day. But what if all

In [4]:
# Number of articles in the dataset.
len(npr['Article'])

11992

## Let's assign topics to each of these documents

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Bag-of-words vectorizer used to build the document-term matrix.
cv = CountVectorizer(
    max_df=0.9,            # ignore terms found in more than 90% of the documents
    min_df=2,              # ignore terms found in fewer than 2 documents
    stop_words='english',  # drop common English stop words
)

## max_df=0.9: discard words that appear in more than 90% of the documents
## min_df=2: a word has to appear in at least two documents to be kept
## stop_words='english': remove common English stop words



##  Make Document Term Matrix

In [8]:
# Learn the vocabulary from the articles and build the document-term matrix
# (one row per article, one column per vocabulary term).
dtm = cv.fit_transform(npr['Article'])

In [9]:
from sklearn.decomposition import LatentDirichletAllocation

In [10]:
LDA = LatentDirichletAllocation(random_state=42,n_components=7)   ### n_components is the number of topics to learn; random_state fixes the seed so the fit is reproducible

In [11]:
# Fit the LDA topic model to the document-term matrix.
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=7, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

## Grab the vocab of words

In [12]:
 len(cv.get_feature_names())  # number of distinct terms in the fitted vocabulary

54777

In [14]:
# The vocabulary comes back as a plain Python list.
type(cv.get_feature_names())

list

In [15]:
# Look up the vocabulary term stored at position 41000.
cv.get_feature_names()[41000]

'reproductive'

In [27]:
import random

# Spot-check the vocabulary by looking at one randomly chosen term.
# Fetch the list once so it can be both measured and indexed.
feature_names = cv.get_feature_names()

# random.randint includes its upper bound, so randint(0, len) could return an
# index one past the end of the list and raise IndexError. randrange excludes
# the upper bound and uses the real vocabulary size instead of a hard-coded one.
random_word_id = random.randrange(len(feature_names))

feature_names[random_word_id]

'pigment'

## Grab the topic

In [28]:
# One entry per topic, so this should match n_components.
len(LDA.components_)

7

In [29]:
# The topic-word weights are stored as a NumPy array.
type(LDA.components_)

numpy.ndarray

In [30]:
# Each row holds one topic's weight for every vocabulary term.
LDA.components_

array([[8.64332806e+00, 2.38014333e+03, 1.42900522e-01, ...,
        1.43006821e-01, 1.42902042e-01, 1.42861626e-01],
       [2.76191749e+01, 5.36394437e+02, 1.42857148e-01, ...,
        1.42861973e-01, 1.42857147e-01, 1.42906875e-01],
       [7.22783888e+00, 8.24033986e+02, 1.42857148e-01, ...,
        6.14236247e+00, 2.14061364e+00, 1.42923753e-01],
       ...,
       [3.11488651e+00, 3.50409655e+02, 1.42857147e-01, ...,
        1.42859912e-01, 1.42857146e-01, 1.42866614e-01],
       [4.61486388e+01, 5.14408600e+01, 3.14281373e+00, ...,
        1.43107628e-01, 1.43902481e-01, 2.14271779e+00],
       [4.93991422e-01, 4.18841042e+02, 1.42857151e-01, ...,
        1.42857146e-01, 1.43760101e-01, 1.42866201e-01]])

In [31]:
# Select the word weights of the first topic.
single_topic = LDA.components_[0]

In [33]:
single_topic.argsort()  ### argsort returns the indices that would sort the first topic's weights in ascending order

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993], dtype=int64)

In [38]:
top_words = single_topic.argsort()[-20:]   ## grab the indices of the 20 highest-weighted words in the first topic (ascending, so the top word is last)

In [40]:
# Fetch the vocabulary once; calling cv.get_feature_names() inside the loop
# rebuilds the whole list on every iteration.
feature_names = cv.get_feature_names()

# top_words is in ascending weight order, so the most important word prints last.
for index in top_words:
    print(feature_names[index])

president
state
tax
insurance
trump
companies
money
year
federal
000
new
percent
government
company
million
care
people
health
said
says


## Now we will get the top 20 words for each topic in the LDA components

In [42]:
# Build the vocabulary list once; calling cv.get_feature_names() inside the
# comprehension would rebuild it for every word of every topic.
feature_names = cv.get_feature_names()

for topic_index, topic in enumerate(LDA.components_):
    # The original f-string had no placeholder, so the topic number was never
    # printed and the word lists could not be told apart.
    print(f"The top 20 words for topic {topic_index}")
    # argsort is ascending, so the last 20 indices are the highest-weighted words.
    # A separate name (word_index) avoids shadowing the topic counter.
    print([feature_names[word_index] for word_index in topic.argsort()[-20:]])
    print('\n')
    print('\n')

The top 20 words for topic 
['president', 'state', 'tax', 'insurance', 'trump', 'companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']




The top 20 words for topic 
['white', 'according', 'attack', 'reported', 'war', 'military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']




The top 20 words for topic 
['little', 'know', 'don', 'year', 'make', 'way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']




The top 20 words for topic 
['world', 'research', 'university', 'percent', 'care', 'time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']




The top 20 words for topic 
['donald', 'political', 'states', 'law', 'just', 'voters', 'vote', 'election', 'party

## Assign the topic to the document

In [43]:
# Per-document topic distribution inferred by the fitted model.
topic_results = LDA.transform(dtm)

In [44]:
# One row per article, one column per topic.
topic_results.shape

(11992, 7)

In [45]:
# Topic probabilities for the first article, rounded to two decimals.
topic_results[0].round(2)

array([0.02, 0.68, 0.  , 0.  , 0.3 , 0.  , 0.  ])

In [46]:
# Index of the most probable topic for the first article.
topic_results[0].argmax()

1

In [47]:
# Label every article with its most probable topic (row-wise argmax).
npr['Topic']= topic_results.argmax(axis=1)

In [49]:
# Preview the first ten articles together with their assigned topic.
npr.head(10)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
5,I did not want to join yoga class. I hated tho...,3
6,With a who has publicly supported the debunk...,3
7,"I was standing by the airport exit, debating w...",2
8,"If movies were trying to be more realistic, pe...",3
9,"Eighteen years ago, on New Year’s Eve, David F...",2
