In [17]:
# Import and load the 20 Newsgroups dataset
# Save the data to the documents variable

from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

In [19]:
# View the first document in the dataset
# Indexing in Python starts at 0.
# To look at the 5th document, for example, put 4 in the square brackets

documents[4]

# You can also use a technique called slicing to view more than one document
# documents[3:6] will return the 4th, 5th and 6th documents.
# Jupyter limits how much it will display so there is no benefit to making the slices too large.

"Well, I will have to change the scoring on my playoff pool.  Unfortunately\nI don't have time right now, but I will certainly post the new scoring\nrules by tomorrow.  Does it matter?  No, you'll enter anyway!!!  Good!\n\n--\n    Keith Keller\t\t\t\tLET'S GO RANGERS!!!!!\n\t\t\t\t\t\tLET'S GO QUAKERS!!!!!\n\tkkeller@mail.sas.upenn.edu\t\tIVY LEAGUE CHAMPS!!!!"

In [20]:
# Import libraries for turning text to numeric vector representations
# It is essential to convert text in this way as machine learning algorithms are
# mathematically based and therefore rely on numerical input.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# This variable is used by the next two steps to control the number of features to be extracted from the text
# In general the more features in your model, the more rows of data you need to get accurate results
# The algorithm will pick the best features which basically means the words which appear most regularly
# If a word only appears once in the whole corpus it will not be much use for the model.
# Experiment with this value to see what effect it has. It can also be set to None if you don't want a maximum value
no_features = 1000

In [21]:
# This function creates a Term Frequency-Inverse Document Frequency (TF-IDF) matrix
# TF-IDF generates a weighting for each word in a document
# This weighting is based on how important the word is to the document itself,
# and how common that word is in the entire set of documents
# For example, an article about golf may have several mentions of the words: green; club; par
# but if it amongst a set of documents about gardening then the word green may have a lower weighting.
# The TfidfVectorizer function takes 4 parameters:
#    max_df = 0.5 means "ignore terms that appear in more than 50% of the documents".
#    min_df = 10 means "ignore terms that appear in fewer than 10 documents".
#    max_features was explained above
#    stop_words are common words such as: the;an;it; which appear so regularly that they just get in the way of the model.
#         This parameter will strip them from the text, and it just says to use a list of English stop words.
#         In theory TfIdf would weight them very low anyway so it shouldn't make much difference removing them.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')

# Convert the text in documents to TFIDF
tfidf = tfidf_vectorizer.fit_transform(documents)

# This will return a list of the unique words used as features. Particularly useful if max_features has been used
# because you can see which were retained.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()


In [22]:
# An alternative representation to TFIDF is to just count words. The parameters for this function are the same as before
# and in this case removing stop words will have much more of an effect.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

In [23]:
# Import two topic modelling functions
# Non-negative Matrix Factorization (NMF) and Latent Dirichlet Allocation (LDA)
# They both perform the same task but just used different methods to achieve it
# Depending on the source text one may be better than the other so just try both to see how they get on
from sklearn.decomposition import NMF, LatentDirichletAllocation

# This is the most important parameter for topic modelling. You have to tell the algorithm how many topics to find.
# For this tutorial 20 is the most appropriate value because we already know that the corpus has been categorised
# with that many topics. A good way to start would be to begin with 4 or 5 to get a nigh level view of the topics
# that it finds and then see what happens when you expand the value.
no_topics = 20

# From a practical point of view the main difference between the two functions (NMF and LDA) is that
# NMF uses the tfidf matrix as input whereas LDA requires a matrix of word counts.

# Run NMF - leave the other a parameters at their defaults
# This function basically takes the tfidf matrix and calculates the topics
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)


In [24]:

# Run LDA - this time passing the count matrix.
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)


In [25]:
# This function will return the top n words for each topic found by the model
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))



In [28]:
no_top_words = 10

# Display the topics for the NMF model
display_topics(nmf, tfidf_feature_names, no_top_words)


Topic 0:
people time right did good said say make way government
Topic 1:
window problem using server application screen display motif manager running
Topic 2:
god jesus bible christ faith believe christian christians sin church
Topic 3:
game team year games season players play hockey win league
Topic 4:
new 00 sale 10 price offer shipping condition 20 15
Topic 5:
thanks mail advance hi looking info help information address appreciated
Topic 6:
windows file files dos program version ftp ms directory running
Topic 7:
edu soon cs university ftp internet article email pub david
Topic 8:
key chip clipper encryption keys escrow government public algorithm nsa
Topic 9:
drive scsi drives hard disk ide floppy controller cd mac
Topic 10:
just ll thought tell oh little fine work wanted mean
Topic 11:
does know anybody mean work say doesn help exist program
Topic 12:
card video monitor cards drivers bus vga driver color memory
Topic 13:
like sounds looks look bike sound lot things really thing
To

In [27]:
# Display the topics for the LDA model
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
people gun state control right guns crime states law police
Topic 1:
time question book years did like don space answer just
Topic 2:
mr line rules science stephanopoulos title current define int yes
Topic 3:
key chip keys clipper encryption number des algorithm use bit
Topic 4:
edu com cs vs w7 cx mail uk 17 send
Topic 5:
use does window problem way used point different case value
Topic 6:
windows thanks know help db does dos problem like using
Topic 7:
bike water effect road design media dod paper like turn
Topic 8:
don just like think know people good ve going say
Topic 9:
car new price good power used air sale offer ground
Topic 10:
file available program edu ftp information files use image version
Topic 11:
ax max b8f g9v a86 145 pl 1d9 0t 34u
Topic 12:
government law privacy security legal encryption court fbi technology information
Topic 13:
card bit memory output video color data mode monitor 16
Topic 14:
drive scsi disk mac hard apple drives controller software port
T

In [43]:
# This code will show which is the most likely topic for each document
# The transform command creates a vector of probabilities for each topic per document
# The argmax function will just pick the value in the vector which is highest and return the number of the topic
# An alternative would be to allow a document to be represented by multiple topics - see next cell down
# The last two lines output the document number, most likely topic, and the first 200 characters of the document
doc_topic = lda.transform(tfidf)
for n in range(doc_topic.shape[0]):
    topic_most_pr = doc_topic[n].argmax()
    print("doc: {} topic: {}\n".format(n,topic_most_pr))
    print(documents[n].replace('\n',' ')[0:200] + '\n')

doc: 0 topic: 7

Well i'm not sure about the story nad it did seem biased. What I disagree with is your statement that the U.S. Media is out to ruin Israels reputation. That is rediculous. The U.S. media is the most p

doc: 1 topic: 15

       Yeah, do you expect people to read the FAQ, etc. and actually accept hard atheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out of steam!        Jim,  Sorry I can't pity yo

doc: 2 topic: 1

Although I realize that principle is not one of your strongest points, I would still like to know why do do not ask any question of this sort about the Arab countries.     If you want to continue this

doc: 3 topic: 3

Notwithstanding all the legitimate fuss about this proposal, how much of a change is it?  ATT's last product in this area (a) was priced over $1000, as I suspect 'clipper' phones will be; (b) came to 

doc: 4 topic: 2

Well, I will have to change the scoring on my playoff pool.  Unfortunately I don't have time right now, bu

 Unworthy of comment.   Nor would they have died if they had come out with their hands empty. That is undeniable truth.  My heart bleeds just as much as yours for  the children who were never released

doc: 932 topic: 1

Ron Roth recommends: "Once you have your hypoglycemia CONFIRMED through the                          proper channels, you might consider ther following:..."                         [diet omitted]  1) 

doc: 933 topic: 14

 The third-party media adapters are usually cheaper (at least in Toronto) than Apple's. I bought the adapters from Asante instead of Apple.   That's not true. Only the DECstation 5000/200 comes with a

doc: 934 topic: 9

 	That I did not do; however, the sample bolt I took to the store fit rather well in the following:  1/2" open end wrench, 1/2" box end wrench, 1/2" 12-point normal socket.  I take that as meaning it'

doc: 935 topic: 10

This may be an FAQ, but I dont know where to get the FAQ list! My OpenLook application has a few buttons. The firs



doc: 1931 topic: 6

Do you or does anyone you know have a wrecked 1981 or later R80(anything) or R100(anything) that they are interested in getting rid of?  I need a motor, but will buy a whole bike.  email replies to:	D

doc: 1932 topic: 6

     --  Gosh..I think I just installed a virus..It was called MS DOS6... Don't copy that floppy..BURN IT...I just love Windows...CRASH... 

doc: 1933 topic: 2

   Hm, do you think Dusseldorf fans would like it if their team joined the NHL? Or do we have to include Koln as well (Cologne to you Anglophiles) to make  them happy?:-)  

doc: 1934 topic: 10

See subject. An opportunity for sales-people (-persons? -entities?).  I am looking for a commercial/PD graphics editor with fairly limited abilities that runs under X and preferably uses Motif widgets

doc: 1935 topic: 9

Yes, there is:  consumer confusion.  In the early 80's with the fuel crisis, etc., everyone wanted better fuel mileage.  Diesel fuel was the cheapest fuel available and usually p

  For that matter, it shouldn't be that difficult to design a black box that gives off EMR similar to a monitor with gibberish on the screen....     

doc: 3035 topic: 9

I've had my Subaru Liberty 4WD station wagon for about 8 months now. Saying I'm happy with it would be an understatement!  Just great. Well built, handles beautifully, plenty of power. I've only had i

doc: 3036 topic: 16

Darryl Strawberry's moon shots were fun!  He can hit those high and far home runs that if he actually ran them out he'd be rounding second base by the time they landed.  We used to say that he should 

doc: 3037 topic: 1

 I had the same problem in my '90 MX-6. Luckily I had it fixed under warranty. I think they replaced a tail light gasket. Check with a dealer, it's a known problem. 

doc: 3038 topic: 0

What an anal retentive you are wimp.

doc: 3039 topic: 1

I hate to be rude, but screw the seating chart, post the stadium instead. 

doc: 3040 topic: 6

Hi netters! 	I often have troubles with my 

doc: 4071 topic: 19

 Oh, *really*???   I know that when working in Alberta, Ontario and Quebec, I was aware that I was paying for health insurance - e.g., in Toronto, OHIP fees were listed seperately on my pay stub.  Whi

doc: 4072 topic: 0

 The speed limit on commuter tracks in the northeast is 120MPH.  We already have something that resembles high speed rail in this country and it requires massive government subsidies.  We don't need a

doc: 4073 topic: 9

      Try graPHIGS from IBM... It is an excellent package! :^)  Doug  

doc: 4074 topic: 6

:     Help!! I need code/package/whatever to take 3-D data and turn it into : a wireframe surface with hidden lines removed. I'm using a DOS machine, and : the code can be in ANSI C or C++, ANSI Fortr

doc: 4075 topic: 10

..continuing on my build problems, I got stuck here build xterm...  gcc -fpcc-struct-return -o xterm main.o input.o charproc.o cursor.o util.o tabs.o  screen.o scrollbar.o button.o Tekproc.o misc.o  V

doc: 4076 topic: 1


If you could not tell which one had MSG, why restaurants bother to use it at all?   If you can taste the difference, psychological reaction might play a role.  The fact is, MSG is part of natural subs

doc: 5159 topic: 10

HELP, PROBLEM 486/33MHZ HANGS IN EXTENDED MODE TRYING TO ACCESS DRIVES A: OR B: , SOMETIMES IT WILL DO DIR , SOMETIMES WILL HANG ON ACCESS SOMETIMES WILL WHEN TYING A TEXT FILE.  HARDWARE: AMERICAN ME

doc: 5160 topic: 9

 	I agree that a fully-loaded SL2 would come close in price to a LOWER-END Ford Taurus.  A FULLY-LOADED Taurus, on the other hand, would still be substantially more expensive than even the most glitzy

doc: 5161 topic: 13

10).     A 256K DRAM chip is a 256 kilobit chip whereas a 256K SIMM is a 256 kilobyte memory module. The SIMM is a PCB with a 30 pin connector edge and on the SIMM are 8 256 kilobit DRAM chips (making

doc: 5162 topic: 5

                         *Paranoia part deleted.*

doc: 5163 topic: 8

Hello,  	I remember running across an 


Well thank you dennis for your as usual highly detailed and informative  posting.     The question i have about the proton, is  could it be  handled at one of KSC's spare pads, without major  malfunct

doc: 6245 topic: 10

SIGKIDS CALL FOR PARTICIPATION SIGKids Research Showcase is where learning is hip.  Pushing the edge in education, computer graphics, and new technologies, the SIGKids Research Showcase will provide S

doc: 6246 topic: 8

  nice theory.  too bad the MR2's never came with a four cylinder over 2.0 liters.  More like 1.6.  Or did they? were the nonturbo MR2II's  2.2 or some such?  I also understand that anyone using balan

doc: 6247 topic: 16

    This is known as the Savard syndrome - and we are talking Denis, not Serge. No team will ever win squat with the likes of Denis Savard in their lineup.   They could tell Savard to stay home and wa

doc: 6248 topic: 0



doc: 6249 topic: 1

  Yes, but in a fairly reproducible way. -40 is only a smidgen of the distance to absol

 I believe this is a just another of way of expressing the basic truth "All things were created by him and FOR him." (emphasis mine)  Col. 1:16 , Rev. 4:11. If you and I have been created for God, nat

doc: 7297 topic: 8

Here's an easy question for someone who knows nothing about baseball...     What city do the California Angels play out of?    --  Richard J. Rauser        "You have no idea what you're doing." rauser

doc: 7298 topic: 6

The real question here in my opinion is what Motorola processors running system 7 on a MAC are comparable to what Intel processors running Windows on a PC?  I recall there being a conversation here th

doc: 7299 topic: 15

  _Cycle World_ puts one out, but I'm sure it's not very objective.  Try talking with dealers and the people that hang out there, as well as us.  We love to give advice.   Most of the bigger banks hav

doc: 7300 topic: 1

TEXT 44                         bhogaisvarya-prasaktanam                            tayapahrta-cetasam         


...and in San Francisco recently, some of our finest examples of humanity poured oil over a road so that vehicles going uphill would suddnely become immobile, and then they would walk right up to the 

doc: 8302 topic: 18

 This has been discussed before, by several people, on this net.  The statement is attributable either to Hajj Amin al-Husseini, former Grand Mufti of Jerusalem - and the leader of the Palestinian dea

doc: 8303 topic: 6

Hi net! Due to further investigation I would like to study the following article: 	Peterson, "Ray tracing general B-Splines", 	Proc. ACM Mountain Regional Conference, April 1986 Unfortunately I didn't

doc: 8304 topic: 6

Hi.       Well, I really hate to make a decision, but recently, I have to choose     whether stacker 3.0 or dos 6.0 with double space for my poor HD.     I am using windwos 3.1 and I hope what I choos

doc: 8305 topic: 16

   Hmmmm, I'm not sure this is true.  According to Mike Lang and good old Stagie, along with the rest of the 


  Wow, the WWII channel did something not-WWII?   The graphics capabilities of the computers were very faked for movie  audiences who have not ability or patience with numbers.  The book was more  rea

doc: 9332 topic: 19

On 20-Apr-93 in Don Cherry - Coach's Corner..  This clip was shown on local news in Pittsburgh last night (KDKA), complete with animated sarcasm by the sportscaster.  It's the second time Cherry has b

doc: 9333 topic: 6

Hi there,  We are running a 120 node Token ring with Windows 3.1 and Novell 3.11.  Every once in a while, we run into "The Black Screen of Death", a phrase coined by Robert X. Cringely in a recent Inf

doc: 9334 topic: 15

On the question, "Does God hear the prayers of sinners?" we need to distinguish.  If we say that He never hears the prayers of any who have sinned, we make pointless all prayers by anyone born less th

doc: 9335 topic: 7

 >specific objections that don't sound frighteningly technical.

doc: 9336 topic: 8

 I'd personally prefer B


doc: 10349 topic: 8

           Well, I don't think your query was exactly polite, but I will TRY to give you a polite responce.  Something atypical of the net, but here it goes.         Black is a descriptive adjective t

doc: 10350 topic: 15

  Christian  Slater, only gota  cameo on ST6,    and besides.  Maybe she can't act:-)

doc: 10351 topic: 8

 a lot of batters lean in when pitches come.  rickey's crouch tends to exaggerate that, i think.  "a great player to watch if you forget who he is" - "unbiased"... hmmm...

doc: 10352 topic: 16

  True, coach Matikainen is ready to keep a spot for Teemu all the way until the medal games. He wants Teppo Numminen, too. And Kurri, but for them the spots cannot be left open for too long. Esa Tikk

doc: 10353 topic: 16

   --Minnesota definitely deserves an NHL franchise!!!  You'll see the Minnesota Whalers pretty soon, so fear not Minnesota fans.  No Norm Green, 'cept for the team color (sorry, bad pun!)     --What 

doc: 10354 topic: 16

   

In [49]:
# As an example the first document (at index 0) is found to be 46% topic 7 and 35% topic 18.
# These values could be utilised to give a more nuanced classification of a document
doc_topic[0]

array([ 0.00990577,  0.00990577,  0.00990577,  0.00990577,  0.00990577,
        0.00990577,  0.00990577,  0.46679051,  0.00990577,  0.00990577,
        0.00990577,  0.00990577,  0.00990577,  0.00990577,  0.00990577,
        0.00990577,  0.00990577,  0.00990577,  0.3549057 ,  0.00990577])

In [None]:
# The above two cells can be used with the NMF model too. Just change the lda to nmf and tfidf to tf. Try it below this comment
# and compare the results