In [212]:
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.datasets import fetch_20newsgroups
import sys

In [3]:
#Lets download the stop words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sunil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [114]:
#Lets imports and try to understand data or posts on the subject politics from newsgroup 
categories = ['talk.politics.mideast']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data
#Converting the data in corpus to lower case
corpus = [x.lower() for x in corpus]

In [115]:
#Lets see the data
corpus[0]

u'from: amoss@shuldig.cs.huji.ac.il (amos shapira)\nsubject: re: final solution in palestine ?\norganization: inst. of comp. sci., hebrew university, jerusalem, israel\nlines: 30\nnntp-posting-host: shuldig.cs.huji.ac.il\nin-reply-to: ahmeda@mcrcim.mcgill.edu\'s message of sun, 25 apr 93 17:10:03 gmt\n\nahmeda@mcrcim.mcgill.edu (ahmed abu-abed) writes:\n\n|what hamas and islamic jihad believe in, as far as i can get from the arab\n|media,\n|is an islamic state that protects the rights of all its inhabitants under\n|koranic\n|law. this would be a reversal of the 1948 situation in which the jews in\n|palestine took control of the land and its (mostly muslim) inhabitants.\n\nthe borders of the jewish state as drawn by the u.n. included the areas which\ncontained mostly jews,  that\'s what the surveys and the numerous commitees\nwhere after when they visited here.\n\n|however, whoever committed crimes against humanity (torture, blowing up their\n|homes, murders,...) must be treated and tri

In [312]:
stopset = set(stopwords.words('english'))
stopset.update(['\n','ed','orin','mean','tog','gmt','edu','subject', 're','nntp','article',
                'didn','nntp','posting','reply','host','right','utsa','tar','murderous','only',
                '37','cramer','optilink','com','men','inst','org','amos','91904','people',
                'mr','said','00','like','even','one','jake','livni','uci','00','1993',
                '126','bony','000','ve','still','came','000413','0006','714','000246','11186','000246 11186','000th','0000'
                ,'24102','002811 22496','002118','003336','10198','could'])

In [313]:
vectorizer = TfidfVectorizer(stop_words=stopset,
                                 use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

In [314]:
X[0]

<1x319412 sparse matrix of type '<type 'numpy.float64'>'
	with 429 stored elements in Compressed Sparse Row format>

In [315]:
print X[0]

  (0, 141512)	0.0437435936686
  (0, 14230)	0.0437435936686
  (0, 24336)	0.0437435936686
  (0, 150445)	0.0437435936686
  (0, 154139)	0.0437435936686
  (0, 47306)	0.0437435936686
  (0, 220758)	0.0437435936686
  (0, 13978)	0.0437435936686
  (0, 295402)	0.0437435936686
  (0, 133694)	0.0437435936686
  (0, 127764)	0.0437435936686
  (0, 274754)	0.0437435936686
  (0, 74652)	0.0437435936686
  (0, 220774)	0.0437435936686
  (0, 289227)	0.0437435936686
  (0, 105161)	0.0437435936686
  (0, 157444)	0.0437435936686
  (0, 254816)	0.042850268688
  (0, 81441)	0.0542579379143
  (0, 60100)	0.0542579379143
  (0, 292067)	0.0542579379143
  (0, 258530)	0.0542579379143
  (0, 62219)	0.0542579379143
  (0, 310300)	0.0542579379143
  (0, 134488)	0.0542579379143
  :	:
  (0, 8013)	0.0260923648689
  (0, 272629)	0.0394991898791
  (0, 184651)	0.0259095970817
  (0, 182574)	0.0588048926031
  (0, 182685)	0.0638517036215
  (0, 19715)	0.0841023245252
  (0, 8959)	0.0252196219801
  (0, 171176)	0.0076330112169
  (0, 150348)	0.02

In [316]:
X.shape

(940, 319412)

In [317]:
lsa = TruncatedSVD(n_components=20, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=100,
       random_state=None, tol=0.0)

In [318]:
lsa.components_[0]

array([ 0.00031093,  0.00031093,  0.00031093, ...,  0.00142154,
        0.00142154,  0.00142154])

In [319]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print "Concept %d:" % i
    for term in sortedTerms:
        print term[0]
    print " "

Concept 0:
armenian
armenians
turkish
israel
jews
serdar
argic
armenia
serdar argic
israeli
 
Concept 1:
turkish
armenians
armenia
istanbul
russian
armenian
genocide
turkey
ankara
muslim
 
Concept 2:
istanbul
professor
professor history
ankara
osmanli
ermeni
history
foreign office
history university
new
 
Concept 3:
tartars
government
russian
tartar
percent
population
002811 22496
army
jews
would
 
Concept 4:
professor
professor history
history
history university
professor history university
002811
university
armenians
apartment
us
 
Concept 5:
turkish
turks
university
first
genocide
professor
organization
professor history
002811 22496
zuma uucp
 
Concept 6:
armenian
soviet
armenians
time
war
two
know
005019 10716 midway
israeli
us
 
Concept 7:
armenian
lines
004917 3047 news
israel
state
hojali
world
muslims
dead
ottoman
 
Concept 8:
005019
armenians
war
know
armenian
world
uucp
jews
azerbaijani
well
 
Concept 9:
004917 3047
lines
well
first
004917 3047 news
turkey
turkish
would
orga

When we look at the different concepts we can interpert that the data is something about the israel, arabs, jews,turkish,russians,tartars,armenians, genocide and something about the history .
