# LSA  #

s

In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.datasets import fetch_20newsgroups

In [5]:
raw_data = fetch_20newsgroups(shuffle=True, remove=('headers', 'footers', 'quotes'), random_state=1379)

In [14]:
#Get target names from this dataset:
raw_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [28]:
#Here we normalize the text and make it lower case.
from cucco import Cucco
from cucco.config import Config
norm_esp = Cucco(Config(language='en'))
normalized_text = pd.Series(raw_data.data).str.replace("[^a-zA-Z#]", " ")\
    .apply(lambda row: norm_esp.normalize(row.lower()))
normalized_text.head(5)

0    wrote response dlecoint garnet acns fsu darius...
1    soner yamen responded article kr nic umass bur...
2    didnot master wisdom clling deserve confuse ha...
3    driver bernoulli cartridge dataframe xp hard d...
4    explore interesting paragraph point point sent...
dtype: object

In [21]:
#How many unique words are there?
len(set(word for word in " ".join(row for row in normalized_text).split()))

105817

In [29]:
#Create tf-idf vectors
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(normalized_text)
X.shape

(11314, 72767)

In [53]:
from sklearn.decomposition import TruncatedSVD
svd_model = TruncatedSVD(n_components=8, algorithm='randomized', n_iter=100, random_state=1379)
svd_model.fit(X)
len(svd_model.components_)

8

In [54]:
transformed = pd.DataFrame(index=raw_data.target, columns=['Topic'+str(i) for i in range(len(svd_model.components_))],
                           data=svd_model.transform(X))
transformed['text'] = normalized_text
transformed.sample(n=8, random_state=192)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,text
7,1.065897e-06,0.05734,0.415979,-0.194606,-0.397748,-0.042633,0.569785,-0.224266,attorney general determined past hopes king be...
4,0.002013044,0.200812,0.932175,-0.337722,-0.742314,-0.038048,0.023652,-0.42671,explore interesting paragraph point point sent...
7,0.0002263821,0.067773,0.478969,-0.219882,-0.557016,-0.049563,0.485923,-0.286313,attorney general determined past hopes king be...
15,0.0001774488,0.129708,0.94688,-0.423604,-0.731114,-0.07398,1.001722,-0.78862,severe problem news headers ftp cs purdue pub ...
13,1.797756e-07,0.004846,0.041772,-0.019584,-0.009806,-0.002825,0.014784,-0.025032,homeland nagarno karabagh armenians homeland t...
3,0.001499249,0.076623,0.384018,-0.177681,-0.425281,-0.002296,-0.19135,-0.220382,driver bernoulli cartridge dataframe xp hard d...
16,7.545856e-07,0.050342,0.400279,-0.182096,-0.272544,-0.03266,0.437838,0.120417,results mathematica test posted newsgroup test...
5,2.482944e-05,0.040499,0.310425,-0.142802,-0.157059,-0.044304,-0.183454,-0.190054,alright ignore delved bit deeper xkeyevent fou...


In [82]:
#We need to normalize the data:
from sklearn.preprocessing import normalize
normalized = normalize(transformed.values[:,:-1])

Usually in this form the coefficients are hard to interpret. But we can use K-means clustering in this low dimensional space to find groups of products that are close together.

In [97]:
from sklearn.cluster import KMeans

clustering = KMeans(n_clusters=20, random_state=1389)
clustering.fit(normalized)
summary = pd.DataFrame({'group': raw_data.target, 'prediction': clustering.labels_})

In [93]:
import unittest
class TestImpurity(unittest.TestCase):

    def test_impurity(self):
        test_array = pd.Series([1,2,3,5,5])
        self.assertEqual(impurity(test_array), 0.6)
unittest.main(argv=[''], verbosity=0, exit=False)

ERROR: test_impurity (__main__.TestImpurity)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-93-068fced383c6>", line 6, in test_impurity
    self.assertEqual(impurity(test_array), 0.6)
  File "<ipython-input-92-d3b73ae07e66>", line 2, in impurity
    return values.value_counts.max()/ values.count()
AttributeError: 'function' object has no attribute 'max'

----------------------------------------------------------------------
Ran 1 test in 0.001s

FAILED (errors=1)


<unittest.main.TestProgram at 0x1f3bafe72e8>

In [96]:
def impurity(values: pd.Series):
    return 1-values.value_counts().max()/ values.count()
unittest.main(argv=[''], verbosity=0, exit=False)    

----------------------------------------------------------------------
Ran 1 test in 0.002s

OK


<unittest.main.TestProgram at 0x1f3bb275e80>

In [101]:
#Now we calculate impurity for all newsgroups
for group in range(20):
    print(f'{raw_data.target_names[group]}: {impurity(summary[summary.group == group].prediction)}')

alt.atheism: 0.725
comp.graphics: 0.7842465753424658
comp.os.ms-windows.misc: 0.8020304568527918
comp.sys.ibm.pc.hardware: 0.7508474576271187
comp.sys.mac.hardware: 0.78719723183391
comp.windows.x: 0.6913996627318718
misc.forsale: 0.7982905982905983
rec.autos: 0.7828282828282829
rec.motorcycles: 0.8260869565217391
rec.sport.baseball: 0.7437185929648241
rec.sport.hockey: 0.6516666666666666
sci.crypt: 0.8084033613445378
sci.electronics: 0.868020304568528
sci.med: 0.7962962962962963
sci.space: 0.8195615514333896
soc.religion.christian: 0.5141903171953255
talk.politics.guns: 0.7893772893772893
talk.politics.mideast: 0.7287234042553192
talk.politics.misc: 0.7204301075268817
talk.religion.misc: 0.6896551724137931


The impurity measure of 19/20 == 0.95 implies completely random data. However we see that our clustering did not improve impurity measure significantly.