# Document classification with LSA

In [1]:
# Toy corpus for the demo: five short documents, each a handful of keywords.
doc_complete = [
    "Data Science Machine Learning",
    "Money fun Family Kids home",
    "Programming Java Data Structures",
    "Love food health games energy fun",
    "Algorithms Data Computers",
]

# Keep the individual documents available under their own names as well.
doc1, doc2, doc3, doc4, doc5 = doc_complete

# Bare expression so the notebook echoes the combined corpus.
doc_complete

['Data Science Machine Learning',
 'Money fun Family Kids home',
 'Programming Java Data Structures',
 'Love food health games energy fun',
 'Algorithms Data Computers']

In [2]:
# TF-IDF (term frequency–inverse document frequency) is a numerical statistic
# intended to reflect how important a word is to a document in a collection or corpus.
# Like GloVe and word2vec it turns text into numeric vectors, but it is a
# count-based weighting scheme rather than a learned embedding.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

![tf–idf](images/tfidf.png "tf-idf")


In [3]:
# Learn the vocabulary and IDF weights from the corpus and build the
# document-term TF-IDF matrix (one row per document, one column per term).
X = vectorizer.fit_transform(doc_complete)

In [4]:
# X prints in sparse form: each line is (document index, term index) followed
# by the TF-IDF weight of that term in that document.
print(X)

  (0, 2)	0.3606383263504801
  (0, 17)	0.5384979101064753
  (0, 14)	0.5384979101064753
  (0, 12)	0.5384979101064753
  (1, 15)	0.4636932227319092
  (1, 6)	0.3741047724501572
  (1, 4)	0.4636932227319092
  (1, 11)	0.4636932227319092
  (1, 9)	0.4636932227319092
  (2, 2)	0.3606383263504801
  (2, 16)	0.5384979101064753
  (2, 10)	0.5384979101064753
  (2, 18)	0.5384979101064753
  (3, 6)	0.3393931489111758
  (3, 13)	0.4206690600631704
  (3, 5)	0.4206690600631704
  (3, 8)	0.4206690600631704
  (3, 7)	0.4206690600631704
  (3, 3)	0.4206690600631704
  (4, 2)	0.42799292268317357
  (4, 0)	0.6390704413963749
  (4, 1)	0.6390704413963749


In [5]:
# Truncated SVD (LSA) implementation - it's a dimensionality reduction technique.
# NOTE(review): random_state is not set, so the randomized solver may produce
# slightly different components from run to run; pass random_state for
# reproducible output.
from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(n_components=2,n_iter=100) # n_components : number of topics we want to extract

In [6]:
# Fit the truncated SVD on the TF-IDF matrix; the notebook echoes the fitted estimator.
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=2, n_iter=100,
       random_state=None, tol=0.0)

In [7]:
# Component matrix: one row per extracted topic, one column per vocabulary
# term (2 x 19 here). Values near zero mean the term barely contributes.
lsa.components_

array([[ 3.35943318e-01,  3.35943318e-01,  5.84606260e-01,
        -1.14765359e-16,  3.50368019e-16, -1.14765359e-16,
         1.90082786e-16, -1.14765359e-16, -1.14765359e-16,
         3.50368019e-16,  2.68489508e-01,  3.50368019e-16,
         2.68489508e-01, -1.14765359e-16,  2.68489508e-01,
         3.50368019e-16,  2.68489508e-01,  2.68489508e-01,
         2.68489508e-01],
       [ 4.89093390e-15,  4.98647635e-15,  2.32718414e-16,
         2.80200990e-01,  3.08858703e-01,  2.80200990e-01,
         4.75249652e-01,  2.80200990e-01,  2.80200990e-01,
         3.08858703e-01, -4.51068978e-15,  3.08858703e-01,
        -5.19514974e-17,  2.80200990e-01, -5.19514974e-17,
         3.08858703e-01, -4.51068978e-15, -5.19514974e-17,
        -4.51068973e-15]])

In [8]:
# Vocabulary terms, in the same order as the columns of X and lsa.components_.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    # get_feature_names_out() returns an ndarray; convert it so `terms`
    # remains a plain list of str, as it was with the old API.
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()
terms

['algorithms',
 'computers',
 'data',
 'energy',
 'family',
 'food',
 'fun',
 'games',
 'health',
 'home',
 'java',
 'kids',
 'learning',
 'love',
 'machine',
 'money',
 'programming',
 'science',
 'structures']

In [9]:
# Print, for each of the 2 LSA concepts, the ten terms with the largest weight.
for i, comp in enumerate(lsa.components_):
    # zip pairs every vocabulary term with its weight in this component,
    # e.g. ('data', 0.58); sorting on the weight (descending) ranks the terms.
    weighted_terms = zip(terms, comp)
    top_terms = sorted(weighted_terms, key=lambda pair: pair[1], reverse=True)[:10]
    print("Concept %d:" % i)
    for word, _weight in top_terms:
        print(word)
    print(" ")

Concept 0:
data
computers
algorithms
java
programming
structures
learning
machine
science
family
 
Concept 1:
fun
family
home
kids
money
energy
food
games
health
love
 


In [10]:
# Sum the per-component explained-variance ratios of the fitted SVD to get the
# share of variance retained by the two components together.
explained_variance = lsa.explained_variance_ratio_.sum()
# int() truncates, so the reported percentage is rounded down to a whole number.
print("  Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

  Explained variance of the SVD step: 31%


## Drawbacks
LSA is quick and efficient to use, but it does have a few primary drawbacks:

* lack of interpretable embeddings (we don’t know what the topics are, and the components may be arbitrarily positive/negative)
* needs a really large set of documents and a large vocabulary to get accurate results
* less efficient representation

Reference:<br>
https://github.com/chrisjmccormick/LSA_Classification <br>
https://github.com/kernelmachine/pyLSA/blob/master/lsa.py
