In [1]:
import numpy as np
import pandas as pd

- Principal Component Analysis (PCA)
- Latent Semantic Analysis (LSA), which is based on Singular Value Decomposition (SVD)
- Linear Discriminant Analysis (LDA)
- Latent Dirichlet Allocation (LDiA)
<br>
- Non-negative Matrix Factorization (NMF)

- <b>U matrix: the term-topic matrix or h: word belonging to topic</b><br>
-> is called the “left singular vectors” because it contains row vectors that should be multiplied by a matrix of column vectors from the left <br>
-> is the cross-correlation between words and topics based on word co-occurrence in the same document.<br><br>

-  <b>Sigma or S matrix: the topic “singular values”</b><br>
-> tells you how much information is captured by each dimension in your new semantic (topic) vector space<br><br>

- <b>V matrix: how often documents use the same topics or w: document belonging to topic</b><br>
-> contains the “right singular vectors” as the columns of the document-topic matrix. <br>
-> shared meaning between documents<br><br>
(https://livebook.manning.com/book/natural-language-processing-in-action/chapter-4/point-7616-203-203-0)




### load data

In [2]:
# Read the SMS spam corpus: the first CSV column becomes the row index and
# every row containing a missing value is discarded.
data_raw = (
    pd.read_csv('data/sms-spam.csv', sep=',', index_col=[0])
    .dropna()
)
data_raw

Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
4832,1,This is the 2nd time we have tried 2 contact u...
4833,0,Will ü b going to esplanade fr home?
4834,0,"Pity, * was in mood for that. So...any other s..."
4835,0,The guy did some bitching but I acted like i'd...


In [3]:
# number of messages flagged as spam (sum over the 0/1 `spam` column)
data_raw['spam'].sum()

638

### tokenize and get frequencies

In [4]:
from nltk.tokenize.casual import casual_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# TF-IDF document-term matrix: one row per message, one column per token.
tfidf = TfidfVectorizer(tokenizer=casual_tokenize, stop_words='english')
tfidf_tdm = pd.DataFrame(
    # .toarray() yields a plain ndarray instead of the discouraged np.matrix
    tfidf.fit_transform(data_raw['text']).toarray(),
    # get_feature_names() was removed in scikit-learn 1.2. Ordering the terms by
    # their column index in vocabulary_ gives the same labels on every version
    # (equivalent to get_feature_names_out() on scikit-learn >= 1.0).
    columns=sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get),
)
tfidf_tdm

Unnamed: 0,!,"""",#,#150,#5000,$,%,&,',(,...,ü'll,–,—,‘,’,“,…,┾,〨ud,鈥
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.106887,0.136572,0.131767,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4832,0.100932,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4833,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4834,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4835,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# split the fitted vocabulary into two parallel tuples:
# the terms and the column numbers they map to
terms = tuple(tfidf.vocabulary_.keys())
col_nums = tuple(tfidf.vocabulary_.values())
tfidf.vocabulary_.keys()



In [7]:
# total number of documents (rows of the text column)
len(data_raw['text'])

4837

In [8]:
# total number of tokens over all documents
# caution: stop words have NOT been removed for this count
sum(map(len, map(casual_tokenize, data_raw['text'])))

103260

In [9]:
# 1st parameter: mean number of tokens per document
# (total token count divided by the number of documents)
sum(map(len, map(casual_tokenize, data_raw['text']))) / len(data_raw['text'])

21.34794293983874

### Analysis

#### PCA: Principal Component Analysis

In [10]:
from sklearn.decomposition import PCA

In [11]:
# Fit a 16-component PCA on the TF-IDF matrix.
# random_state pins the randomized SVD solver that svd_solver='auto' can pick
# for large inputs, so Restart & Run All reproduces the same components.
pca = PCA(n_components=16, random_state=42).fit(tfidf_tdm)

In [12]:
# singular values of the fitted PCA, one per retained component (the S / Sigma of the notes above)
pca.singular_values_

array([8.10498952, 7.37905136, 6.74229307, 6.27760889, 6.08017988,
       5.66493984, 5.61880696, 5.14082811, 5.06280531, 4.9164906 ,
       4.78445923, 4.57074409, 4.45137488, 4.22746975, 4.20854228,
       4.13638348])

In [13]:
# term-topic loadings (U or h in the notes above):
# one row per principal component, one column per term.
# The columns are taken from the matrix the PCA was fitted on; this keeps the
# labels aligned with the data and avoids TfidfVectorizer.get_feature_names(),
# which was removed in scikit-learn 1.2.
pca_h = pd.DataFrame(pca.components_, columns=tfidf_tdm.columns)
pca_h

Unnamed: 0,!,"""",#,#150,#5000,$,%,&,',(,...,ü'll,–,—,‘,’,“,…,┾,〨ud,鈥
0,-0.008925,0.010068,-0.000787,-0.000238,-0.000409,0.002071,-0.001282,-0.01052,-0.010621,-0.001688,...,0.002851,0.000508,-1.847029e-06,-0.001462,-0.001059,-0.000691,-0.000406,0.001319,0.001319,0.001319
1,-0.12034,-0.013612,-0.000926,2e-05,-0.000307,0.002466,0.00056,-0.018211,0.002942,-0.012448,...,-3.2e-05,-0.001988,3.12142e-06,-0.007103,0.000326,0.000174,-0.004427,0.00026,0.00026,0.00026
2,-0.026862,0.008251,-0.000809,-0.000176,-5.7e-05,0.004427,-0.001084,0.008369,0.010946,0.003141,...,-0.001376,5.2e-05,8.45132e-07,0.003813,0.000378,0.000131,0.003507,-0.000264,-0.000264,-0.000264
3,-0.272803,-0.066581,0.002451,-0.000316,-0.002084,-0.003766,-0.000572,-0.054523,-0.008863,-0.01488,...,0.000876,0.000218,1.036952e-05,-0.002916,-0.000304,-0.000689,-6e-06,8e-06,8e-06,8e-06
4,0.560218,-0.008118,0.000253,0.0006,0.002083,0.002646,-0.000583,0.044022,0.015193,0.012998,...,-0.00073,-0.001143,9.459191e-06,0.009603,0.000836,0.001027,0.004969,0.000896,0.000896,0.000896
5,0.151772,0.049894,-0.000635,0.0004,0.000272,-0.005703,-0.000619,0.049968,0.009686,0.007022,...,-0.00322,-0.00096,7.849629e-06,0.005388,0.000321,0.000137,0.001586,-0.000897,-0.000897,-0.000897
6,0.199077,-0.042648,0.000373,-8.2e-05,0.000968,-0.001698,-0.001541,-0.010233,0.001063,0.00226,...,-0.000154,0.004029,-8.541571e-06,0.01837,-0.000204,0.000234,0.00334,2.2e-05,2.2e-05,2.2e-05
7,0.161359,0.062536,0.001692,-0.000422,-0.000788,-0.001057,-0.002071,0.009446,0.042585,-0.019365,...,-0.002542,-0.002423,2.498494e-05,0.008733,-9.4e-05,-0.000238,0.006556,0.000914,0.000914,0.000914
8,0.441878,-0.04555,-0.001974,0.000218,0.000352,-0.006314,-0.004041,-0.020504,-0.015379,-0.025237,...,-0.001459,-0.004387,1.022561e-05,-0.001813,-0.001106,-0.000264,-0.001435,0.000537,0.000537,0.000537
9,-0.269911,-0.026747,0.001105,0.000197,-0.001023,0.002145,0.002283,0.025673,0.005031,0.023303,...,-0.002172,0.002064,-4.479797e-06,0.001932,0.001237,0.000216,0.002178,-0.000978,-0.000978,-0.000978


In [14]:
# V or w: document belonging to topic —
# every document projected onto the fitted principal components
pca_w = pd.DataFrame(data=pca.transform(X=tfidf_tdm))
pca_w

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.185094,0.049955,0.056363,-0.073290,-0.041905,-0.007562,-0.080031,0.063549,-0.044183,-0.030211,-0.040299,-0.012381,-0.000317,0.066072,0.027406,-0.026731
1,0.340653,0.256220,0.003327,0.034277,0.013262,0.110380,0.025315,-0.021785,0.019101,0.054272,-0.009690,0.029491,-0.018842,-0.031019,-0.017811,-0.002668
2,-0.039344,-0.019761,0.000071,-0.072602,0.047106,0.016022,0.010765,-0.084987,-0.062222,0.062266,-0.019459,-0.141838,-0.027321,-0.048766,0.042469,0.036068
3,0.286352,0.149240,-0.041389,0.041452,0.091798,0.097965,-0.219490,-0.030634,0.006200,0.084516,-0.025167,0.067420,-0.017186,-0.008827,-0.036288,-0.016223
4,0.011463,-0.082201,0.092112,-0.015334,0.011834,-0.020791,0.028091,0.053372,-0.085603,0.059805,0.024802,0.045106,-0.018468,0.017266,-0.020318,-0.023975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4832,-0.146412,0.064817,-0.056118,0.060890,0.200010,0.212028,-0.133920,0.012382,0.074878,0.092487,-0.018410,-0.140746,0.061937,0.060912,0.022344,0.002637
4833,0.086300,-0.072729,-0.096108,0.075633,-0.057935,-0.108646,0.017420,-0.073670,-0.018987,-0.141645,0.153516,-0.090595,0.117365,0.109011,0.128277,0.064933
4834,0.111894,0.017436,0.037620,0.088616,0.050575,-0.083185,-0.004257,0.157220,-0.024116,-0.023018,-0.057043,-0.035216,-0.041735,-0.035323,0.013527,0.014455
4835,-0.000176,-0.037836,-0.007952,-0.044839,-0.000710,-0.039495,0.028463,-0.067920,-0.062693,0.025013,0.000583,-0.006858,-0.043051,-0.006200,-0.028925,0.024859


In [15]:
# To which topics does a word belong?
# Select the term column(s) of interest (list several terms to add up their
# loadings), scale the rounded loadings by 100 and total them per topic.
mytopics = pca_h[['!']].round(3) * 100
mytopics.sum(axis=1)

0     -0.9
1    -12.0
2     -2.7
3    -27.3
4     56.0
5     15.2
6     19.9
7     16.1
8     44.2
9    -27.0
10     4.9
11    10.3
12    15.4
13    20.1
14   -13.7
15     0.3
dtype: float64

#### SVD: Singular Value Decomposition / LSA: Latent Semantic Analysis
- linear-algebra based (matrix factorization)
- can be used with sparse matrices
- the fitted `TruncatedSVD` model has no `vocabulary_` or `get_feature_names()`; term names have to come from the vectorizer

In [16]:
from sklearn.decomposition import TruncatedSVD

In [17]:
# Truncated SVD (LSA) with 16 topics on the TF-IDF matrix.
# TruncatedSVD uses a randomized solver by default; a fixed random_state makes
# the topic vectors reproducible across kernel restarts.
svd = TruncatedSVD(n_components=16, n_iter=100, random_state=42)
# document-topic vectors: one row per document, one column per SVD topic
svd_topic_vectors = pd.DataFrame(svd.fit_transform(tfidf_tdm.values))
svd_topic_vectors.round(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.136,0.193,-0.034,0.056,0.021,0.110,-0.066,0.018,-0.067,-0.029,-0.038,0.001,-0.008,0.056,0.027,-0.026
1,0.191,0.365,-0.204,0.003,-0.007,-0.020,0.108,0.053,0.022,0.055,-0.009,0.031,-0.012,-0.030,-0.005,0.018
2,0.083,-0.041,0.004,-0.000,0.073,0.007,0.023,0.020,-0.007,0.030,-0.044,-0.170,-0.052,-0.024,-0.077,-0.002
3,0.192,0.307,-0.091,-0.041,0.070,-0.021,-0.044,0.242,0.009,0.086,-0.022,0.068,-0.000,-0.017,0.018,0.030
4,0.069,0.007,0.077,0.092,0.018,-0.001,-0.001,-0.036,-0.103,0.057,0.022,0.027,-0.001,0.015,0.012,0.011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4832,0.331,-0.118,0.012,-0.054,0.172,-0.032,0.088,0.199,0.051,0.103,-0.022,-0.105,0.006,0.021,0.098,-0.109
4833,0.108,0.087,0.090,-0.095,-0.095,-0.044,-0.071,-0.051,0.030,-0.155,0.140,-0.143,0.109,0.144,-0.046,-0.084
4834,0.220,0.128,0.042,0.039,0.009,-0.053,-0.078,-0.079,-0.098,-0.003,-0.045,0.029,-0.066,-0.057,0.022,-0.059
4835,0.041,-0.006,0.016,-0.008,0.010,-0.009,-0.009,-0.017,-0.019,0.002,-0.014,-0.045,-0.026,0.004,-0.029,0.069


In [18]:
# singular values of the fitted TruncatedSVD, one per retained component
svd.singular_values_

array([10.79931695,  8.09052189,  7.23495689,  6.74223565,  6.11199117,
        5.97815709,  5.64336163,  5.55900368,  5.08213819,  4.92444752,
        4.7885953 ,  4.63517396,  4.47280897,  4.24870885,  4.22463554,
        4.2131527 ])

In [19]:
# component matrix of the fitted TruncatedSVD: one row per component;
# columns presumably follow the column order of tfidf_tdm — TODO confirm
svd.components_

array([[ 1.76971469e-01,  3.47439545e-02,  1.94738308e-03, ...,
         5.17984292e-04,  5.17984292e-04,  5.17984292e-04],
       [-4.52941673e-04,  1.15587324e-02, -6.24913905e-04, ...,
         1.38978260e-03,  1.38978260e-03,  1.38978260e-03],
       [ 1.60585401e-01,  1.71415481e-02,  1.76232482e-03, ...,
         3.09749404e-05,  3.09749404e-05,  3.09749404e-05],
       ...,
       [ 1.38305866e-01, -1.01899066e-01,  8.87571175e-04, ...,
        -1.07582472e-03, -1.07582472e-03, -1.07582472e-03],
       [ 1.67485617e-01, -2.22077132e-01, -1.29507362e-03, ...,
        -2.62491045e-04, -2.62491045e-04, -2.62491045e-04],
       [ 1.39043378e-01,  2.89705520e-02,  1.04587237e-03, ...,
         4.75984744e-04,  4.75984744e-04,  4.75984744e-04]])

#### LDiA: Latent Dirichlet Allocation
- dirichlet distribution of word frequencies
- uses raw BOW count vectors rather than TF-IDF vectors

##### BOW

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# Bag-of-words document-term matrix with raw token counts (the input LDiA uses).
cv = CountVectorizer(tokenizer=casual_tokenize, stop_words='english')
cv_tdm = pd.DataFrame(
    # .toarray() yields a plain ndarray instead of the discouraged np.matrix
    cv.fit_transform(data_raw.text).toarray(),
    # get_feature_names() was removed in scikit-learn 1.2. Ordering the terms by
    # their column index in vocabulary_ gives the same labels on every version
    # (equivalent to get_feature_names_out() on scikit-learn >= 1.0).
    columns=sorted(cv.vocabulary_, key=cv.vocabulary_.get),
)

In [23]:
# raw text of the message with index label 0
data_raw.loc[0, 'text']

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [24]:
# tokens that actually occur in row 0 of the bag-of-words matrix, with their counts
cv_tdm.loc[0].loc[lambda counts: counts > 0]

,            1
..           1
...          2
amore        1
available    1
buffet       1
bugis        1
cine         1
crazy        1
e            1
got          1
great        1
jurong       1
la           1
n            1
point        1
wat          1
world        1
Name: 0, dtype: int64

##### Topic vectors with LDiA

In [25]:
from sklearn.decomposition import LatentDirichletAllocation

In [26]:
# Fit a 16-topic LDiA model on the raw bag-of-words counts.
# The model is initialised randomly; a fixed random_state keeps the topics
# (and their numbering) stable across kernel restarts.
ldia = LatentDirichletAllocation(n_components=16, learning_method='batch', random_state=42)
ldia = ldia.fit(cv_tdm)
# (number of topics, number of terms)
ldia.components_.shape

(16, 8961)

In [27]:
# h: topic-term weights — one row per LDiA topic, one column per vocabulary term
# (column labels are the vocabulary terms in sorted order)
ldia_h = pd.DataFrame(data=ldia.components_, columns=sorted(cv.vocabulary_.keys()))
# Alternative orientation (terms as rows):
#   ldia_h = pd.DataFrame(ldia.components_.T, index=sorted(cv.vocabulary_))
#   or simply ldia_h.T

In [28]:
# ten highest-weighted terms of topic 1
ldia_h.iloc[1].sort_values(ascending=False).head(10)

.        314.174119
,        280.856320
?        180.052133
!        139.716486
u        114.803672
ur        96.424776
free      96.270524
text      83.235022
reply     67.911321
new       64.764514
Name: 1, dtype: float64

In [None]:
# three highest-weighted terms of every LDiA topic, keyed by topic number
frequent_words = {
    topic: ldia_h.iloc[topic].sort_values(ascending=False).head(3).index.tolist()
    for topic in range(ldia.n_components)
}
frequent_words

In [None]:
# w: topic weights per document — one row per document, one column per LDiA topic
ldia_w = pd.DataFrame(data=ldia.transform(X=cv_tdm))
ldia_w

In [None]:
# get dominant topic per document
# Only the topic columns may take part in the argmax. When this cell is re-run,
# ldia_w already carries a 'dominant' column whose integer topic ids would
# otherwise outrank the topic weights and silently corrupt the result.
ldia_w['dominant'] = np.argmax(
    ldia_w.drop(columns='dominant', errors='ignore').values, axis=1
)
ldia_w

#### NMF: Non-negative Matrix Factorization
- accepts tf-idf

In [None]:
from sklearn.decomposition import NMF

In [None]:
# 16-topic NMF model; random_state fixes the seed used by initialisation and
# the solver so repeated runs produce the same factorization.
nmf = NMF(n_components=16, random_state=42)

In [None]:
# h: topic-term weights of the NMF model fitted on the TF-IDF matrix —
# one row per topic, one column per term.
# The columns are taken from the matrix the model is fitted on; this keeps the
# labels aligned with the data and avoids TfidfVectorizer.get_feature_names(),
# which was removed in scikit-learn 1.2.
h_nmf = pd.DataFrame(nmf.fit(tfidf_tdm).components_, columns=tfidf_tdm.columns)
h_nmf

In [None]:
# three highest-weighted terms of every NMF topic, keyed by topic number
frequent_words = {
    topic: h_nmf.iloc[topic].sort_values(ascending=False).head(3).index.tolist()
    for topic in range(nmf.n_components)
}
frequent_words

In [None]:
# w: topic weights per document — one row per document, one column per NMF topic
w_nmf = pd.DataFrame(data=nmf.transform(X=tfidf_tdm))
w_nmf

In [None]:
# dominant topic per document
# Only the topic columns may take part in the argmax. When this cell is re-run,
# w_nmf already carries a 'dominant' column whose integer topic ids would
# otherwise outrank the topic weights and silently corrupt the result.
w_nmf['dominant'] = np.argmax(
    w_nmf.drop(columns='dominant', errors='ignore').values, axis=1
)
w_nmf