In [None]:
## Load the required libraries
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
## A toy example: a list of 6 short documents serves as the corpus
## (a corpus is simply a collection of documents).
docs = [
    'Romeo Juliet',
    'Juliet O happy dagger',
    'Romeo died dagger',
    'Live free or die New-Hampshire motto',
    'Did you know New-Hampshire New-England',
    'Romeo in Japan',
]


In [None]:
## Convert the raw text into a structured TF-IDF document-term matrix
## (rows are documents, columns are the vocabulary terms found in the corpus).
tfidf=TfidfVectorizer()
mat=tfidf.fit_transform(docs)
mat


<6x18 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [None]:
# mat is a SciPy sparse matrix: only the non-zero TF-IDF weights are stored
mat

<6x18 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [None]:
# Densify the sparse matrix to inspect every TF-IDF weight (fine for this tiny corpus)
mat.todense()

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.7640961 , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.64510243, 0.        ],
        [0.5355058 , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.65304446, 0.        , 0.        ,
         0.5355058 , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.55902156, 0.        , 0.        , 0.68172171, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.47196441, 0.        ],
        [0.        , 0.        , 0.39699901, 0.        , 0.        ,
         0.39699901, 0.32554487, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.39699901, 0.39699901, 0.32554487,
         0.39699901, 0.        , 0

In [None]:
# View the document-term matrix as a labelled table: one column per vocabulary
# term, with the original document text placed in front as the first column.
term_weights = pd.DataFrame(mat.todense(), columns=tfidf.get_feature_names_out())
Data = pd.concat([pd.Series(docs), term_weights], axis=1)


In [None]:
# Document-term table: the first column holds the original text, the rest the TF-IDF weight per term
Data

Unnamed: 0,0,dagger,did,die,died,england,free,hampshire,happy,in,japan,juliet,know,live,motto,new,or,romeo,you
0,Romeo Juliet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.764096,0.0,0.0,0.0,0.0,0.0,0.645102,0.0
1,Juliet O happy dagger,0.535506,0.0,0.0,0.0,0.0,0.0,0.0,0.653044,0.0,0.0,0.535506,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Romeo died dagger,0.559022,0.0,0.0,0.681722,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.471964,0.0
3,Live free or die New-Hampshire motto,0.0,0.0,0.396999,0.0,0.0,0.396999,0.325545,0.0,0.0,0.0,0.0,0.0,0.396999,0.396999,0.325545,0.396999,0.0,0.0
4,Did you know New-Hampshire New-England,0.0,0.368552,0.0,0.0,0.368552,0.0,0.302218,0.0,0.0,0.0,0.0,0.368552,0.0,0.0,0.604436,0.0,0.0,0.368552
5,Romeo in Japan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.635091,0.635091,0.0,0.0,0.0,0.0,0.0,0.0,0.439681,0.0


In [None]:
## Consider a test query for which the relevant documents from the corpus should be retrieved.
## transform() (not fit_transform) is used so the query is encoded with the already fitted vectorizer.
test = 'die dagger'
t1=tfidf.transform([test])

In [None]:
# Dense view of the query vector: only the query terms present in the vocabulary are non-zero
t1.todense()

matrix([[0.6340862 , 0.        , 0.77326237, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ]])

In [None]:
# Cosine similarity between the query and every document in the raw TF-IDF term space.
# cosine_similarity accepts SciPy sparse input directly and still returns a dense
# ndarray by default, so neither matrix needs to be densified first.
sim1 = cosine_similarity(t1, mat)

In [None]:
# One similarity score per document, in the same order as docs
sim1

array([[0.        , 0.33955684, 0.35446786, 0.3069844 , 0.        ,
        0.        ]])

In [None]:
## To uncover the hidden (latent) relations between terms and documents we apply a
## matrix factorization (SVD) to the document-term matrix. It yields three factors:
##   1. the document-topic matrix,
##   2. the topic strengths (singular values), and
##   3. the topic-term matrix.
## Larger singular values capture more of the information in the data, so the
## components with the smallest singular values can be dropped.
## In this case we keep only the top 2 components.
## random_state is pinned because TruncatedSVD uses a randomized solver by default;
## without it the low-order digits of the results can differ from run to run.
tsvt=TruncatedSVD(n_components=2,random_state=42)


In [None]:
## Perform the matrix factorization on the document-term matrix to explore its latent topics.
lsa=tsvt.fit_transform(mat)  ## lsa holds the document-topic matrix: one row per document, one column per topic


In [None]:
## By default the output is the document-topic matrix, i.e. how strongly each document relates to each topic
lsa


array([[ 7.91975656e-01, -9.39734015e-18],
       [ 6.77369665e-01, -2.98383347e-16],
       [ 6.90765217e-01,  4.88562019e-16],
       [-1.95945655e-16,  8.04722430e-01],
       [ 5.51276105e-17,  8.04722430e-01],
       [ 4.69548168e-01, -6.49377392e-17]])

In [None]:
# `display` is imported from its public location; the IPython.core.display path is
# deprecated and no longer provides `display` in recent IPython releases.
from IPython.display import display
## A more readable view of the document-topic matrix: label the topic columns
## and show the original text next to each row.
Data=pd.DataFrame(lsa,columns=['Topic1','Topic2'])
Data['Original']=docs
display(Data[['Original','Topic1','Topic2']])


Unnamed: 0,Original,Topic1,Topic2
0,Romeo Juliet,0.7919757,-9.39734e-18
1,Juliet O happy dagger,0.6773697,-2.983833e-16
2,Romeo died dagger,0.6907652,4.88562e-16
3,Live free or die New-Hampshire motto,-1.959457e-16,0.8047224
4,Did you know New-Hampshire New-England,5.5127610000000007e-17,0.8047224
5,Romeo in Japan,0.4695482,-6.493774e-17


In [None]:
# components_ is the topic-term matrix: one row per topic, one column per vocabulary term
print(tsvt.components_)
print(tsvt.components_.shape)


[[ 4.19853910e-01  4.17213351e-17 -9.19884368e-17  2.64009102e-01
   4.63958603e-17 -9.24553294e-17 -4.18661065e-17  2.47998930e-01
   1.67184969e-01  1.67184969e-01  5.42629287e-01  4.63632373e-17
  -9.23942622e-17 -9.23942622e-17  2.91488811e-18 -9.23942622e-17
   5.84953201e-01  4.46488426e-17]
 [ 1.02839840e-16  2.28993171e-01  2.46668290e-01  4.37327401e-16
   2.28993171e-01  2.46668290e-01  3.90049203e-01 -3.36154825e-16
  -1.48625516e-16 -1.48625516e-16 -2.50101176e-16  2.28993171e-01
   2.46668290e-01  2.46668290e-01  5.77826879e-01  2.46668290e-01
   2.81666887e-16  2.28993171e-01]]
(2, 18)


In [None]:
# Vocabulary terms, in the column order of the TF-IDF matrix.
# The fitted vectorizer is named `tfidf` (the misspelling `tfid` is undefined and raises a NameError).
dicti=tfidf.get_feature_names_out()
dicti


array(['dagger', 'did', 'die', 'died', 'england', 'free', 'hampshire',
       'happy', 'in', 'japan', 'juliet', 'know', 'live', 'motto', 'new',
       'or', 'romeo', 'you'], dtype=object)

In [None]:
# Term-topic table: one row per vocabulary term, one column per topic, holding the
# weight of each term in each SVD component.
encoding_matrix = pd.DataFrame(
    tsvt.components_.T,
    index=dicti,
    columns=['Topic1', 'Topic2'],
)
encoding_matrix


Unnamed: 0,Topic1,Topic2
dagger,0.4198539,1.028398e-16
did,4.172134e-17,0.2289932
die,-9.198844e-17,0.2466683
died,0.2640091,4.373274e-16
england,4.6395860000000005e-17,0.2289932
free,-9.245533e-17,0.2466683
hampshire,-4.186611e-17,0.3900492
happy,0.2479989,-3.361548e-16
in,0.167185,-1.486255e-16
japan,0.167185,-1.486255e-16


In [None]:
## The search query 'die dagger' has been converted into a TF-IDF vector with one entry per vocabulary term
t1.todense()


matrix([[0.6340862 , 0.        , 0.77326237, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ]])

In [None]:
# Project the TF-IDF query vector into the topic space using the already fitted SVD
query_vec=tsvt.transform(t1)
query_vec


array([[0.26622357, 0.19073931]])

In [None]:
# Shape of the query vector: (1 query, number of vocabulary terms)
t1.shape


(1, 18)

In [None]:
## A calculation showing how the document-topic matrix is created:
## the document-term matrix is multiplied with the transpose of the topic-term
## matrix (components_), which reproduces the values returned by fit_transform.
## The sparse matrix is multiplied directly, so the result is a plain ndarray
## rather than the discouraged np.matrix that todense() would produce.
doc_trains = mat @ tsvt.components_.T
doc_trains


matrix([[ 7.91975656e-01, -9.39734015e-18],
        [ 6.77369665e-01, -2.98383347e-16],
        [ 6.90765217e-01,  4.88562019e-16],
        [-1.95945655e-16,  8.04722430e-01],
        [ 5.51276105e-17,  8.04722430e-01],
        [ 4.69548168e-01, -6.49377392e-17]])

In [None]:
## Now get the cosine similarity of the query to all the documents, this time in the topic space
sim=cosine_similarity(np.asarray(query_vec),np.asarray(doc_trains))


In [None]:
## Flatten the (1, n_docs) similarity matrix into a one-dimensional array
sim = sim.ravel()
sim


array([0.81289556, 0.81289556, 0.81289556, 0.58240949, 0.58240949,
       0.81289556])

In [None]:
# Show each original document next to its topic-space cosine similarity to the query
pd.concat([Data['Original'],pd.Series(sim)],axis=1)

Unnamed: 0,Original,0
0,Romeo Juliet,0.812896
1,Juliet O happy dagger,0.812896
2,Romeo died dagger,0.812896
3,Live free or die New-Hampshire motto,0.582409
4,Did you know New-Hampshire New-England,0.582409
5,Romeo in Japan,0.812896
