<a href="https://colab.research.google.com/github/ombhope/ma22c030_2023_pl/blob/main/Assignment%2014/MA22C030PLA14(Word_vectors).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
documents = { \
    'Lincoln1865':
    'With malice toward none, with charity for all ...' +
    'let us strive on to finish the work we are in ... ' +
    'to do all which may achieve and cherish a just and lasting peace, ' +
    'among ourselves, and with all nations.',

    'TrumpMay26':
    'There is NO WAY (ZERO!) that Mail-In Ballots ' +
    'will be anything less than substantially fraudulent.',

    'Wikipedia':
    'In 1998, Oregon became the first state in the US ' +
    'to conduct all voting exclusively by mail.',

    'FortuneMay26':
    'Over the last two decades, about 0.00006% of total ' +
    'vote-by-mail votes cast were fraudulent.',

    'TheHillApr07':
    'Trump voted by mail in the Florida primary.',

    'KingJamesBible':
    'Wherefore laying aside all malice, and all guile, and ' +
    'hypocrisies, and envies, and all evil speakings',
}

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [7]:
# Create a CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the text data using the vectorizer
X = vectorizer.fit_transform(documents.values())

# Create a DataFrame from the term-document matrix
term_document_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=documents.keys())

print(term_document_df)

                00006  1998  about  achieve  all  among  and  anything  are  \
Lincoln1865         0     0      0        1    3      1    3         0    1   
TrumpMay26          0     0      0        0    0      0    0         1    0   
Wikipedia           0     1      0        0    1      0    0         0    0   
FortuneMay26        1     0      1        0    0      0    0         0    0   
TheHillApr07        0     0      0        0    0      0    0         0    0   
KingJamesBible      0     0      0        0    3      0    4         0    0   

                aside  ...  voting  way  we  were  wherefore  which  will  \
Lincoln1865         0  ...       0    0   1     0          0      1     0   
TrumpMay26          0  ...       0    1   0     0          0      0     1   
Wikipedia           0  ...       1    0   0     0          0      0     0   
FortuneMay26        0  ...       0    0   0     1          0      0     0   
TheHillApr07        0  ...       0    0   0     0          0 

                00006  1998  about  achieve  all  among  and  anything  are  \
Lincoln1865         0     0      0        1    3      1    3         0    1   
TrumpMay26          0     0      0        0    0      0    0         1    0   
Wikipedia           0     1      0        0    1      0    0         0    0   
FortuneMay26        1     0      1        0    0      0    0         0    0   
TheHillApr07        0     0      0        0    0      0    0         0    0   
KingJamesBible      0     0      0        0    3      0    4         0    0   

                aside  ...  voting  way  we  were  wherefore  which  will  \
Lincoln1865         0  ...       0    0   1     0          0      1     0   
TrumpMay26          0  ...       0    1   0     0          0      0     1   
Wikipedia           0  ...       1    0   0     0          0      0     0   
FortuneMay26        0  ...       0    0   0     1          0      0     0   
TheHillApr07        0  ...       0    0   0     0          0 

In [8]:
# Create an LSA (Latent Semantic Analysis) model
lsa = TruncatedSVD(n_components=3)

# Fit the LSA model to the term-document matrix
lsa_result = lsa.fit_transform(X)

# Create a DataFrame for the LSA representations
lsa_df = pd.DataFrame(lsa_result, index=documents.keys(), columns=["LSA1", "LSA2", "LSA3"])

# Print the LSA representations of documents
print("LSA representations of documents:")
print(lsa_df)

LSA representations of documents:
                    LSA1      LSA2      LSA3
Lincoln1865     7.171422  0.291899 -2.255414
TrumpMay26      0.242132  1.681556  1.050891
Wikipedia       1.719705  3.404743  0.820250
FortuneMay26    0.278378  2.079645  1.290960
TheHillApr07    0.402493  1.639630  0.546144
KingJamesBible  4.226592 -2.270034  3.295868


In [9]:
# Find the vector representation of the word "vote"
word_index = vectorizer.vocabulary_.get("vote")
if word_index is not None:
    word_vector = lsa.components_[:, word_index]
    print("Vector representation of the word 'vote':", word_vector)
else:
    print("The word 'vote' is not in the vocabulary.")

Vector representation of the word 'vote': [0.00383711 0.07797262 0.06555871]


In [10]:
# Compute cosine similarity between 'Lincoln1865' and 'Wikipedia'
cosine_similarity_l1865_wikipedia = cosine_similarity(lsa_df.loc['Lincoln1865'].to_numpy().reshape(1, -1),
                                                       lsa_df.loc['Wikipedia'].to_numpy().reshape(1, -1))

In [11]:
# Compute cosine similarity between 'TrumpMay26' and 'Wikipedia'
cosine_similarity_trump_wikipedia = cosine_similarity(lsa_df.loc['TrumpMay26'].to_numpy().reshape(1, -1),
                                                      lsa_df.loc['Wikipedia'].to_numpy().reshape(1, -1))

In [12]:
print("Cosine similarity between 'Lincoln1865' and 'Wikipedia':", cosine_similarity_l1865_wikipedia[0, 0])
print("Cosine similarity between 'TrumpMay26' and 'Wikipedia':", cosine_similarity_trump_wikipedia[0, 0])

Cosine similarity between 'Lincoln1865' and 'Wikipedia': 0.39098119916613594
Cosine similarity between 'TrumpMay26' and 'Wikipedia': 0.8985895779312532


In [13]:
# Create a TfidfVectorizer object
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data to compute the TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(documents.values())

# Create a DataFrame from the TF-IDF matrix
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=documents.keys())

# Display the TF-IDF matrix
print(tfidf_df)

                  00006      1998    about   achieve       all     among  \
Lincoln1865     0.00000  0.000000  0.00000  0.147276  0.305882  0.147276   
TrumpMay26      0.00000  0.000000  0.00000  0.000000  0.000000  0.000000   
Wikipedia       0.00000  0.272458  0.00000  0.000000  0.188626  0.000000   
FortuneMay26    0.26865  0.000000  0.26865  0.000000  0.000000  0.000000   
TheHillApr07    0.00000  0.000000  0.00000  0.000000  0.000000  0.000000   
KingJamesBible  0.00000  0.000000  0.00000  0.000000  0.426225  0.000000   

                     and  anything       are     aside  ...    voting  \
Lincoln1865     0.362304   0.00000  0.147276  0.000000  ...  0.000000   
TrumpMay26      0.000000   0.26374  0.000000  0.000000  ...  0.000000   
Wikipedia       0.000000   0.00000  0.000000  0.000000  ...  0.272458   
FortuneMay26    0.000000   0.00000  0.000000  0.000000  ...  0.000000   
TheHillApr07    0.000000   0.00000  0.000000  0.000000  ...  0.000000   
KingJamesBible  0.673126   0.

In [14]:
# Define the word vectors for "malice," "vote," and "mail"
malice_vector = tfidf_df.loc['Lincoln1865'].values.reshape(1, -1)
vote_vector = tfidf_df.loc['Wikipedia'].values.reshape(1, -1)
mail_vector = tfidf_df.loc['TrumpMay26'].values.reshape(1, -1)

In [15]:
# Compute the cosine similarity between word vectors
cosine_malice_vote = cosine_similarity(malice_vector, vote_vector)
cosine_mail_vote = cosine_similarity(mail_vector, vote_vector)

In [16]:
print("Cosine similarity between 'malice' and 'vote' using TF-IDF:", cosine_malice_vote[0, 0])
print("Cosine similarity between 'mail' and 'vote' using TF-IDF:", cosine_mail_vote[0, 0])

Cosine similarity between 'malice' and 'vote' using TF-IDF: 0.19513415920023014
Cosine similarity between 'mail' and 'vote' using TF-IDF: 0.0758726657016838
