**Task 1: Using CountVectorizer**

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Define the corpus
c = {
    'Lincoln1865': 'With malice toward none...',
    'TrumpMay26': 'There is NO WAY...',
    'Wikipedia': 'In 1998, Oregon became...',
    'FortuneMay26': 'Over the last two decades...',
    'TheHillApr07': 'Trump voted by mail...',
    'KingJamesBible': 'Wherefore laying aside all malice...',
}

# Create a CountVectorizer instance
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(list(c.values()))

# Convert to DataFrame
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=c.keys())
print(df)

                1998  all  aside  became  by  decades  in  is  last  laying  \
Lincoln1865        0    0      0       0   0        0   0   0     0       0   
TrumpMay26         0    0      0       0   0        0   0   1     0       0   
Wikipedia          1    0      0       1   0        0   1   0     0       0   
FortuneMay26       0    0      0       0   0        1   0   0     1       0   
TheHillApr07       0    0      0       0   1        0   0   0     0       0   
KingJamesBible     0    1      1       0   0        0   0   0     0       1   

                ...  over  the  there  toward  trump  two  voted  way  \
Lincoln1865     ...     0    0      0       1      0    0      0    0   
TrumpMay26      ...     0    0      1       0      0    0      0    1   
Wikipedia       ...     0    0      0       0      0    0      0    0   
FortuneMay26    ...     1    1      0       0      0    1      0    0   
TheHillApr07    ...     0    0      0       0      1    0      1    0   
KingJame

**Task 2: Using custom tokenizer with CountVectorizer**

In [15]:
import spacy

# Load Spacy with English model
nlp = spacy.load('en_core_web_sm')

# Custom tokenizer function using lemmatization
def custom_tokenizer(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

# Create CountVectorizer with custom tokenizer
vectorizer_lemma = CountVectorizer(tokenizer=custom_tokenizer)
X_lemma = vectorizer_lemma.fit_transform(list(c.values()))

# Convert to DataFrame
df_lemma = pd.DataFrame(X_lemma.toarray(), columns=vectorizer_lemma.get_feature_names_out(), index=c.keys())
print(df_lemma)

                aside  decade  lay  mail  malice  oregon  trump  vote  way  \
Lincoln1865         0       0    0     0       1       0      0     0    0   
TrumpMay26          0       0    0     0       0       0      0     0    1   
Wikipedia           0       0    0     0       0       1      0     0    0   
FortuneMay26        0       1    0     0       0       0      0     0    0   
TheHillApr07        0       0    0     1       0       0      1     1    0   
KingJamesBible      1       0    1     0       1       0      0     0    0   

                wherefore  
Lincoln1865             0  
TrumpMay26              0  
Wikipedia               0  
FortuneMay26            0  
TheHillApr07            0  
KingJamesBible          1  




**Task 3: Compute LSA (Latent Semantic Analysis)**

In [16]:
from sklearn.decomposition import TruncatedSVD

# Perform LSA on the term-document matrix from Task 2
lsa = TruncatedSVD(n_components=3)
lsa.fit(X_lemma)
doc_word_lsa = lsa.transform(X_lemma)
word_doc_lsa = lsa.components_.T

# Vector representation of 'vote'
word_index = list(vectorizer_lemma.vocabulary_.keys()).index('vote')
vote_representation = word_doc_lsa[word_index]
print("Vector representation of 'vote':", vote_representation)

Vector representation of 'vote': [ 1.35704883e-16  6.36748730e-17 -4.82774442e-01]


**Task 4: Compute cosine similarity**

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between 'malice' and 'vote'
cosine_malice_vote = cosine_similarity(word_doc_lsa[list(vectorizer_lemma.vocabulary_).index('malice')].reshape(1, -1),
                                       word_doc_lsa[list(vectorizer_lemma.vocabulary_).index('vote')].reshape(1, -1))

# Compute cosine similarity between 'mail' and 'vote'
cosine_mail_vote = cosine_similarity(word_doc_lsa[list(vectorizer_lemma.vocabulary_).index('mail')].reshape(1, -1),
                                     word_doc_lsa[list(vectorizer_lemma.vocabulary_).index('vote')].reshape(1, -1))

print("Cosine similarity between 'malice' and 'vote':", cosine_malice_vote)
print("Cosine similarity between 'mail' and 'vote':", cosine_mail_vote)

Cosine similarity between 'malice' and 'vote': [[3.37349916e-16]]
Cosine similarity between 'mail' and 'vote': [[1.54311775e-16]]


**Task 5: Compute TF-IDF matrix using TfidfVectorizer**

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)
X_tfidf = tfidf_vectorizer.fit_transform(list(c.values()))

# Convert to DataFrame
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=c.keys())
print(df_tfidf)

                   aside  decade       lay     mail    malice  oregon  \
Lincoln1865     0.000000     0.0  0.000000  0.00000  1.000000     0.0   
TrumpMay26      0.000000     0.0  0.000000  0.00000  0.000000     0.0   
Wikipedia       0.000000     0.0  0.000000  0.00000  0.000000     1.0   
FortuneMay26    0.000000     1.0  0.000000  0.00000  0.000000     0.0   
TheHillApr07    0.000000     0.0  0.000000  0.57735  0.000000     0.0   
KingJamesBible  0.521823     0.0  0.521823  0.00000  0.427903     0.0   

                  trump     vote  way  wherefore  
Lincoln1865     0.00000  0.00000  0.0   0.000000  
TrumpMay26      0.00000  0.00000  1.0   0.000000  
Wikipedia       0.00000  0.00000  0.0   0.000000  
FortuneMay26    0.00000  0.00000  0.0   0.000000  
TheHillApr07    0.57735  0.57735  0.0   0.000000  
KingJamesBible  0.00000  0.00000  0.0   0.521823  




**Task 6: Compute cosine similarity using TF-IDF matrix**

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between 'malice' and 'vote' using TF-IDF matrix
cosine_malice_vote_tfidf = cosine_similarity(X_tfidf[:, vectorizer_lemma.vocabulary_['malice']].reshape(1, -1),
                                             X_tfidf[:, vectorizer_lemma.vocabulary_['vote']].reshape(1, -1))

# Compute cosine similarity between 'mail' and 'vote' using TF-IDF matrix
cosine_mail_vote_tfidf = cosine_similarity(X_tfidf[:, vectorizer_lemma.vocabulary_['mail']].reshape(1, -1),
                                           X_tfidf[:, vectorizer_lemma.vocabulary_['vote']].reshape(1, -1))

print("Cosine similarity between 'malice' and 'vote' (TF-IDF):", cosine_malice_vote_tfidf)
print("Cosine similarity between 'mail' and 'vote' (TF-IDF):", cosine_mail_vote_tfidf)

Cosine similarity between 'malice' and 'vote' (TF-IDF): [[0.]]
Cosine similarity between 'mail' and 'vote' (TF-IDF): [[1.]]
