In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm.notebook as tqdm

In [4]:
# Alternative dataset to test with (Amazon Fine Food Reviews):
# https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews
# df = pd.read_csv('./Reviews.csv')

### Load Data

In [12]:
# Load the dataset from a CSV given by a relative path (resolved against the
# current working directory); read_csv raises FileNotFoundError if it is absent.
df = pd.read_csv('nytimes_data_final.csv')

FileNotFoundError: [Errno 2] No such file or directory: './nytimes_data_final.csv'

In [13]:
# Document texts: the values of the 'text' column, one entry per row of df.
corpus = df['text'].values

NameError: name 'df' is not defined

In [None]:
# Number of documents in the corpus.
len(corpus)

### Configurations

In [None]:
# Settings handed to TfidfVectorizer as keyword arguments of the same names.

def tokenizer(x):
    """Split a document into tokens on runs of whitespace."""
    return x.split()


# Text preprocessing
lowercase = True
analyzer = 'word'
stop_words = 'english'

# Vocabulary construction
ngram_range = (1, 1)
max_features = None

# Term weighting
norm = 'l2'
smooth_idf = True

### The Fast Version

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Build the TF-IDF vectorizer from the configuration values defined earlier.
vectorizer = TfidfVectorizer(
    lowercase=lowercase,
    tokenizer=tokenizer,
    analyzer=analyzer,
    stop_words=stop_words,
    ngram_range=ngram_range,
    max_features=max_features,
    norm=norm,
    smooth_idf=smooth_idf,
)

In [None]:
# Fit the vectorizer on the corpus and get the document-term matrix
# (one row per document, one column per vocabulary term).
X = vectorizer.fit_transform(corpus)

In [None]:
# Terms of the fitted vocabulary.
vectorizer.get_feature_names_out()

In [None]:
# (number of documents, number of vocabulary terms)
X.shape

In [None]:
# Word-to-index mapping: each vocabulary term -> its column index in X.
w2i = vectorizer.vocabulary_ 

### Functions

In [None]:
def calculate_similarity(embedding, q):
    """Cosine similarity between a query vector and every row of ``embedding``.

    Parameters
    ----------
    embedding : sparse matrix
        Row-indexable matrix with a ``shape`` attribute whose rows expose
        ``.toarray()`` (e.g. the output of ``TfidfVectorizer.fit_transform``).
        Each row is one document vector.
    q : array-like
        Query vector with as many elements as ``embedding`` has columns.
        Any shape is accepted; it is flattened to 1-D.

    Returns
    -------
    list of float
        ``sims[i]`` is the cosine similarity between ``q`` and row ``i``.
        It is ``0.0`` when either vector has zero norm, so that empty
        documents or an out-of-vocabulary query never produce NaN (NaN would
        be ranked first by a descending ``np.argsort(sims)[::-1]``).
    """
    # The query does not change between rows: flatten and measure it once.
    q = np.asarray(q).reshape(-1,)
    q_norm = np.linalg.norm(q)

    sims = []
    for i in tqdm.tqdm(range(embedding.shape[0])):
        v = embedding[i].toarray().reshape(-1,)
        # Cosine similarity divides by the product of BOTH vector norms.
        denom = q_norm * np.linalg.norm(v)
        sims.append(np.dot(q, v) / denom if denom else 0.0)
    return sims

### Use It - Similarity among Documents

In [None]:
# Free-text query to match against the documents.
query = 'Trump twitter'

In [None]:
# Vectorize the query with the already-fitted vectorizer (same feature space
# as X) and densify it: a single-row array.
v_query = vectorizer.transform([query]).toarray()


In [None]:
# Inspect the dense query vector.
v_query

In [None]:
# Score every row (document) of X against the query vector.
sims = calculate_similarity(X, v_query)

In [None]:
# argsort is ascending, so reverse it: document indices from highest to lowest score.
idx = np.argsort(sims)[::-1]

In [None]:
# Texts of the five highest-scoring documents.
corpus[idx[:5]]

### Visualize It

In [None]:
from sklearn.decomposition import PCA

In [None]:
#https://web.stanford.edu/class/cs224n/materials/Gensim%20word%20vector%20visualization.html
def display_pca_scatterplot(words=None, cs=None):
    """Scatter-plot vocabulary words on their first two principal components.

    Each word is represented by its column of the document-term matrix ``X``
    (looked up through ``w2i``). The word vectors are projected with PCA, the
    first two components are plotted, and each point is labelled with its word.

    Parameters
    ----------
    words : iterable of str
        Words to plot. Required despite the ``None`` default. Every word must
        be a key of ``w2i``, spelled exactly as stored in the vocabulary.
    cs : color or sequence of colors, optional
        Passed to ``plt.scatter`` as ``c``. When ``None`` or empty, all points
        are drawn in red.

    Raises
    ------
    KeyError
        If any of ``words`` is not present in ``w2i``.
    """
    # Materialize once: the words are iterated for the lookup and for labelling.
    words = list(words)

    # Fail with a readable message instead of passing None indices
    # (from dict.get) into the sparse-matrix indexing below.
    missing = [w for w in words if w not in w2i]
    if missing:
        raise KeyError(f"words not in the vectorizer vocabulary: {missing}")

    idx = [w2i[w] for w in words]
    word_vectors = X.T[idx].toarray()
    twodim = PCA().fit_transform(word_vectors)[:, :2]

    # Explicit None/empty check: `not cs` raises ValueError for NumPy arrays.
    colors = 'r' if cs is None or np.size(cs) == 0 else cs

    plt.figure(figsize=(10, 10))
    plt.scatter(twodim[:, 0], twodim[:, 1], s=100, edgecolors='k', c=colors)
    for word, (x, y) in zip(words, twodim):
        plt.text(x + 0.05, y + 0.05, word)

In [None]:
# Plot a sample of vocabulary words in 2-D PCA space.
display_pca_scatterplot([
    'trump',
    'republicans',
    'biden',
    'president',
    'shooting',
    'election',
    'lie',
    'twitter',
])