In [108]:
# you cannot load any other libaries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm.notebook as tqdm
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from collections import OrderedDict
from nltk.corpus import stopwords
from collections import Counter

In [109]:
df = pd.read_csv('nytimes_data_final.csv')
df = df.drop_duplicates('text')
N = len(df)
corpus = df['text'].values

In [110]:
remove_stopwords = False
use_lemmatization = False
l2_normalize_tf_idf = False
lemmatizer = WordNetLemmatizer()

In [111]:
def calculate_similarity(q, v):
    sim = np.dot(q, v)/(np.linalg.norm(q)* np.linalg.norm(v))
    
    return sim

In [112]:
def tokenize_doc(sent, lemma=False, remove_stopwords=False):
    # a simple tokenizer with case folding and an option to use lemmatization or remove stopwords
    sent = sent.lower()
    tokens = sent.split()
    if lemma:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    if remove_stopwords:
        tokens = [token for token in tokens if token not in stopwords.words('english')]
        
    return tokens

In [113]:
def basic_text_processing(corpus):
    # This function will go through the corpus and outputs two components
    # w2i: the mapping of a vocabular to in index. This is also our vocabulary
    # docs_in_tokens: list of extracted tokens for each document
    vocab = set()
    docs_in_tokens = []
    for doc in corpus:
        tokens = tokenize_doc(doc, lemma = use_lemmatization, remove_stopwords=remove_stopwords)
        vocab.update(set(tokens))
        docs_in_tokens.append(tokens)
    vocab = list(vocab)
    vocab.sort()
    w2i = OrderedDict()
    for i, word in enumerate(vocab):
        w2i[word] = i
    
    return w2i, docs_in_tokens

In [114]:
def calculate_idf(docs_in_tokens, w2i):
    # TASK: given the list of tokens for each document (docs_in_tokens) and the vocabulary (w2i),
    # you are asked to calculate the inverse document frequency (IDF) of each word using the formulation
    # log10(N/(df+1))
    # RETURN: all_idf vector (or a column) contains all the IDF of all words in the vocabulary
    
    all_idf = [] #initialize the list of all idf values
    
    # TODO:
    N = len(docs_in_tokens)
    w_fre = [Counter(doc) for doc in docs_in_tokens]
    for w in w2i:
        idf = 0
        for i,doc in enumerate(docs_in_tokens):
            if w_fre[i][w] > 0:
                idf += 1
        idf = np.log10(idf)
        all_idf.append(idf)
    
    return np.array(all_idf)

In [115]:
def calculate_tf(docs_in_tokens, w2i):
    # TASK: given the list of tokens for each document (docs_in_tokens) and the vocabulary (w2i),
    # you are asked to calculate the term-frequency table or matrix using the formulation:
    # tf = log10(frequency+1)
    # RETURN: tf_matrix as the term-frequency table
    
    tf_matrix = np.zeros((len(w2i), len(docs_in_tokens))) #initialize as a matrix/table of all zeros
    
    # TODO
    N = len(docs_in_tokens)
    w_fre = [Counter(doc) for doc in docs_in_tokens]
    for w in w2i:
        for i,doc in enumerate(docs_in_tokens):
            tf = w_fre[i][w]
            tf_matrix[w2i[w],i] = np.log10(tf+1)
    return tf_matrix

In [116]:
def transform(query, w2i, all_idf):
    # TASK: given a string query, you are asked to utilize the extracted vocabulary (w2i)
    # and idf value for each word to transform a query into a respective tf-idf vector
    # RETURN: tf_idf_query
    
    tf_idf_query = []
    
    #TODO
    q = [tokenize_doc(query, True, remove_stopwords=False)]
    tf_query = calculate_tf(q, w2i)
    #idf_query = calculate_idf(query, w2i)
    tf_idf_query = (tf_query*all_idf.reshape(-1,1)).reshape(-1,)
    #tf_idf_query = tf_idf_query.reshape(-1,)
    return np.array(tf_idf_query)

In [117]:
w2i, docs_in_tokens = basic_text_processing(corpus)
assert len(docs_in_tokens) == len(corpus) 


In [118]:
all_idf = calculate_idf(docs_in_tokens, w2i)
assert len(all_idf) == len(w2i) 
# if you have error in this, please check your calculate_idf function

In [119]:
tf_matrix = calculate_tf(docs_in_tokens, w2i)
assert tf_matrix.shape == (len(w2i), len(docs_in_tokens))
# if you have error in this, please check your calculate_tf function

In [120]:
tf_idf = tf_matrix * all_idf.reshape(-1,1) # final tf-idf is the multiplicatioin of tf and idf

In [121]:
if l2_normalize_tf_idf:
    from sklearn.preprocessing import normalize
    tf_idf = normalize(tf_idf, axis=0)

---

## Evaluation via Information Retrieval

In [122]:
def search(query, k):
    q = transform(query, w2i, all_idf)
    sims = []
    for i in range(tf_idf.shape[1]):
        v = tf_idf[:,i].reshape(-1,)
        sim = np.dot(q, v)/(np.linalg.norm(q)* np.linalg.norm(v))
        sims.append(sim)
    idx = np.argsort(sims)[::-1]
    return idx[:k]

#### Let's try to search a document from the corpus via a query

In [123]:
query = "Trump and Biden"
found_idx = search(query, 10)
corpus[found_idx]

  sim = np.dot(q, v)/(np.linalg.norm(q)* np.linalg.norm(v))


array(['Together.', 'Sincere', 'Airy Snack Item', '‘Weather’',
       'Trump Speaks! And Speaks. And Speaks …',
       'Trump and Transgender Rights',
       'Trump Thinks He’s 2020’s ‘Law and Order’ Candidate. He’s Not.',
       'TikTok Teens and K-Pop Stans Say They Sank Trump Rally',
       'On North Korea and Iran, Bolton Blames ‘the Split Between Trump and Trump’',
       'Switching Letters, Skipping Lines: Troubled and Dyslexic Minds'],
      dtype=object)

In [124]:
found_idx

array([1555,  598,   55, 1661, 2910, 1656,  221, 2354, 2338, 1725])

#### Let's test your TF-IDF on an information retrieval task to see if the results match with when using Scikit-learn library

In [125]:
test_set = {'Trump and Biden': [598, 2299, 595, 2968, 775, 1123, 2953, 1220, 2346, 853], 'Trump Twitter': [598, 2649, 292, 1102, 2308, 196, 1315, 1283, 1034, 1012], 'Elon Musk Trump': [598, 1273, 1656, 1823, 146, 1306, 81, 127, 1188, 1664], 'Political Conflicts': [598, 964, 1598, 621, 2219, 2640, 2377, 455, 1959, 2537], 'University of Misississippi': [598, 2497, 2171, 2744, 682, 1620, 3032, 1007, 1013, 1012], 'Thai Le': [598, 401, 2721, 3032, 1008, 1015, 1014, 1013, 1012, 1011], 'covid-19 is very dangerous': [598, 2736, 521, 1712, 821, 1625, 948, 2835, 168, 253], 'Defense Secretary Will Assess How to Promote More Minorities in Military': [598, 2235, 2557, 2546, 395, 1649, 716, 152, 2195, 1441], 'When Luxury Stores Decorate Their Riot Barricades With Protest Art': [598, 2465, 382, 132, 2392, 2339, 203, 0, 1142, 212]}

In [126]:
for query in test_set:
    print(query)

Trump and Biden
Trump Twitter
Elon Musk Trump
Political Conflicts
University of Misississippi
Thai Le
covid-19 is very dangerous
Defense Secretary Will Assess How to Promote More Minorities in Military
When Luxury Stores Decorate Their Riot Barricades With Protest Art


In [128]:
avg_recall = []
for query in test_set:
    true_answers = set(test_set[query])
    found_idx = set(search(query, 10))
    recall = len(found_idx.intersection(true_answers))/len(true_answers)
    avg_recall.append(recall)
    print("'{}'".format(query), "recall =", recall)
mean_recall = np.mean(avg_recall)
print("Average Recall", mean_recall)

  sim = np.dot(q, v)/(np.linalg.norm(q)* np.linalg.norm(v))


'Trump and Biden' recall = 0.1
'Trump Twitter' recall = 0.2
'Elon Musk Trump' recall = 0.2
'Political Conflicts' recall = 0.7
'University of Misississippi' recall = 0.1
'Thai Le' recall = 0.6
'covid-19 is very dangerous' recall = 0.1
'Defense Secretary Will Assess How to Promote More Minorities in Military' recall = 0.2
'When Luxury Stores Decorate Their Riot Barricades With Protest Art' recall = 0.2
Average Recall 0.2666666666666667
