In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm.notebook as tqdm
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from collections import OrderedDict
from nltk.corpus import stopwords

In [9]:
df_train = pd.read_csv('./quora_train.csv')
df_test = pd.read_csv('./quora_test.csv')

In [10]:
corpus = list(df_test['question2'].values) + \
         list(df_train['question1'].values) +\
         list(df_train['question2'].values)\
                    
queries = list(df_test['question1'].values)
answers = list(df_test['question2'].values)
answers_indices = list(range(len(df_test)))

In [11]:
from collections import Counter
## THE REST OF YOUR CODE GOES HERE
remove_stopwords = False
use_lemmatization = False
l2_normalize_tf_idf = False
lemmatizer = WordNetLemmatizer()
## THE REST OF YOUR CODE GOES HERE
def calculate_similarity(q, v):
    sim = np.dot(q, v)/(np.linalg.norm(q)* np.linalg.norm(v))
    
    return sim
## THE REST OF YOUR CODE GOES HERE
def tokenize_doc(sent, lemma=False, remove_stopwords=False):
    # a simple tokenizer with case folding and an option to use lemmatization or remove stopwords
    sent = sent.lower()
    tokens = sent.split()
    if lemma:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    if remove_stopwords:
        tokens = [token for token in tokens if token not in stopwords.words('english')]
        
    return tokens
## THE REST OF YOUR CODE GOES HERE
def basic_text_processing(corpus):
    # This function will go through the corpus and outputs two components
    # w2i: the mapping of a vocabular to in index. This is also our vocabulary
    # docs_in_tokens: list of extracted tokens for each document
    vocab = set()
    docs_in_tokens = []
    for doc in corpus:
        tokens = tokenize_doc(doc, lemma = use_lemmatization, remove_stopwords=remove_stopwords)
        vocab.update(set(tokens))
        docs_in_tokens.append(tokens)
    vocab = list(vocab)
    vocab.sort()
    w2i = OrderedDict()
    for i, word in enumerate(vocab):
        w2i[word] = i
    
    return w2i, docs_in_tokens
## THE REST OF YOUR CODE GOES HERE
def calculate_idf(docs_in_tokens, w2i):
    # TASK: given the list of tokens for each document (docs_in_tokens) and the vocabulary (w2i),
    # you are asked to calculate the inverse document frequency (IDF) of each word using the formulation
    # log10(N/(df+1))
    # RETURN: all_idf vector (or a column) contains all the IDF of all words in the vocabulary
    
    all_idf = [] #initialize the list of all idf values
    
    # TODO:
    N = len(docs_in_tokens)
    w_fre = [Counter(doc) for doc in docs_in_tokens]
    for w in w2i:
        idf = 0
        for i,doc in enumerate(docs_in_tokens):
            if w_fre[i][w] > 0:
                idf += 1
        idf = np.log10(idf)
        all_idf.append(idf)
    
    return np.array(all_idf)
## THE REST OF YOUR CODE GOES HERE
def calculate_tf(docs_in_tokens, w2i):
    # TASK: given the list of tokens for each document (docs_in_tokens) and the vocabulary (w2i),
    # you are asked to calculate the term-frequency table or matrix using the formulation:
    # tf = log10(frequency+1)
    # RETURN: tf_matrix as the term-frequency table
    
    tf_matrix = np.zeros((len(w2i), len(docs_in_tokens))) #initialize as a matrix/table of all zeros
    
    # TODO
    N = len(docs_in_tokens)
    w_fre = [Counter(doc) for doc in docs_in_tokens]
    for w in w2i:
        for i,doc in enumerate(docs_in_tokens):
            tf = w_fre[i][w]
            tf_matrix[w2i[w],i] = np.log10(tf+1)
    return tf_matrix
## THE REST OF YOUR CODE GOES HERE
def transform(query, w2i, all_idf):
    # TASK: given a string query, you are asked to utilize the extracted vocabulary (w2i)
    # and idf value for each word to transform a query into a respective tf-idf vector
    # RETURN: tf_idf_query
    
    tf_idf_query = []
    
    #TODO
    q = [tokenize_doc(query, True, remove_stopwords=False)]
    tf_query = calculate_tf(q, w2i)
    #idf_query = calculate_idf(query, w2i)
    tf_idf_query = (tf_query*all_idf.reshape(-1,1)).reshape(-1,)
    #tf_idf_query = tf_idf_query.reshape(-1,)
    return np.array(tf_idf_query)
## THE REST OF YOUR CODE GOES HERE
## THE REST OF YOUR CODE GOES HERE

In [12]:
w2i, docs_in_tokens = basic_text_processing(corpus)
assert len(docs_in_tokens) == len(corpus)

In [13]:
len(docs_in_tokens), len(w2i)

(26517, 17127)

In [14]:
all_idf = calculate_idf(docs_in_tokens, w2i)
assert len(all_idf) == len(w2i)

In [15]:
tf_matrix = calculate_tf(docs_in_tokens, w2i)
assert tf_matrix.shape == (len(w2i), len(docs_in_tokens))

In [16]:
tf_idf = tf_matrix * all_idf.reshape(-1,1)

In [17]:
tf_idf.shape, tf_matrix.shape

((17127, 26517), (17127, 26517))

---

## Evaluation via Information Retrieval

In [18]:
def search(query, k):
    q = transform(query, w2i, all_idf)
    sims = []
    for i in range(tf_idf.shape[1]):
        v = tf_idf[:,i].reshape(-1,)
        sim = np.dot(q, v)/(np.linalg.norm(q)* np.linalg.norm(v))
        sims.append(sim)
    idx = np.argsort(sims)[::-1]
    return idx[:k]

In [19]:
query = "Trump and Biden"
found_idx = search(query, 10)
[corpus[i] for i in found_idx]

['Were George Reeves and Christopher Reeves related?',
 'How and why did trump win?',
 'Who won the second presidential debate between Trump and Hilary?',
 'Differences between Roman and Greek arts?',
 'How do concentric and eccentric contraction compare and contrast?',
 'How do eccentric and concentric contractions compare and contrast?',
 'Is this dress blue and black, or white and gold?',
 'Difference between criteria and criterion?',
 'Are peanut butter and jelly sandwiches healthy?',
 'Who won the 2016 September 26th presidential debate between Trump and Hillary?']

In [20]:
test_set = queries

In [21]:
avg_acc = []
rng = np.random.default_rng(seed=7)
idx = rng.choice(len(test_set), 100)
for i in tqdm.tqdm(idx):
    query = test_set[i]
    found_idx = set(search(query, 20))
    acc = i in found_idx
    avg_acc.append(acc)
    print("'{}'".format(query), "Found =", bool(acc))
avg_acc = np.mean(avg_acc)
print("Average Accuracy@20", avg_acc)

  0%|          | 0/100 [00:00<?, ?it/s]

'What is the QuickBooks Hosting Support Number?' Found = False
'Who will win the US elections 2016?' Found = False
'Who viewed my profile on Instagram?' Found = False
'How can I hack my phone?' Found = False
'What is the most popular cartoon character in the world?' Found = False
'What is/are your most horrifying stories about ghost or any paranormal activities you ever faced?' Found = False
'Does long distance relationship works?' Found = True
'All biases aside, at this point in time, who do you think will win the presidential election?' Found = False
'How ca n I improve my communication skill?' Found = False
'What is this GST bill all about?' Found = True
'Which book is best for data structures and algorithms for beginners?' Found = False
'What is difference between acid-base indicator and metal ion indicator?' Found = False
'Should India declare war against Pakistan?' Found = False
'What is your review of Passengers (movie) starring Jennifer Lawrence and Chris Pratt?' Found = True
'