<a href="https://colab.research.google.com/github/nahbos/Advanced-Information-Retrieval/blob/main/Ex01/traditional_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Sobhan Moradian Daghigh

- 11-22-2022

### Ex-01: Traditional methods (WarmUp)

In [85]:
import numpy as np
import pandas as pd
import scipy
import re
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
from collections import Counter
import random
import pickle
from tqdm import tqdm

In [2]:
!wget -nc https://raw.githubusercontent.com/nahbos/Advanced-Information-Retrieval/main/Ex01/Data/train_data.csv
!wget -nc https://raw.githubusercontent.com/nahbos/Advanced-Information-Retrieval/main/Ex01/Data/valid_data.csv
!wget -nc https://raw.githubusercontent.com/nahbos/Advanced-Information-Retrieval/main/Ex01/Data/test_data.csv

--2022-11-24 02:43:40--  https://raw.githubusercontent.com/nahbos/Advanced-Information-Retrieval/main/Ex01/Data/train_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5148485 (4.9M) [text/plain]
Saving to: ‘train_data.csv’


2022-11-24 02:43:42 (49.9 MB/s) - ‘train_data.csv’ saved [5148485/5148485]

--2022-11-24 02:43:42--  https://raw.githubusercontent.com/nahbos/Advanced-Information-Retrieval/main/Ex01/Data/valid_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 144317 (141K) [text/plain]
Saving to: ‘valid_data.cs

# Part One. 
* Data Loading

In [3]:
train = pd.read_csv('./train_data.csv')
val   = pd.read_csv('./valid_data.csv')
test  = pd.read_csv('./test_data.csv')

In [4]:
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,14,29,30,What are the laws to change your status from a...,What are the laws to change your status from a...,0
1,18,37,38,Why are so many Quora users posting questions ...,Why do people ask Quora questions which can be...,1
2,38,77,78,How do we prepare for UPSC?,How do I prepare for civil service?,1
3,58,117,118,I was suddenly logged off Gmail. I can't remem...,I can't remember my Gmail password or my recov...,1
4,60,121,122,How do I download content from a kickass torre...,Is Kickass Torrents trustworthy?,0


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37250 entries, 0 to 37249
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            37250 non-null  int64 
 1   qid1          37250 non-null  int64 
 2   qid2          37250 non-null  int64 
 3   question1     37250 non-null  object
 4   question2     37250 non-null  object
 5   is_duplicate  37250 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 1.7+ MB


In [6]:
val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1010 non-null   int64 
 1   qid1          1010 non-null   int64 
 2   qid2          1010 non-null   int64 
 3   question1     1010 non-null   object
 4   question2     1010 non-null   object
 5   is_duplicate  1010 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 47.5+ KB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            980 non-null    int64 
 1   qid1          980 non-null    int64 
 2   qid2          980 non-null    int64 
 3   question1     980 non-null    object
 4   question2     980 non-null    object
 5   is_duplicate  980 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 46.1+ KB


## Preprocess

In [8]:
def preprocess(q):
    
    q = str(q).lower().strip()
    
    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')
    
    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')
    
    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)
    
    # Decontracting words
    # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
    # https://stackoverflow.com/a/19794953

    contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")
    
    # Removing HTML tags
    q = BeautifulSoup(q)
    q = q.get_text()
    
    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()

    
    return q

In [9]:
train['question1'] = train['question1'].apply(preprocess)
train['question2'] = train['question2'].apply(preprocess)

val['question1'] = val['question1'].apply(preprocess)
val['question2'] = val['question2'].apply(preprocess)

test['question1'] = test['question1'].apply(preprocess)
test['question2'] = test['question2'].apply(preprocess)

**Ok, Everything looks right ))**

# Part Two.
* Vector Space Retrieval

In [10]:
dataset = train

# Dictionary length
#     - with    stop words: 9335
#     - without stop words: 9284

tokenized_qs = [simple_preprocess(remove_stopwords(q)) for q in dataset.loc[:, 'question2']]
dct = Dictionary(tokenized_qs)  # fit dictionary
corpus = [dct.doc2bow(tokenized_q) for tokenized_q in tokenized_qs]  # convert corpus to BoW format

In [11]:
model = TfidfModel(corpus)        # fit model
tfidf_vector = model[corpus]      # apply model to the all corpus document

for question in tfidf_vector[:20]:
   print([[dct[id], round(freq, 2)] for id, freq in question])

[['card', 0.24], ['change', 0.23], ['compare', 0.16], ['green', 0.24], ['immigration', 0.38], ['japan', 0.28], ['laws', 0.59], ['status', 0.29], ['student', 0.26], ['visa', 0.3]]
[['answered', 0.51], ['ask', 0.38], ['easily', 0.42], ['google', 0.4], ['people', 0.28], ['questions', 0.33], ['quora', 0.26]]
[['civil', 0.63], ['prepare', 0.5], ['service', 0.59]]
[['email', 0.34], ['gmail', 0.33], ['mail', 0.43], ['password', 0.32], ['recover', 0.38], ['recovery', 0.4], ['remember', 0.42]]
[['kickass', 0.55], ['torrents', 0.5], ['trustworthy', 0.67]]
[['bad', 0.43], ['book', 0.36], ['new', 0.26], ['rowling', 0.79]]
[['english', 0.37], ['fluently', 0.63], ['learn', 0.43], ['speak', 0.54]]
[['actually', 0.55], ['life', 0.69], ['purpose', 0.47]]
[['compare', 0.2], ['cambodia', 0.29], ['earthquake', 0.58], ['effects', 0.53], ['major', 0.26], ['valparaiso', 0.44]]
[['india', 0.3], ['nuclear', 0.71], ['pakistan', 0.49], ['war', 0.41]]
[['ask', 0.36], ['getting', 0.51], ['improve', 0.3], ['marked'

**Since the Gensim dosent support for max_features, so for rest of the implementation, Im gonna use sklearn instead.**


In [12]:
tr_vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')
tfidf_matrix_train = tr_vectorizer.fit_transform(train.loc[:, 'question2'])
print(tfidf_matrix_train.shape)

(37250, 2000)


In [13]:
ts_vectorizer = TfidfVectorizer(vocabulary=tr_vectorizer.vocabulary_, stop_words='english')
tfidf_matrix_test = ts_vectorizer.fit_transform(test.loc[:, 'question1'])
print(tfidf_matrix_test.shape)

(980, 2000)


In [14]:
similarity = cosine_similarity(tfidf_matrix_test, tfidf_matrix_train)
similarity.shape

(980, 37250)

In [15]:
def get_similar_questions(test_data, train_data, similarity_matrix, n_sim=10, samples=None):
  for i, test_q in enumerate(similarity_matrix):
    if samples is None or i in samples:
      check_duplicated_qs = []
      bests = np.argsort(test_q.tolist())[::-1]
      print('\n-', test_data.loc[i, 'question1'])
      for best in bests:
        q = train_data.loc[best, 'question2']
        if q not in check_duplicated_qs:
          print('  > ', q)
          check_duplicated_qs.append(q)
          if len(check_duplicated_qs) == n_sim:
            break

In [16]:
random.seed(2)
random_tests = random.sample(range(0, len(test)), 5)
get_similar_questions(test, train, similarity, samples=random_tests)


- who will win the election in united states
  >  who will win the 2016 united states presidential election  trump or clinton
  >  who is the coolest first lady of the united states
  >  who will win the us election
  >  who will win uttar pradesh election
  >  who should be the next president of the united states
  >  who do you think will win the u s  election in november
  >  who will win the us election in 2016
  >  does the president of the united states have a food taster
  >  who will win 2017 uttar pradesh election and why
  >  who will win up 2017 election

- how do i recover my gmail account when it does not open after password reset
  >  how can i recover my gmail account s password
  >  how do you recover your gmail account password
  >  how do i reset my gmail account password
  >  how can i reset the password for my gmail account
  >  how do i recover my gmail password
  >  how can you recover your gmail password
  >  how do i reset my gmail password
  >  how do i reset 

# Part Three.
* Language Model Retrieval

(N / (N + mu)) * (TFw,D / |D|) 

\+ (mu / (N + mu)) * (CFw / |c|)


In [81]:
def probability_word_given_doc(word, doc, collection, collection_length, collection_freq, _lambda=10):
    doc_tf = doc.count(word)
    document_length = max(len(doc.split()), 1)
    collection_tf = collection_freq.get(word)

    return np.add(np.multiply(np.divide(document_length, np.add(document_length, _lambda)), 
                              np.divide(doc_tf, document_length)), 
                  np.multiply(np.divide(_lambda, np.add(document_length, _lambda)), 
                              np.divide(collection_tf, collection_length)))

In [84]:
tokenized_qs_train = [simple_preprocess(remove_stopwords(q)) for q in train.loc[:, 'question2']]
tokenized_qs_val   = [simple_preprocess(remove_stopwords(q)) for q in   val.loc[:, 'question1']]

collection = ' '.join(' '.join(ele) for ele in tokenized_qs_train)  # Stop words removed
collection_freq = dict(Counter(collection.split()))
collection_length = len(collection_freq)

for i, val_q in enumerate(tokenized_qs_val[:3]):
    scores = []
    for train_q in tqdm(tokenized_qs_train):
        prob = 1
        for token in val_q:
            prob = np.multiply(prob, probability_word_given_doc(word=token, doc=' '.join(train_q), collection=collection, collection_length=collection_length, collection_freq=collection_freq))
        scores.append(prob)

    print('\nValidation: ', val.loc[i, 'question1'])
    print('Train:      ', train.loc[:, 'question2'][np.argsort(scores)[::-1][0]])
    print()

100%|██████████| 37250/37250 [00:09<00:00, 3986.42it/s]



Validation:  realistically speaking  what would happen to the usa if donald trump wins presidency in the 2016 elections
Train:       what will happen to the superpower status of the usa  if donald trump wins the 2016 presidential elections



100%|██████████| 37250/37250 [00:02<00:00, 14413.15it/s]



Validation:  does global warming exist
Train:       was global warming replaced by climate change because they found there was no global warming



100%|██████████| 37250/37250 [00:02<00:00, 14129.09it/s]


Validation:  how do i make india as corruption free
Train:       what i can do for corruption free india






# Part Four.
* Evaluation Metrics

### Finito