# Training Document Retrieval

## Load Data

In [None]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
training_corpus = pd.read_csv('drive/My Drive/SMT 2/NLP/Tugas Project/training_corpus_preprocessed.csv')
testing_corpus = pd.read_csv('drive/My Drive/SMT 2/NLP/Tugas Project/testing_corpus_preprocessed.csv')

training_queries = pd.read_csv('drive/My Drive/SMT 2/NLP/Tugas Project/training_queries_preprocessed.csv')
testing_queries = pd.read_csv('drive/My Drive/SMT 2/NLP/Tugas Project/testing_queries_preprocessed.csv')

training_result = pd.read_csv('drive/My Drive/SMT 2/NLP/Tugas Project/training_result.csv')
testing_result = pd.read_csv('drive/My Drive/SMT 2/NLP/Tugas Project/testing_result.csv')

In [None]:
training_corpus.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'url'],inplace=True)
testing_corpus.drop(columns=['vector','Unnamed: 0', 'Unnamed: 0.1', 'url'], inplace=True)

## Remove Punctuation

In [None]:
import re
def punc_remove(sentence):
  punc = re.compile("[^\w\s]")

  test_str = punc.sub(' ',sentence)
  test_str = test_str.split(' ')

  x = ' '.join(test_str)

  return x

In [None]:
from tqdm.auto import tqdm
tqdm.pandas(desc="my bar!")

  from pandas import Panel


In [None]:
training_corpus['cleaned'] = training_corpus['cleaned'].progress_apply(lambda x: punc_remove(x))
testing_corpus['cleaned'] = testing_corpus['cleaned'].progress_apply(lambda x: punc_remove(x))
training_queries['cleaned'] = training_queries['cleaned'].progress_apply(lambda x: punc_remove(x))
testing_queries['cleaned'] = testing_queries['cleaned'].progress_apply(lambda x: punc_remove(x))

HBox(children=(FloatProgress(value=0.0, description='my bar!', max=19505.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='my bar!', max=19570.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='my bar!', max=1000.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='my bar!', max=1000.0, style=ProgressStyle(description_wid…




## Word2Vec

In [None]:
# Combining corpus and queries for training
combined_training=pd.concat([training_corpus.rename(columns={'cleaned':'text'})['text'],\
                             training_queries.rename(columns={'cleaned':'text'})['text']])\
                             .sample(frac=1).reset_index(drop=True)

In [None]:
from gensim.models import Word2Vec

# Creating data for the model training
train_data=[]
for i in combined_training:
    train_data.append(i.split())

# Training a word2vec model from the given data set
w2v_model_skipgram = Word2Vec(train_data, size=300, min_count=2,window=5, sg=1,workers=4)
w2v_model_cbow = Word2Vec(train_data, size=300, min_count=2,window=5, sg=0,workers=4)

In [None]:
import numpy as np

# Function returning vector reperesentation of a document
def get_embedding_w2v_cbow(doc_tokens):
    embeddings = []
    if len(doc_tokens)<1:
        return np.zeros(300)
    else:
        for tok in doc_tokens:
            if tok in w2v_model_cbow.wv.vocab:
                embeddings.append(w2v_model_cbow.wv.word_vec(tok))
            else:
                embeddings.append(np.random.rand(300))
        # mean the vectors of individual words to get the vector of the document
        return np.mean(embeddings, axis=0)

In [None]:
import numpy as np

# Function returning vector reperesentation of a document
def get_embedding_w2v_skipgram(doc_tokens):
    embeddings = []
    if len(doc_tokens)<1:
        return np.zeros(300)
    else:
        for tok in doc_tokens:
            if tok in w2v_model_skipgram.wv.vocab:
                embeddings.append(w2v_model_skipgram.wv.word_vec(tok))
            else:
                embeddings.append(np.random.rand(300))
        # mean the vectors of individual words to get the vector of the document
        return np.mean(embeddings, axis=0)

In [None]:
# Getting Word2Vec Vectors for Testing Corpus and Queries
testing_corpus['vector_cbow']=testing_corpus['cleaned'].apply(lambda x :get_embedding_w2v_cbow(x.split()))
testing_queries['vector_cbow']=testing_queries['cleaned'].apply(lambda x :get_embedding_w2v_cbow(x.split()))

# Getting Word2Vec Vectors for Testing Corpus and Queries
testing_corpus['vector_skipgram']=testing_corpus['cleaned'].apply(lambda x :get_embedding_w2v_skipgram(x.split()))
testing_queries['vector_skipgram']=testing_queries['cleaned'].apply(lambda x :get_embedding_w2v_skipgram(x.split()))

## Accuracy CBOW

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Function for calculating average precision for a query
def average_precision_cbow(qid,qvector):
  
  # Getting the ground truth and document vectors
  qresult=testing_result.loc[testing_result['qid']==qid,['docid','rel']]
  qcorpus=testing_corpus.loc[testing_corpus['docid'].isin(qresult['docid']),['docid','vector_cbow']]
  qresult=pd.merge(qresult,qcorpus,on='docid')
  
  # Ranking documents for the query
  qresult['similarity']=qresult['vector_cbow'].apply(lambda x: cosine_similarity(np.array(qvector).reshape(1, -1),np.array(x).reshape(1, -1)).item())
  qresult.sort_values(by='similarity',ascending=False,inplace=True)

  # Taking Top 10 documents for the evaluation
  ranking=qresult.head(10)['rel'].values
  
  # Calculating precision
  precision=[]
  for i in range(1,11):
    if ranking[i-1]:
      precision.append(np.sum(ranking[:i])/i)
  
  # If no relevant document in list then return 0
  if precision==[]:
    return 0

  return np.mean(precision)

# Calculating average precision for all queries in the test set
testing_queries['AP']=testing_queries.apply(lambda x: average_precision_cbow(x['qid'],x['vector_cbow']),axis=1)

# Finding Mean Average Precision
print('Mean Average Precision=>',testing_queries['AP'].mean())

Mean Average Precision=> 0.7761172806437401


## Accuracy SkipGram

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Function for calculating average precision for a query
def average_precision_skipgram(qid,qvector):
  
  # Getting the ground truth and document vectors
  qresult=testing_result.loc[testing_result['qid']==qid,['docid','rel']]
  qcorpus=testing_corpus.loc[testing_corpus['docid'].isin(qresult['docid']),['docid','vector_skipgram']]
  qresult=pd.merge(qresult,qcorpus,on='docid')
  
  # Ranking documents for the query
  qresult['similarity']=qresult['vector_skipgram'].apply(lambda x: cosine_similarity(np.array(qvector).reshape(1, -1),np.array(x).reshape(1, -1)).item())
  qresult.sort_values(by='similarity',ascending=False,inplace=True)

  # Taking Top 10 documents for the evaluation
  ranking=qresult.head(10)['rel'].values
  
  # Calculating precision
  precision=[]
  for i in range(1,11):
    if ranking[i-1]:
      precision.append(np.sum(ranking[:i])/i)
  
  # If no relevant document in list then return 0
  if precision==[]:
    return 0

  return np.mean(precision)

# Calculating average precision for all queries in the test set
testing_queries['AP']=testing_queries.apply(lambda x: average_precision_skipgram(x['qid'],x['vector_skipgram']),axis=1)

# Finding Mean Average Precision
print('Mean Average Precision=>',testing_queries['AP'].mean())

Mean Average Precision=> 0.8022947727702198


## Save Model

In [18]:
w2v_model_skipgram.save('drive/My Drive/SMT 2/NLP/Tugas Project/w3v_skipgram.model')
w2v_model_cbow.save('drive/My Drive/SMT 2/NLP/Tugas Project/w3v_cbow.model')