In [4]:
import numpy as np
import pandas as pd
import pickle
import MicroTokenizer
import os
from tqdm.notebook import tqdm
from rank_bm25 import BM25Okapi, BM25Plus
from pythainlp.tokenize import word_tokenize 
from sklearn.preprocessing import OneHotEncoder


In [5]:
def bm25_encode(contexts,language):
    if language == 'zh':
        context_split = [*map(MicroTokenizer.cut,contexts)]
    elif language == 'th':
        context_split = [word_tokenize(x, engine="newmm") for x in contexts]
    else:
        context_split = [x.split(' ') for x in contexts]
        
    context_encoded = BM25Okapi(context_split)
    return context_encoded

def bm25_scoring(question,context_encoded,language):
    if language == 'zh':
        tokenized_query = MicroTokenizer.cut(question)
    elif language == 'th':
        tokenized_query = word_tokenize(question, engine="newmm")
    else:
        tokenized_query = question.split(' ')
        
    ranking = context_encoded.get_scores(tokenized_query)
    return np.argsort(ranking)[::-1]

In [6]:
def get_score_para(queries,query_id,paras,para_id,bm25_para_encoded,lan,para_df_f):
    triplet_datasets = []
    for idx,q in tqdm(enumerate(queries)):
        index_sorted = bm25_scoring(q,bm25_para_encoded,lan)
        para_answer = para_df_f[para_df_f['para_id'] == query_id[idx]].para.values.tolist()
        count = 0
        for index in index_sorted[top_start:]:
            para_neg = paras[index]
            if count == top_end:
                break
            if para_neg not in para_answer:
                for p_a in para_answer:
                    triplet_datasets.append([q,p_a,para_neg,query_id[idx],para_id[index]])
                count+=1
    return triplet_datasets

In [7]:
def get_score_para_contrastive(queries,query_id,paras,para_id,bm25_para_encoded,lan,para_df_f):
    triplet_datasets = []
    for idx,q in tqdm(enumerate(queries)):
        index_sorted = bm25_scoring(q,bm25_para_encoded,lan)
        para_answer = para_df_f[para_df_f['para_id'] == query_id[idx]].para.values.tolist()
        count = 0
        for index in index_sorted[top_start:]:
            para_neg = paras[index]
            if count == top_end:
                break
            if para_neg not in para_answer:
                for p_a in para_answer:
                    triplet_datasets.append([q,p_a,para_neg,query_id[idx],para_id[index]])
                count+=1
    return triplet_datasets

In [8]:
def get_score_para_aligned(queries,query_id_doc,query_id_para,paras,para_id,bm25_para_encoded,lan,para_df_f):
    triplet_datasets = []
    for idx,q in tqdm(enumerate(queries)):
        index_sorted = bm25_scoring(q,bm25_para_encoded,lan)
        para_answers = para_df_f[para_df_f['doc_id'] == query_id_doc[idx]].para.values.tolist()
        para_item = para_df_f[para_df_f['para_id'] == query_id_para[idx]].para.values.tolist()
        if len(para_item) > 1:
            raise Exception(f'Error at query: {para_item}')
        count = 0
        for index in index_sorted[top_start:]:
            para_neg = paras[index]
            if count == top_end:
                break
            if para_neg not in para_answers:
                for p_a in para_item:
                    triplet_datasets.append([q,p_a,para_neg])
                count+=1
    return triplet_datasets

In [9]:
def get_score_para_doc(queries,query_id,paras,para_id,bm25_para_encoded,lan):
    triplet_datasets = []
    for idx,q in tqdm(enumerate(queries)):
        index_sorted = bm25_scoring(q,bm25_para_encoded,lan)
        count = 0
        for index in index_sorted[:]:
            para_neg = paras[index]
            if count == top_end:
                break
            if para_neg not in paras[idx]:
                triplet_datasets.append([q,paras[idx],para_neg])
                count+=1
    return triplet_datasets

def get_score_doc(queries,query_id,docs,docs_id,bm25_doc_encoded,lan):
    triplet_datasets = []
    for idx,q in enumerate(queries):
        index_sorted = bm25_scoring(q,bm25_doc_encoded,lan)
        doc_answer_idx = docs_id.index(query_id[idx])
        doc_answer = docs[doc_answer_idx]
        count = 0
        for index in index_sorted[:]:
            if count == top_end:
                break
            if index != doc_answer_idx:
                triplet_datasets.append([q,doc_answer,docs[index]])
                count+=1
    return triplet_datasets

In [10]:
def paragraph_splited(doc,doc_id):
    splited = []
    data_frame = []
    for idx,d in enumerate(doc):
        for d_p in d.split('\n'):
            if d_p != '':
                data_frame.append([doc_id[idx],d_p])
                splited.append(d_p)
    data_frame = pd.DataFrame(data_frame ,columns =['para_id', 'para'])
    return splited,data_frame

In [11]:
import string
exclude = set(string.punctuation)
def clean_text(text):
    text = ''.join(ch for ch in text if ch not in exclude)
    text = text.lower()
    return text

In [16]:
mode = 'train'
q_lan = ['en','ar','de','es','hi','vi','zh','ru','bn','fi','ja','ko','te','fr','th','el','ro','tr']
corpus = 'XORQA'

path = f'data_preprocess/{corpus}/{mode}/'
top_start = 1  
top_end = 16

df_question = {}
df_paragraph = {}
df_doc = {}
df_question_og = {}
lan_now = []

for lan in q_lan:
    try:
        question_temp = pd.read_csv(f'data_preprocess/{corpus}/{mode}/{corpus.lower()}_question_en-{lan}_en.csv')
        question_temp_2 = pd.read_csv(f'data_preprocess/{corpus}/{mode}/{corpus.lower()}_question_en-{lan}.csv')
        doc_temp = pd.read_csv(f'data_preprocess/{corpus}/{mode}/{corpus.lower()}_doc_en-{lan}.csv')
        question_temp = question_temp.dropna()

        df_question.update({
            f'en-{lan}': question_temp
        })
        df_question_og.update({
            f'en-{lan}': question_temp_2
        })
        
#         df_paragraph.update({
#             f'en-{lan}':pd.read_csv(f'data_preprocess/{corpus}/{mode}/{corpus.lower()}_para_en-{lan}.csv')
#         })

        df_doc.update({
            f'en-{lan}': doc_temp
        })
        lan_now.append(lan)
    except:
        pass
    
q_lan = lan_now

['ru', 'bn', 'fi', 'ja', 'ko', 'te']

# Aligned

In [29]:
top_start = 1  
top_end = 3
context_mode = 'para_aligned'
for idx,lan in enumerate(q_lan[:1]):
    print(f'Lan:{idx+1}/{len(q_lan)}')
    question_id_doc = df_question[f'en-{lan}']['doc_id'].to_list()
    question_id_para = df_question[f'en-{lan}']['paragraph_id'].to_list()
    questions = df_question[f'en-{lan}']['question'].to_list()

    para_raw = df_paragraph[f'en-{lan}']['para'].to_list()

    para_encoded = bm25_encode(para_raw,lan)

    trip_data = get_score_para_aligned(questions,question_id_doc,question_id_para,para_raw,df_paragraph[f'en-{lan}']['doc_id'].to_list(),para_encoded,lan,df_paragraph[f'en-{lan}'])
    df = pd.DataFrame(trip_data,columns=['question','anchor','negative'])
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    if not os.path.exists(f'{path}/triplet'):
        os.makedirs(f'{path}/triplet')
    df.to_csv(f'{path}/triplet/triplet_en-{lan}_top{top_start}-{top_end}_{context_mode}_new_no_same_doc.csv',index=False)


Lan:1/12


KeyError: 'paragraph_id'

# Not aligned

In [38]:
top_start = 1  
top_end = 3
context_mode = 'para'

for idx,lan in enumerate(q_lan[:]):
    print(f'Lan:{idx+1}/{len(q_lan)}')
    q_no_dup = df_question[f'en-{lan}'].drop_duplicates(subset=['question'])
    
    question_id = q_no_dup['doc_id'].to_list()
    questions = q_no_dup['question'].to_list()
    questions = list(map(clean_text,questions))
    
    doc_context_id = df_doc[f'en-{lan}']['doc_id'].to_list()
    doc_context_raw = df_doc[f'en-{lan}']['doc'].to_list()
    doc_context_raw = list(map(clean_text,doc_context_raw))
    
    para_split,para_df = paragraph_splited(doc_context_raw,doc_context_id)

    para_encoded = bm25_encode(para_split,lan)

    trip_data = get_score_para(questions,question_id,para_split,para_df['para_id'].to_list(),para_encoded,lan,para_df)
    df = pd.DataFrame(trip_data,columns=['anchor','positive','negative','a_p_id','n_id'])
    ne_df = df_question_og[f"en-{lan}"]
    df_concat = pd.merge(df,fi_df,left_on='a_p_id',right_on='doc_id')
    df_concat = df_concat.drop(columns=['anchor'])
    df_concat = df_concat.rename(columns={"question": "anchor"})
    
    df = df_concat[['anchor','positive','negative']]
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    if not os.path.exists(f'{path}/triplet'):
        os.makedirs(f'{path}/triplet')
    df.to_csv(f'{path}/triplet/triplet_en-{lan}_top{top_start}-{top_end}_{context_mode}_new.csv',index=False)


Lan:1/4


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


Lan:2/4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Lan:3/4


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


Lan:4/4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
