# Data analysis

In [1]:
import pandas as pd
import re
import numpy as np

from collections import Counter

In [2]:
from rank_bm25 import BM25Okapi
import nltk
from nltk.stem.porter import PorterStemmer

# nltk.download('punkt')
# nltk.download('stopwords')

def stem_tokenize(text, remove_stopwords=True):
    """Tokenize ``text`` with NLTK and return the Porter stem of every token.

    Args:
        text: Raw string. It is split into sentences with ``nltk.sent_tokenize``
            and each sentence into words with ``nltk.word_tokenize``.
        remove_stopwords: When true (the default), tokens that appear in NLTK's
            English stop-word list are dropped before stemming. The comparison
            is case-sensitive and is made on the unstemmed token.

    Returns:
        list: The stemmed tokens, in their original order.
    """
    stemmer = PorterStemmer()
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    if remove_stopwords:
        # Read the stop-word list once per call (it used to be re-read for
        # every single token) and use a set for O(1) membership tests.
        stopwords = set(nltk.corpus.stopwords.words('english'))
        tokens = [word for word in tokens if word not in stopwords]
    return [stemmer.stem(word) for word in tokens]

In [3]:
def load_df(filepath):
    """Read a tab-separated topic file and return a cleaned DataFrame.

    Rows are de-duplicated on the (topic_id, question_id) pair keeping the
    first occurrence, the index is renumbered from 0, and every missing value
    is replaced by the empty string.
    """
    raw = pd.read_csv(filepath, sep='\t')
    deduped = raw.drop_duplicates(['topic_id', 'question_id'])
    return deduped.reset_index(drop=True).fillna('')

In [4]:
# Train/dev topic files go through load_df (de-duplicated on topic/question, NaN -> '').
train_df = load_df('../data/train.tsv')
dev_df = load_df('../data/dev.tsv')
# The question bank is read directly; rows containing any missing value are dropped here.
question_bank = pd.read_csv('../data/question_bank.tsv', sep='\t').dropna()
# Train and dev rows stacked into a single frame (each keeps its own index values).
all_df = pd.concat([train_df, dev_df])

In [None]:
# Inverted index: stemmed token -> list of bank questions containing it
# (a question is appended once per occurrence of the token).
word2question = {}
for question_text in question_bank['question'].values:
    for stem in stem_tokenize(question_text):
        word2question.setdefault(stem, []).append(question_text)

In [6]:
def see_question_infos(topic_df):
    """List the questions that occur more than once, most frequent first.

    Counts the non-null ``topic_id`` values per distinct ``question`` and
    returns ``(question, count)`` pairs for every count above 1, ordered by
    descending count (ties stay in the grouped question order).
    """
    per_question = topic_df[['topic_id', 'question']].groupby(by=['question']).count()['topic_id']
    repeated = [(question, n) for question, n in per_question.items() if n > 1]
    repeated.sort(key=lambda pair: pair[1], reverse=True)
    return repeated
    

In [11]:
train_infos = see_question_infos(train_df)
print(len(train_df['topic_id'].unique()))
print(train_infos)

187
[('', 159), ('are you looking for a specific web site', 18), ('are you looking for a specific web page', 10), ('are you interested in a specific web page', 4), ('are you looking for a specific website', 2), ('do you want a map', 2), ('do you want a phone number to call', 2), ('do you want the address', 2), ('do you want to know the address', 2), ('do you want to know their hours of operation', 2), ('is there any specific web page on your mind', 2), ('is this an emergency', 2), ('what is your price range', 2)]


In [10]:
dev_infos = see_question_infos(dev_df)
print(len(dev_df['topic_id'].unique()))
print(dev_infos)

50
[('', 39), ('are you looking for a specific web page', 3), ('are you looking for a specific web site', 3), ('would you like to buy a book about this topic', 2)]


In [12]:
all_infos = see_question_infos(pd.concat([train_df, dev_df]))
print(all_infos)

[('', 198), ('are you looking for a specific web site', 21), ('are you looking for a specific web page', 13), ('are you interested in a specific web page', 4), ('would you like to buy a book about this topic', 3), ('are you looking for a specific movie', 2), ('are you looking for a specific website', 2), ('are you looking for any specific photos', 2), ('are you referring to a software', 2), ('do you want a map', 2), ('do you want a phone number to call', 2), ('do you want the address', 2), ('do you want to know the address', 2), ('do you want to know their hours of operation', 2), ('is there any specific web page on your mind', 2), ('is this an emergency', 2), ('what is your price range', 2)]


In [39]:
# Dev questions that also occur in train:
# question -> [number of dev rows, [initial requests of those rows]]

trainq = set(train_df['question'])
devq2cnt = {}
for request, question in dev_df[['initial_request', 'question']].values:
    if question not in trainq:
        continue
    entry = devq2cnt.setdefault(question, [0, []])
    entry[0] += 1
    entry[1].append(request)
sorted(devq2cnt.items(), key=lambda item: item[1][0], reverse=True)

In [54]:
normal_words = ['software', 'movie', 'web', 'page', 'website', 'webpage', 'phone', 'photo', 'book', 'address']

In [56]:
# Gather every indexed question that contains the stem of one of the generic words.
normal_questions = [
    question
    for word in normal_words
    for question in word2question.get(stem_tokenize(word)[0], [])
]

In [57]:
len(normal_questions)

202

In [71]:
normal_questions.extend([_q for _q, _c in all_infos])

In [72]:
normal_questions = set(normal_questions)

In [73]:
len(normal_questions)

190

In [80]:
normal_qids = set(question_bank[question_bank['question'].isin(normal_questions)]['question_id'].values.tolist())

# build bm25

In [5]:
# Re-read the question bank (NaN -> '') and attach the stemmed tokens of each
# question, both as a list and as a space-joined string.
question_bank = pd.read_csv('../data/question_bank.tsv', sep='\t').fillna('')

question_tokens = question_bank['question'].map(stem_tokenize)
question_bank['tokenized_question_list'] = question_tokens
question_bank['tokenized_question_str'] = question_tokens.map(' '.join)


In [6]:
# Enrich every bank question with the stemmed tokens of the training rows
# (initial request, answer, topic description) that reference the same question_id.
added_cnames = ['initial_request', 'answer', 'topic_desc']
# Group the training rows once instead of re-filtering train_df for every bank question.
train_rows_by_qid = {qid: rows for qid, rows in train_df.groupby('question_id')}
added_tokens = []
for qid in question_bank['question_id'].values:
    words = set()
    rows = train_rows_by_qid.get(qid)
    if rows is not None:
        for cname in added_cnames:
            for ir in rows[cname].unique():
                words.update(stem_tokenize(ir))
    # Sorted so the token order (and therefore all_token_str) is identical on
    # every run; iterating a set of strings depends on the interpreter's hash seed.
    added_tokens.append(sorted(words))
question_bank['tokens_from_train'] = added_tokens
question_bank['all_tokens'] = question_bank['tokenized_question_list'] + question_bank['tokens_from_train']
question_bank['all_token_str'] = question_bank['all_tokens'].map(lambda x: ' '.join(x))

# BM25 index over the question tokens plus the tokens harvested from train_df.
# Document i of the corpus is row i of question_bank.
bm25_corpus = question_bank['all_tokens'].tolist()
bm25 = BM25Okapi(bm25_corpus)

In [7]:
bm25_corpus_onlyq = question_bank['tokenized_question_list'].tolist()
bm25_onlyq = BM25Okapi(bm25_corpus_onlyq)

In [43]:
# Number of train-derived tokens per bank question; questions with none are
# the ones that received no tokens from train_df.
question_bank['tokens_from_train_len'] = question_bank['tokens_from_train'].apply(len)
notin_train = question_bank[question_bank['tokens_from_train_len'] == 0]['question_id'].values.tolist()
# Order those ids by the character length of their space-joined stemmed question (shortest first).
# NOTE: the following cell rebinds notin_train to *all* question ids.
notin_train = sorted(notin_train, key=lambda x: len(question_bank[question_bank['question_id'] == x]['tokenized_question_str'].values[0]))
len(notin_train)

1538

In [8]:
# Fallback candidate list: every question id, ordered by the character length
# of its stemmed question string (ties keep question_bank order; sorted() is stable).
# The id -> length map is built once; filtering the DataFrame inside the sort
# key made this quadratic in the size of the question bank.
_qid2len = {}
for _qid, _qstr in zip(question_bank['question_id'].values,
                       question_bank['tokenized_question_str'].values):
    _qid2len.setdefault(_qid, len(_qstr))  # first row wins, as with .values[0]
notin_train = sorted(question_bank['question_id'].values.tolist(), key=_qid2len.__getitem__)
len(notin_train)

3941

# test normal_qids

In [9]:
def get_bm25_recall_results(dev, ir_id_columns='topic_id', ir_columns='initial_request', topn=-1):
    """Rank question-bank ids for every distinct request id in ``dev`` with BM25.

    Relies on the notebook globals ``question_bank``, ``bm25``, ``bm25_onlyq``,
    ``notin_train`` and ``stem_tokenize``. Both BM25 indexes are built from
    ``question_bank`` columns, so document position ``i`` is row ``i`` of
    ``question_bank``; ids are therefore looked up by position. The previous
    lookup went through the joined token string, which returns extra rows
    whenever two questions share the same token string.

    Args:
        dev: DataFrame with ``ir_id_columns`` and ``ir_columns``; an optional
            ``clarification_need`` column supplies the third tuple field.
        ir_id_columns: Column whose distinct values key the result.
        ir_columns: Column holding the query text (first row per id is used).
        topn: If positive, each list is truncated to this many entries.

    Returns:
        dict: id -> list of ``(question_id, is_bm25, cneed)``. The top-100
        documents of the enriched index come first (those also in the top-100
        of the question-only index ahead of the rest) with ``is_bm25 == 1``,
        then every id of ``notin_train`` not retrieved, with ``is_bm25 == 0``.
    """
    bank_qids = question_bank['question_id'].values
    has_cneed = 'clarification_need' in dev.columns
    all_preds = {}
    for tid in dev[ir_id_columns].unique():
        topic_rows = dev.loc[dev[ir_id_columns] == tid]
        query_tokens = stem_tokenize(topic_rows[ir_columns].tolist()[0], True)
        cneed = topic_rows['clarification_need'].tolist()[0] if has_cneed else 0

        # Same ranking BM25Okapi.get_top_n produces (argsort of scores, reversed),
        # but keeping the positions so ids can be read directly.
        top_idxs = np.argsort(bm25.get_scores(query_tokens))[::-1][:100]
        preds = [bank_qids[i] for i in top_idxs]
        top_idxs_onlyq = np.argsort(bm25_onlyq.get_scores(query_tokens))[::-1][:100]
        preds_onlyq = {bank_qids[i] for i in top_idxs_onlyq}

        in_both = [qid for qid in preds if qid in preds_onlyq]
        only_enriched = [qid for qid in preds if qid not in preds_onlyq]
        new_preds = [(qid, 1, cneed) for qid in in_both + only_enriched]  # qid, is_bm25, cneed_label

        retrieved = set(preds)
        new_preds.extend((qid, 0, cneed) for qid in notin_train if qid not in retrieved)

        all_preds[tid] = new_preds[:topn] if topn > 0 else new_preds

    return all_preds

In [10]:
def get_bm25_recall_results_with_score(dev, ir_id_columns='topic_id', ir_columns='initial_request', topn=-1):
    """Rank question-bank ids per request id with BM25 and keep the BM25 score.

    Relies on the notebook globals ``question_bank``, ``bm25``, ``bm25_onlyq``,
    ``notin_train`` and ``stem_tokenize``. Both BM25 indexes are built from
    ``question_bank`` columns, so document position ``i`` is row ``i`` of
    ``question_bank``; ids and scores are read by position. The previous
    lookup through the joined token string could return extra rows for
    questions sharing a token string, which shifted ids against their scores.

    Args:
        dev: DataFrame with ``ir_id_columns`` and ``ir_columns``; an optional
            ``clarification_need`` column supplies the ``cneed`` field.
        ir_id_columns: Column whose distinct values key the result.
        ir_columns: Column holding the query text (first row per id is used).
        topn: If positive, only the 100 best documents of the enriched index
            are taken as BM25 hits and each list is truncated to ``topn``
            entries. Otherwise all documents are ranked and nothing is cut.

    Returns:
        dict: id -> list of ``(question_id, bm25_score, is_bm25, cneed)``.
        BM25 hits come first (those also in the top-100 of the question-only
        index ahead of the rest) with ``is_bm25 == 1``; ids of ``notin_train``
        that were not retrieved follow with score ``0.`` and ``is_bm25 == 0``.
    """
    bank_qids = question_bank['question_id'].values
    has_cneed = 'clarification_need' in dev.columns
    all_preds = {}
    for tid in dev[ir_id_columns].unique():
        topic_rows = dev.loc[dev[ir_id_columns] == tid]
        query_tokens = stem_tokenize(topic_rows[ir_columns].tolist()[0], True)
        cneed = topic_rows['clarification_need'].tolist()[0] if has_cneed else 0

        bm25_scores = bm25.get_scores(query_tokens)
        ranked_idxs = np.argsort(bm25_scores)[::-1]
        if topn > 0:
            ranked_idxs = ranked_idxs[:100]
        scored = [(bank_qids[i], bm25_scores[i]) for i in ranked_idxs]

        # Top-100 of the question-only index, same ordering as BM25Okapi.get_top_n.
        onlyq_idxs = np.argsort(bm25_onlyq.get_scores(query_tokens))[::-1][:100]
        preds_onlyq = {bank_qids[i] for i in onlyq_idxs}

        insert_qs = [pair for pair in scored if pair[0] in preds_onlyq]
        other_qs = [pair for pair in scored if pair[0] not in preds_onlyq]
        # qid, bm25_score, is_bm25, cneed_label
        new_preds = [(qid, score, 1, cneed) for qid, score in insert_qs + other_qs]

        retrieved = {qid for qid, _score in scored}
        new_preds.extend((qid, 0., 0, cneed) for qid in notin_train if qid not in retrieved)

        all_preds[tid] = new_preds[:topn] if topn > 0 else new_preds

    return all_preds

In [11]:
dev = pd.read_csv('../data/dev.tsv', sep='\t')
all_preds = get_bm25_recall_results_with_score(dev, 'topic_id', 'initial_request', topn=-1)

197050

In [11]:
# Write the first 2000 candidates per dev topic as a run file, one line per
# candidate: "<topic> 0 <question_id> <rank> <score> bm25", where the score is
# the number of candidates from this rank to the end of the list.
dev = pd.read_csv('../data/dev.tsv', sep='\t')

run_file_path = './dev_bm25_added_normal'
with open(run_file_path, 'w') as fo:
    all_preds = get_bm25_recall_results_with_score(dev, topn=2000)
    for tid, new_preds in all_preds.items():
        total = len(new_preds)
        fo.writelines(
            '{} 0 {} {} {} bm25\n'.format(tid, qinfos[0], rank, total - rank)
            for rank, qinfos in enumerate(new_preds)
        )

In [12]:
! python ../official_src/clariq_eval_tool.py    --eval_task question_relevance\
                                --data_dir ../data/ \
                                --experiment_type dev \
                                --run_file {run_file_path}

Recall5: 0.34356393718081024
Recall10: 0.6078327634697603
Recall20: 0.6943680581544358
Recall30: 0.7056124020458385
Recall50: 0.7166712255752503
Recall100: 0.7180045589085837
Recall200: 0.7902837041699271
Recall300: 0.8044615242858277
Recall400: 0.8239103499699475
Recall500: 0.836058809353701
Recall600: 0.8482471847038411
Recall700: 0.8564838255140113
Recall800: 0.8709558143095295
Recall900: 0.8812092079746878
Recall1000: 0.897132179997505
Recall1100: 0.8999893228546479
Recall1200: 0.9086854012860205
Recall1300: 0.9137576407081051
Recall1400: 0.9207912541534833
Recall1500: 0.9281868585490877
Recall1600: 0.9355586534208825
Recall1700: 0.9408829936833033
Recall1800: 0.9423115651118747
Recall1900: 0.9450734698737795
Recall2000: 0.9502499404620147


In [13]:
! python ../official_src/clariq_eval_tool.py    --eval_task document_relevance\
                                --data_dir ../data/ \
                                --experiment_type dev \
                                --run_file {run_file_path}

NDCG1: 0.18802083333333333
NDCG3: 0.1684093677732618
NDCG5: 0.16621118253836253
NDCG10: 0.1506512398424613
NDCG20: 0.13655006163151873
P1: 0.23125
P3: 0.20416666666666666
P5: 0.19375
P10: 0.15687500000000001
P20: 0.1225
MRR100: 0.317345372146148


# random sampling

In [13]:
all_qids = question_bank['question_id'].values
def get_random_sample(dev, ir_id_columns='topic_id', ir_columns='initial_request', topn=500):
    """Draw ``topn`` distinct question ids at random for each request id in ``dev``.

    Sampling uses numpy's global random state over the notebook-level
    ``all_qids`` array, without replacement. ``ir_columns`` is accepted but
    not used.

    Returns:
        dict: id -> list of 1-tuples ``(question_id,)``.
    """
    sampled = {}
    for tid in tqdm(dev[ir_id_columns].unique()):
        picks = np.random.choice(all_qids, topn, replace=False)
        sampled[tid] = [(qid,) for qid in picks]
    return sampled

# negative sampling

In [14]:
from tqdm import tqdm

In [15]:
def build_dataset(dataset, ir_id_columns='topic_id', ir_columns='initial_request', topn=500, is_test=False):
    """Attach a relevance label to the BM25 candidates of every request id.

    Candidates come from ``get_bm25_recall_results_with_score`` as
    ``(qid, bm25_score, is_bm25, cneed)``; this function appends the label so
    every tuple is ``(qid, bm25_score, is_bm25, cneed, label)``.

    Args:
        dataset: DataFrame with ``ir_id_columns`` and ``ir_columns``. When
            ``is_test`` is false it must also contain ``question_id`` and
            ``clarification_need``.
        ir_id_columns: Column whose distinct values key the result.
        ir_columns: Column holding the query text.
        topn: Forwarded to the BM25 retrieval step.
        is_test: When true no gold questions are read and every candidate
            gets label 0.

    Returns:
        dict: id -> list of 5-tuples. In non-test mode the questions listed
        for that id in ``dataset`` come first with label 1 (BM25 score ``0.``
        and ``is_bm25`` 0), followed by the remaining candidates with label 0.
    """
    tid2qids = get_bm25_recall_results_with_score(dataset, ir_id_columns, ir_columns, topn=topn)
#     tid2qids = get_random_sample(dataset, ir_id_columns, ir_columns, topn)
    new_tid2qids = {}
    for tid in tqdm(tid2qids):
        if is_test:
            new_tid2qids[tid] = [qinfo + (0,) for qinfo in tid2qids[tid]]
            continue
        topic_rows = dataset[dataset[ir_id_columns] == tid]
        pos_qids = set(topic_rows['question_id'].unique().tolist())
        cneed = topic_rows['clarification_need'].values[0]
        # Same field order as the candidate tuples. The gold tuple used to be
        # (qid, False, 0., ...), which put a bool into the score slot that
        # format_dataset reads as bm25_score.
        labelled = [(qid, 0., 0, cneed, 1) for qid in pos_qids]
        labelled.extend(qinfo + (0,) for qinfo in tid2qids[tid] if qinfo[0] not in pos_qids)
        new_tid2qids[tid] = labelled
    return new_tid2qids

In [16]:
def format_dataset(tid2qids, dataset, question_bank, ir_id_columns='topic_id', ir_columns='initial_request', is_test=False):
    """Flatten per-request candidate tuples into a long (request, question) table.

    Args:
        tid2qids: Dict mapping each value of ``ir_id_columns`` to a list of
            tuples read by position: ``[0]`` question id, ``[1]`` BM25 score,
            ``[2]`` is_bm25 flag, ``[3]`` clarification need, ``[-1]`` label.
        dataset: DataFrame with ``topic_id``, ``ir_id_columns`` and
            ``ir_columns``; one output group per distinct combination.
        question_bank: DataFrame with ``question_id`` and ``question``.
        ir_id_columns: Column used to look up ``tid2qids``.
        ir_columns: Column whose text is stored as ``initial_request``.
        is_test: Accepted for call compatibility; not used.

    Returns:
        pandas.DataFrame with the columns tid, irid, initial_request, qid,
        question, label, is_bm25, bm25_score, cneed.

    Raises:
        KeyError: If a candidate's question id is absent from ``question_bank``.
    """
    # Build the id -> text map once; scanning the whole bank for every output
    # row dominated the run time of this function.
    qid2question = {}
    for bank_qid, bank_question in zip(question_bank['question_id'].values,
                                       question_bank['question'].values):
        qid2question.setdefault(bank_qid, bank_question)  # first row wins, as with .values[0]

    tmp_df = {'tid': [], 'irid': [], 'initial_request': [], 'qid': [], 'question': [], 'label': [], 'is_bm25': [], 'bm25_score': [], 'cneed': []}
    for tid, irid, query in tqdm(dataset[['topic_id', ir_id_columns, ir_columns]].drop_duplicates().values):
        for qinfo in tid2qids[irid]:
            qid = qinfo[0]
            tmp_df['tid'].append(tid)
            tmp_df['qid'].append(qid)
            tmp_df['irid'].append(irid)
            tmp_df['initial_request'].append(query)
            tmp_df['question'].append(qid2question[qid])
            tmp_df['label'].append(qinfo[-1])
            tmp_df['bm25_score'].append(qinfo[1])
            tmp_df['is_bm25'].append(qinfo[2])
            tmp_df['cneed'].append(qinfo[3])

    return pd.DataFrame(tmp_df)
    

In [58]:
# use bm25 to sample neg_data
trainset = pd.read_csv('../data/train.tsv', sep='\t')
devset = pd.read_csv('../data/dev.tsv', sep='\t')


In [59]:
ir_id_columns = 'topic_id'
ir_columns='initial_request'
train_tid2qids = build_dataset(trainset, ir_id_columns, ir_columns, topn=500)
traindf1 = format_dataset(train_tid2qids, trainset, question_bank, ir_id_columns, ir_columns)
len(traindf1)

100%|██████████| 187/187 [00:00<00:00, 327.39it/s]
100%|██████████| 187/187 [01:17<00:00,  2.41it/s]


93657

In [34]:
traindf1.to_csv('../data/pair_datas_src/pair_trainset.tsv', sep='\t', index=None)

In [60]:
ir_id_columns = 'topic_id'
ir_columns='topic_desc'
train_tid2qids2 = build_dataset(trainset, ir_id_columns, ir_columns, topn=200)
traindf2 = format_dataset(train_tid2qids2, trainset, question_bank, ir_id_columns, ir_columns)
len(traindf2)

100%|██████████| 187/187 [00:00<00:00, 480.60it/s]
100%|██████████| 187/187 [00:31<00:00,  5.92it/s]


37560

In [61]:
ir_id_columns = 'facet_id'
ir_columns='facet_desc'
train_tid2qids3 = build_dataset(trainset, ir_id_columns, ir_columns, topn=200)
traindf3 = format_dataset(train_tid2qids3, trainset, question_bank, ir_id_columns, ir_columns)
len(traindf3)

100%|██████████| 638/638 [00:01<00:00, 395.81it/s]
100%|██████████| 638/638 [01:48<00:00,  5.85it/s]


128597

In [74]:
traindf = pd.concat([traindf1, traindf2, traindf3])
len(traindf)

259814

In [75]:
traindf.to_csv('../data/pair_datas/pair_trainset.tsv', sep='\t', index=None)

 26%|██▌       | 17588/68556 [00:40<01:35, 534.75it/s]

In [63]:
ir_id_columns = 'topic_id'
ir_columns='initial_request'
dev_tid2qids = build_dataset(devset, ir_id_columns, ir_columns, topn=500)
devdf1 = format_dataset(dev_tid2qids, devset, question_bank, ir_id_columns, ir_columns)
len(devdf1)

100%|██████████| 50/50 [00:00<00:00, 365.97it/s]
100%|██████████| 50/50 [00:20<00:00,  2.43it/s]


25107

In [32]:
devdf1.to_csv('../data/pair_datas_src/pair_devset.tsv', sep='\t', index=None)

In [64]:
ir_id_columns = 'topic_id'
ir_columns='topic_desc'
dev_tid2qids2 = build_dataset(devset, ir_id_columns, ir_columns, topn=200)
devdf2 = format_dataset(dev_tid2qids2, devset, question_bank, ir_id_columns, ir_columns)
len(devdf2)

100%|██████████| 50/50 [00:00<00:00, 357.68it/s]
100%|██████████| 50/50 [00:08<00:00,  5.76it/s]


10160

In [65]:
ir_id_columns = 'facet_id'
ir_columns='facet_desc'
dev_tid2qids3 = build_dataset(devset, ir_id_columns, ir_columns, topn=200)
devdf3 = format_dataset(dev_tid2qids3, devset, question_bank, ir_id_columns, ir_columns)
len(devdf3)

100%|██████████| 163/163 [00:00<00:00, 365.84it/s]
100%|██████████| 163/163 [00:27<00:00,  5.85it/s]


33289

In [66]:
devdf = pd.concat([devdf1, devdf2, devdf3])
len(devdf)

68556

In [67]:
devdf.to_csv('../data/pair_datas/pair_devset.tsv', sep='\t', index=None)

In [17]:
testset = pd.read_csv('../data/dev.tsv', sep='\t')
# testset.rename(columns={'initial request': 'initial_request'}, inplace=True)
print(testset.columns)
test_tid2qids = build_dataset(testset, ir_id_columns='topic_id', ir_columns='initial_request', topn=2000, is_test=False)
test_df = format_dataset(test_tid2qids, testset, question_bank, ir_id_columns='topic_id', ir_columns='initial_request', is_test=False)
len(test_df)

Index(['topic_id', 'initial_request', 'topic_desc', 'clarification_need',
       'facet_id', 'facet_desc', 'question_id', 'question', 'answer'],
      dtype='object')


100%|██████████| 50/50 [00:00<00:00, 293.73it/s]
100%|██████████| 50/50 [01:14<00:00,  1.49s/it]


100036

In [18]:
test_df.to_csv('../data/pair_datas/pair_devset_for_test_new.tsv', sep='\t', index=None)

In [19]:
testset = pd.read_csv('../data/test.tsv', sep='\t')
testset.rename(columns={'initial request': 'initial_request'}, inplace=True)
print(testset.columns)
test_tid2qids = build_dataset(testset, ir_id_columns='topic_id', ir_columns='initial_request', topn=2000, is_test=True)
test_df = format_dataset(test_tid2qids, testset, question_bank, ir_id_columns='topic_id', ir_columns='initial_request', is_test=True)
len(test_df)

Index(['topic_id', 'initial_request'], dtype='object')


100%|██████████| 61/61 [00:00<00:00, 895.04it/s]
100%|██████████| 61/61 [01:35<00:00,  1.56s/it]


122000

In [20]:
test_df.to_csv('../data/pair_datas/pair_testset_new.tsv', sep='\t', index=None)

In [58]:
# Pair every test topic with every question of the bank (no BM25 filtering).
testset = pd.read_csv('../data/test.tsv', sep='\t')
testset.rename(columns={'initial request': 'initial_request'}, inplace=True)
# format_dataset reads each candidate by position as
# (qid, bm25_score, is_bm25, cneed, ..., label); a bare id string would be
# indexed character by character, so wrap every id in a zero-filled tuple.
# A list (not a set) keeps the question order reproducible.
all_candidates = [(qid, 0., 0, 0, 0) for qid in question_bank['question_id'].unique()]
test_tid2qids = {tid: all_candidates for tid in testset['topic_id'].values}
test_df = format_dataset(test_tid2qids, testset, question_bank, is_test=True)
len(test_df)

240401

# Add document relevance scores

In [1]:
import pandas as pd
import numpy as np
import _pickle as pickle
from tqdm import tqdm

In [2]:
def load_eval_dict(eval_file_path, topic_file_path):
    """Load a pickled evaluation dict and keep only facets of one topic file.

    Args:
        eval_file_path: Path to a pickle holding ``{metric: {facet_id: ...}}``.
        topic_file_path: Tab-separated topic file with a ``facet_id`` column.

    Returns:
        dict: Same two-level structure, restricted to the facet ids that
        occur in the topic file. Every metric key is kept, even if empty.
    """
    topic_df = pd.read_csv(topic_file_path, sep='\t')
    facet_ids = set(topic_df['facet_id'].values)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle, which the bare open() never did.
    with open(eval_file_path, 'rb') as eval_file:
        eval_dict = pickle.load(eval_file)
    # we keep only the instances in the topic file.
    return {
        metric: {fid: value for fid, value in per_facet.items() if fid in facet_ids}
        for metric, per_facet in eval_dict.items()
    }

In [3]:
# train
topic_file_path = '../data/train.tsv'
eval_file_path = '../data/single_turn_train_eval.pkl'
new_train_eval_dict = load_eval_dict(eval_file_path, topic_file_path)

In [4]:
# dev
topic_file_path = '../data/dev.tsv'
eval_file_path = '../data/single_turn_train_eval.pkl'
new_dev_eval_dict = load_eval_dict(eval_file_path, topic_file_path)

In [5]:
def add_dr_score(dev_df, devset, new_eval_dict, dr_key='MRR100'):
    """Add a document-relevance column named ``dr_key`` to ``devset``.

    For every (tid, qid) row of ``devset`` the facets listed for that
    topic/question pair in ``dev_df`` are collected. Each facet contributes
    ``new_eval_dict[dr_key][facet][qid]['with_answer']`` when that entry
    exists and ``0.`` otherwise, and the row receives the mean over its
    facets (``0.`` when the pair has no facet).

    Args:
        dev_df: Topic DataFrame with ``topic_id``, ``question_id``, ``facet_id``.
        devset: Pair DataFrame with ``tid`` and ``qid``; modified in place.
        new_eval_dict: ``{metric: {facet_id: {question_id: {'with_answer': x}}}}``.
        dr_key: Metric to read and name of the column to write.

    Returns:
        The same ``devset`` object, with the new column.
    """
    # Index the facet ids by (topic, question) once; building a boolean mask
    # over the whole of dev_df for every pair row was the bottleneck.
    pair2fids = {}
    for topic_id, question_id, facet_id in dev_df[['topic_id', 'question_id', 'facet_id']].values:
        pair2fids.setdefault((topic_id, question_id), []).append(facet_id)

    vals = []
    for tid, qid in tqdm(devset[['tid', 'qid']].values):
        mrr = []
        for fid in pair2fids.get((tid, qid), ()):
            mrr.append(0.)
            if fid in new_eval_dict[dr_key]:
                if qid in new_eval_dict[dr_key][fid]:
                    mrr[-1] = new_eval_dict[dr_key][fid][qid]['with_answer']
        vals.append(np.mean(mrr) if len(mrr) > 0 else 0.)
    devset[dr_key] = vals
    return devset

In [26]:
def get_max_score_tids(devset, dr_key='NDCG3'):
    """Flag, per topic, the rows that reach the topic's best ``dr_key`` value.

    Adds a ``top1_label`` column to ``devset`` (in place): 1 where the row's
    ``dr_key`` equals the maximum over all rows with the same ``tid`` and is
    greater than 0, else 0. Returns the same DataFrame.
    """
    best_by_tid = {
        tid: np.max(devset[devset['tid'] == tid][dr_key].values)
        for tid in tqdm(devset['tid'].unique())
    }

    flags = [
        1 if (val == best_by_tid[tid] and val > 0) else 0
        for tid, _qid, val in tqdm(devset[['tid', 'qid', dr_key]].values)
    ]

    devset['top1_label'] = flags
    return devset


In [7]:
dev_df = pd.read_csv('../data/dev.tsv', sep='\t')
devset = pd.read_csv('../data/pair_datas_src/pair_devset.tsv', sep='\t')
for dr_key in ['MRR100', 'P1', 'NDCG3']:
    devset = add_dr_score(dev_df, devset, new_dev_eval_dict, dr_key=dr_key)
devset.to_csv('../data/pair_datas_src/pair_devset_more_values.tsv', sep='\t', index=None)

100%|██████████| 25603/25603 [00:35<00:00, 725.74it/s]
100%|██████████| 25603/25603 [00:32<00:00, 780.91it/s]
100%|██████████| 25603/25603 [00:32<00:00, 784.13it/s]


In [27]:
# Add the top1_label column to the dev pair file and write it back in place.
devset = pd.read_csv('../data/pair_datas_src/pair_devset_more_values.tsv', sep='\t')
devset = get_max_score_tids(devset)
# to_csv returns None when given a path, so its result must not be bound to
# devset; later cells still need the DataFrame.
devset.to_csv('../data/pair_datas_src/pair_devset_more_values.tsv', sep='\t', index=None)

100%|██████████| 50/50 [00:00<00:00, 602.34it/s]
100%|██████████| 25603/25603 [00:00<00:00, 292731.42it/s]


In [9]:
train_df = pd.read_csv('../data/train.tsv', sep='\t')
trainset = pd.read_csv('../data/pair_datas_src/pair_trainset.tsv', sep='\t')
for dr_key in ['MRR100', 'P1', 'NDCG3']:
    trainset = add_dr_score(train_df, trainset, new_train_eval_dict, dr_key=dr_key)
trainset.to_csv('../data/pair_datas_src/pair_trainset_more_values.tsv', sep='\t', index=None)

100%|██████████| 95761/95761 [02:38<00:00, 603.41it/s]
100%|██████████| 95761/95761 [02:40<00:00, 598.14it/s]
100%|██████████| 95761/95761 [02:38<00:00, 604.69it/s]


In [28]:
# Add the top1_label column to the train pair file and write it back in place.
trainset = pd.read_csv('../data/pair_datas_src/pair_trainset_more_values.tsv', sep='\t')
trainset = get_max_score_tids(trainset)
# to_csv returns None when given a path, so its result must not be bound to
# trainset; the name would otherwise stop referring to the DataFrame.
trainset.to_csv('../data/pair_datas_src/pair_trainset_more_values.tsv', sep='\t', index=None)

100%|██████████| 187/187 [00:00<00:00, 583.59it/s]
100%|██████████| 95761/95761 [00:00<00:00, 428178.55it/s]


In [19]:
# Adds a zero-filled metric column to the dev-as-test pair file and rewrites it.
# NOTE(review): dr_key is not assigned in this cell; it is whatever value an
# earlier `for dr_key in ...` loop left in the kernel. Confirm the intended column.
devset_for_test = pd.read_csv('../data/pair_datas/pair_devset_for_test.tsv', sep='\t')
devset_for_test[dr_key] = 0.
devset_for_test.to_csv('../data/pair_datas/pair_devset_for_test.tsv', sep='\t', index=None)

In [20]:
# Adds a zero-filled metric column to the test pair file and rewrites it.
# NOTE(review): dr_key is not assigned in this cell; it is whatever value an
# earlier `for dr_key in ...` loop left in the kernel. Confirm the intended column.
testset = pd.read_csv('../data/pair_datas/pair_testset.tsv', sep='\t')
testset[dr_key] = 0.
testset.to_csv('../data/pair_datas/pair_testset.tsv', sep='\t', index=None)

In [41]:
len(devset[devset['MRR100'] > 0.])

2527

In [25]:
devset_for_test = pd.read_csv('../data/pair_datas_src/pair_devset_for_test_all.tsv', sep='\t')
testset = pd.read_csv('../data/pair_datas_src/pair_testset_all.tsv', sep='\t')
for dr_key in ['MRR100', 'P1', 'NDCG3', 'top1_label']:
    devset_for_test[dr_key] = 0.
    testset[dr_key] = 0.
devset_for_test.to_csv('../data/pair_datas_src/pair_devset_for_test_all.tsv', sep='\t', index=None)
testset.to_csv('../data/pair_datas_src/pair_testset_all.tsv', sep='\t', index=None)

In [7]:
trainset = pd.read_csv('../data/pair_datas_src/pair_trainset_more_values.tsv', sep='\t')
devset = pd.read_csv('../data/pair_datas_src/pair_devset_more_values.tsv', sep='\t')

In [25]:
from collections import Counter
# Split the (request, question) pairs of every topic by their top1_label.
tid2pos = {}
tid2neg = {}
pair_columns = ['tid', 'initial_request', 'question', 'label', 'top1_label']
for tid, request, question, _label, top1 in trainset[pair_columns].values:
    positives = tid2pos.setdefault(tid, [])
    negatives = tid2neg.setdefault(tid, [])
    if top1 == 0:
        negatives.append((request, question, top1))
    elif top1 == 1:
        positives.append((request, question, top1))

# Per topic keep all positives and at most five negatives per positive,
# lower-casing both texts.
all_datas = []
for tid in trainset['tid'].unique():
    kept_pos = tid2pos[tid]
    kept_neg = tid2neg[tid][:len(kept_pos) * 5]
    all_datas.extend(
        {'tid': tid, 'p1': request.lower(), 'p2': question.lower(), 'label': top1}
        for request, question, top1 in kept_pos + kept_neg
    )

In [26]:
len(all_datas), len([_d for _d in all_datas if _d['label'] == 1])

(1668, 278)

# build Clarification Need Data

In [47]:
def build_need_data(trainset, is_test=False):
    """Build one row per distinct topic/request for the clarification-need task.

    Args:
        trainset: DataFrame with ``topic_id`` and ``initial_request``; when
            ``is_test`` is false it must also hold ``clarification_need``.
        is_test: When true the label is unknown and set to 0 for every row.

    Returns:
        DataFrame with the columns ``topic_id``, ``initial_request`` and
        ``label``, de-duplicated and indexed from 0.
    """
    key_columns = ['topic_id', 'initial_request']
    if not is_test:
        labelled = trainset[key_columns + ['clarification_need']].drop_duplicates().reset_index(drop=True)
        return labelled.rename(columns={'clarification_need': 'label'})

    unlabelled = trainset[key_columns].drop_duplicates().reset_index(drop=True)
    unlabelled['label'] = 0
    return unlabelled

In [48]:
cneed_trainset = build_need_data(trainset)
cneed_trainset.to_csv('../data/sent_datas/sent_trainset.tsv', sep='\t', index=None)

In [49]:
cneed_devset = build_need_data(devset)
cneed_devset.to_csv('../data/sent_datas/sent_devset.tsv', sep='\t', index=None)

In [50]:
# Normalise the header first: build_need_data selects the 'initial_request'
# column, so the rename has to happen before the call, not after it.
# NOTE(review): testset is expected to be the raw test topic frame (with
# 'topic_id') here, not one of the pair files loaded in earlier cells; confirm.
testset.rename(columns={'initial request': 'initial_request'}, inplace=True)
cneed_testset = build_need_data(testset, is_test=True)
cneed_testset.to_csv('../data/sent_datas/sent_testset.tsv', sep='\t', index=None)

In [54]:
cneed_testset['rlen'] = cneed_testset['initial_request'].apply(len)
cneed_testset.describe()

Unnamed: 0,topic_id,label,rlen
count,61.0,61.0,61.0
mean,251.983607,0.0,34.868852
std,30.060767,0.0,10.514237
min,201.0,0.0,8.0
25%,225.0,0.0,28.0
50%,255.0,0.0,34.0
75%,276.0,0.0,42.0
max,300.0,0.0,67.0


In [58]:
cneed_devset.groupby(by=['label']).count()

Unnamed: 0_level_0,topic_id,initial_request,rlen
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4,4,4
2,21,21,21
3,16,16,16
4,9,9,9


In [28]:
traindf = pd.read_csv('../data/sent_datas/sent_trainset.tsv', sep='\t')
devdf = pd.read_csv('../data/sent_datas/sent_devset.tsv', sep='\t')
alldf = pd.concat([traindf, devdf])

In [30]:
alldf.head(5)

Unnamed: 0,topic_id,initial_request,label
0,1,Tell me about Obama family tree.,2
1,102,What is Fickle Creek Farm,2
2,105,Tell me about sonoma county medical services.,2
3,108,Tell me about of Ralph Owen Brester.,1
4,109,I'm looking for information about mayo clinic ...,2


In [31]:
# Empirical prior over the clarification-need labels, indexed by label value
# (slot 0 is kept so that prior[label] works for labels 1..4).
# bincount puts each count at index == label even when some value between
# 0 and 4 is absent; prepending a zero to the groupby counts shifted every
# later label down by one in that case.
cnt = np.bincount(alldf['label'].dropna().astype(int).values, minlength=5)
cntsum = np.sum(cnt)
prior = cnt / cntsum

In [32]:
prior

array([0.        , 0.12236287, 0.40084388, 0.32911392, 0.14767932])