# 1. Explore CLEAR raw data

## 1.0 Utils

In [None]:
def df_to_json(df, path, name):
  """Write `df` to a timestamped, records-oriented JSON file and print its location.

  Args:
    df: DataFrame to serialise.
    path: Directory prefix; it is joined to the file name with a "/".
    name: File-name prefix. The current local time, formatted as
      YYYY-MM-DD_HH-MM-SS, is appended before the ".json" extension.
  """
  stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
  destination = "{}/{}_{}.json".format(path, name, stamp)
  df.to_json(destination, orient='records')
  print("{} saved to file: {}".format(name, destination))

In [None]:
def read_json_with_frozenset_column(path, frozenset_column_name):
  """Load a records-oriented JSON file and turn one column's values into frozensets.

  Args:
    path: Location of the JSON file, passed straight to `pd.read_json`.
    frozenset_column_name: Column whose values are each wrapped in `frozenset(...)`.

  Returns:
    The loaded DataFrame with the named column converted.
  """
  frame = pd.read_json(path, orient='records')
  converted = frame[frozenset_column_name].apply(frozenset)
  frame[frozenset_column_name] = converted
  return frame

## 1.1 Requirements

In [None]:
import pandas as pd
import os
import json
import ast
import random
import csv

from random import sample
from datetime import datetime

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!git clone https://github.com/Moshiii/CLEAR-replication.git

## 1.2 Explore BIKER_train.QApair.csv

In [None]:
qa_dataset = pd.read_csv("CLEAR-replication/data/BIKER_train.QApair.csv")
qa_dataset = qa_dataset[['title', 'answer']]
qa_dataset.head()

In [None]:
qa_dataset.shape

In [None]:
print(type(qa_dataset))

In [None]:
question = list(qa_dataset['title'].unique())
len(question)

In [None]:
answer = list(qa_dataset['answer'].unique())
len(answer)

In [None]:
# remove duplicate sample
qa_dataset = qa_dataset.drop_duplicates(subset=['title'], keep='first')
qa_dataset.shape

## 1.3 Explore Biker_test_filtered.csv

In [None]:
biker_test = pd.read_csv("CLEAR-replication/data/Biker_test_filtered.csv")
biker_test = biker_test[['title', 'answer']]
biker_test.head()

In [None]:
biker_test.shape

In [None]:
# drop duplicate samples
biker_test = biker_test.drop_duplicates(subset=['title'], keep='first')
biker_test.shape

In [None]:
# Query titles from the de-duplicated BIKER test frame.
queries = biker_test["title"].to_list()
# Each answer cell is parsed and re-serialised as the string form of a list.
# ast.literal_eval only accepts Python literals, so text coming from the CSV
# cannot run arbitrary code the way eval() could.
queries_answers = [
    str(list(ast.literal_eval(x))) for x in biker_test["answer"].to_list()
]

In [None]:
len(queries)

### 1.3.1 Check duplicate between qa_dataset and biker_test

In [None]:
dup = qa_dataset.merge(biker_test, on=['title', 'answer'])

In [None]:
if not dup.empty:
  print(len(dup)  )
else:
  print("no dup")

### 1.3.2 Check duplicate between Answer_dict and biker_test answer

In [None]:
data_folder = 'CLEAR-replication/data/full_data_min_5_max_10_ir_10'

In [None]:
Answers_dict = {}
collection_filepath = os.path.join(data_folder, 'Answers_dict.json')
with open(collection_filepath, 'r', encoding='utf8') as fIn:
  Answers_dict = json.load(fIn)

In [None]:
# Distinct values stored in Answers_dict, in arbitrary (set) order.
Answers_values = Answers_dict.values()
unique_answers = list(set(Answers_values))
print(unique_answers)

In [None]:
print(queries_answers)

In [None]:
# Parse the stringified answer lists on both sides.
# ast.literal_eval is used instead of eval so the data strings are only ever
# treated as Python literals, never executed as code.
list1 = [ast.literal_eval(item) for item in queries_answers]
list2 = [ast.literal_eval(item) for item in unique_answers]

# Lists are unhashable, so compare them as tuples inside sets.
set1 = {tuple(item) for item in list1}
set2 = {tuple(item) for item in list2}

# Answers present on both sides, converted back to lists.
duplicates = [list(item) for item in set1 & set2]

# Print the duplicates
print("Duplicates numbers:", len(duplicates))

## 1.4 ----------Combine QA dataset and biker test

In [None]:
pair_dataset = pd.concat([qa_dataset, biker_test], axis=0, ignore_index=True)
pair_dataset.shape

In [None]:
# drop duplicate samples
pair_dataset_filtered = pair_dataset.drop_duplicates(subset=['title'], keep='first')
pair_dataset_filtered.shape

In [None]:
all_answers = list(pair_dataset_filtered['answer'].unique())
print(len(all_answers))

=> biker_test file answers are all in the training data

## 1.5 ------------Filter out multi API sample

In [None]:
mul_dataset = pair_dataset_filtered[pair_dataset_filtered['answer'].str.contains(",")]
mul_dataset

In [None]:
print(mul_dataset.shape)

## 1.6 Explore SO_test_filtered.csv

In [None]:
SO_test = pd.read_csv("CLEAR-replication/data/SO_test_filtered.csv")
SO_test = SO_test[['title', 'answer']]
SO_test.head()

In [None]:
SO_test.shape

In [None]:
SO_test_filtered = SO_test.drop_duplicates(subset=['title'], keep='first')
SO_test_filtered.shape

In [None]:
pair_dataset1 = pd.concat([pair_dataset_filtered, SO_test_filtered], axis=0, ignore_index=True)
pair_dataset1.shape

In [None]:
pair_dataset1_filtered = pair_dataset1.drop_duplicates(subset=['title'], keep='first')
pair_dataset1_filtered.shape

=> SO_test has no samples in common with (bikertrain + bikertest).
Note: bikertrain shares some duplicate samples with bikertest.

In [None]:
pair1_filtered_answers = list(pair_dataset1_filtered['answer'].unique())
print(len(pair1_filtered_answers))

=> SO_test's answers are all already in training data

## 1.7 Explore test_queries_min_5_max_10_ir_10.csv

In [None]:
test_queries = pd.read_csv('CLEAR-replication/data/test_queries_min_5_max_10_ir_10.csv')
test_queries = test_queries[['title', 'answer']]
test_queries.head()

In [None]:
test_queries.shape

In [None]:
test_queries_filtered = test_queries.drop_duplicates(subset=['title'], keep='first')
test_queries_filtered.shape

In [None]:
pair_dataset2 = pd.concat([pair_dataset_filtered, test_queries_filtered], axis=0, ignore_index=True)
pair_dataset2.shape

In [None]:
pair_dataset2_filtered = pair_dataset2.drop_duplicates(subset=['title'], keep='first')
pair_dataset2_filtered.shape

=> test_queries is a subset of training data

In [None]:
pair2_filtered_answers = list(pair_dataset2['answer'].unique())
print(len(pair2_filtered_answers))

=> test_queries' answers are all already in the training data

## 1.8 Explore test_queries_multi_min_5_max_10_ir_10.csv

In [None]:
test_mul_queries = pd.read_csv('CLEAR-replication/data/test_queries_multi_min_5_max_10_ir_10.csv')
test_mul_queries = test_mul_queries[['title', 'answer']]
test_mul_queries.head()

=> These are definitely not multi-API queries

In [None]:
test_mul_queries.shape

# 2. Explore CLEAR processed data

In [None]:
processed_data_folder = 'CLEAR-replication/data/full_data_min_5_max_10_ir_10'

## 2.0 Replicate CLEAR data processing phase

In [None]:
raw_data_folder = 'CLEAR-replication/data'
evaluate_data_folder = 'drive/MyDrive/Lab RISE/CLEAR/data/evaluate'
output_data_folder = 'drive/MyDrive/Lab RISE/CLEAR/data/output'

In [None]:
# already run, this code is to create BIKER_querys_final
import csv
import re
import os

# Input and output file paths
input_file = os.path.join(evaluate_data_folder, 'querys_text.txt')
output_file = os.path.join(os.path.dirname(input_file), 'BIKER_querys_final.csv')

# Define a list to store the data
data = []

# Read the input text file
with open(input_file, 'r') as f:
    lines = f.readlines()

# (title, [api, ...]) pairs parsed from lines shaped like "<title>$$$$$<api> <api> ..."
querys = list()

# Process the lines and extract titles and answers
for line in lines:
    # Skip near-empty lines and lines without the title/answer separator.
    if len(line) <= 2 or '$$$$$' not in line:
        continue

    line = line.replace('\n','')
    line = line.split('$$$$$')

    title = line[0].strip()
    # startswith/endswith replace direct indexing so that a line whose title
    # part is empty is skipped instead of raising IndexError on title[-1].
    if not title or title.startswith('**'):
        continue
    if title.endswith('?'):
        title = title[:-1]
    # Keep the space-separated tokens that are longer than one character once stripped.
    apis_list = [api for api in line[1].split(' ') if len(api.strip())>1]
    querys.append((title,apis_list))

# Write the data to a CSV file
with open(output_file, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['', 'title', 'answer'])
    for i, (title, apis_list) in enumerate(querys):
        csvwriter.writerow([i, title, str(apis_list)])

print("CSV file has been created: ", output_file)

In [None]:
df_train1 = pd.read_csv(os.path.join(raw_data_folder, 'BIKER_train.QApair.csv'))
df_train1 = df_train1[['title', 'answer']]
df_train1.shape

In [None]:
df_train1 = df_train1.drop_duplicates(subset=['title'], keep='first')
df_train1.shape

In [None]:
df_train3 = pd.read_csv(os.path.join(evaluate_data_folder, 'BIKER_querys_final.csv'))
df_train3 = df_train3[['title', 'answer']]
df_train3.shape

In [None]:
df_train3 = df_train3.drop_duplicates(subset=['title'], keep='first')
df_train3.shape

In [None]:
df3_multi=df_train3[df_train3['answer'].str.contains(",")]
len(df3_multi)

In [None]:
df_train = [df_train1, df_train3]
df_train = pd.concat(df_train)
df_train.shape

In [None]:
df_train = df_train.drop_duplicates(subset=['title'], keep='first')
df_train.shape

In [None]:
df_multi=df_train[df_train["answer"].str.contains(",")]
len(df_multi)

In [None]:
# moshii method to check all answer and it's wrong
len(list(set(df_train["answer"].to_list())))

In [None]:
# Raw answer strings, one per row of df_train.
df_train_answer_list = df_train['answer'].to_list()
# len(df_train_answer_list)

# Each answer string parsed as a Python literal and frozen into an
# order-insensitive set; this list stays aligned with df_train's rows.
df_train_answer_set_list = [
  frozenset(ast.literal_eval(raw_answer)) for raw_answer in df_train_answer_list
]
# Distinct answer sets across the whole frame.
df_train_answer_set = set(df_train_answer_set_list)

len(df_train_answer_set)

In [None]:
df_train['set_answer']=df_train_answer_set_list
df_train

In [None]:
df_to_json(df_train, output_data_folder, 'df_train_with_set_answer')

In [None]:
# test
# Create a set of frozensets
set_of_sets = {frozenset([1, 2, 3]), frozenset([3, 4, 5]), frozenset([1, 2, 3])}

set_of_sets.add(frozenset([3, 4, 5]))

# Print the set of frozensets
print(set_of_sets)

In [None]:
# test
print(df_train['answer'].dtypes)

In [None]:
# test
print(['abc', 'def']==['def', 'abc'])

In [None]:
# test
import pandas as pd
import ast

# Sample DataFrame with your data
data = {
    'answer': ["['abc', 'def']", "['def', 'abc']", "['abc', 'def', 'ghi']"]
}
df = pd.DataFrame(data)

# Function to check if a list is equivalent to another list
def are_lists_equivalent(s1, s2):
    """Return True when two stringified lists hold the same elements, ignoring order.

    Both arguments are parsed with `ast.literal_eval`; if either one fails to
    parse (SyntaxError or ValueError) the result is False.
    """
    try:
        first, second = (ast.literal_eval(text) for text in (s1, s2))
    except (SyntaxError, ValueError):
        return False
    return set(first) == set(second)

# Count the number of rows where "answer" lists are equivalent to any other list
count = 0
for i, row in df.iterrows():
    for j, other_row in df.iterrows():
        if i != j and are_lists_equivalent(row["answer"], other_row["answer"]):
            count += 1

print("Number of rows with 'answer' lists that are equivalent to others:", count)


In [None]:
# test
import pandas as pd

data = {'column_of_sets': [frozenset([1, 2, 3]), frozenset([2, 3, 4]), frozenset([1, 2, 3]), frozenset([4, 5])]}
df = pd.DataFrame(data)

# Grouping by the column_of_sets
grouped = df.groupby('column_of_sets')

# Iterate through the groups
for group, group_df in grouped:
    print("Group:", group)
    print(group_df)
    print("\n")

In [None]:
def generate_random_sampling(qa_dataframe, minPositive=None, noPositive=None, noNegative=None):
  """Keep well-supported answer sets and sample positives/negatives for every post.

  Rows are grouped by their 'set_answer' value. Groups with fewer than
  `minPositive` rows are dropped, the remaining rows are re-indexed from 0,
  and for each row a list of same-answer row indices (positives) and a random
  list of other-answer row indices (negatives) is produced.

  Args:
    qa_dataframe: DataFrame with 'title' and 'set_answer' columns.
    minPositive: Minimum number of rows an answer set needs to be kept.
    noPositive: Cap on positives per row; larger groups are randomly sampled.
    noNegative: Number of negatives sampled per row.

  Returns:
    (filtered DataFrame, Corpus_dict idx -> title, Answers_dict idx -> set_answer,
    Passage_dict idx -> [positives, negatives]).
  """
  # Answer sets that appear at least minPositive times.
  valid_api = []
  for answer_set, posts in qa_dataframe.groupby('set_answer'):
    if len(posts) < minPositive:
      continue
    valid_api.append(answer_set)
    print(answer_set, ':', len(posts)) # size of each answer set that passes the threshold

  print('number of valid api:', len(valid_api))

  keep_mask = qa_dataframe['set_answer'].isin(valid_api)
  qa_dataframe_filtered = qa_dataframe[keep_mask].reset_index(drop=True)
  display(qa_dataframe_filtered)

  title_list = qa_dataframe_filtered['title'].to_list()
  answer_list = qa_dataframe_filtered['set_answer'].to_list()
  print('len title_list:', len(title_list))
  print('len answer_list:', len(answer_list))

  Corpus_dict = dict(enumerate(title_list))
  Answers_dict = dict(enumerate(answer_list))

  groups = qa_dataframe_filtered.groupby('set_answer')
  Passage_dict = {}
  for row, label in enumerate(answer_list):
    # Rows sharing this row's answer set (the row itself included).
    positives = list(groups.get_group(label).index)
    # Every other row of the filtered frame.
    negatives = list(set(range(len(answer_list))) - set(positives))

    if len(positives) > noPositive:
      positives = sample(positives, noPositive)
    negatives = sample(negatives, noNegative)

    Passage_dict[row] = [positives, negatives]

  print('len Corpus_dict', len(Corpus_dict))
  print('len Answers_dict', len(Answers_dict))
  print('len Passage_dict', len(Passage_dict))
  display(Passage_dict)
  return qa_dataframe_filtered, Corpus_dict, Answers_dict, Passage_dict


In [None]:
df_train_filtered, Corpus_dict, Answers_dict, Passage_dict = generate_random_sampling(df_train, 5, 10, 10)
# test generate_random_sampling func

In [None]:
def get_triplets(Passage_dict):
    """Expand Passage_dict into a flat list of [query, positive, negative] triplets.

    For each key, every element of the value's first list is paired with every
    element of its second list.
    """
    return [
        [query, positive, negative]
        for query, pools in Passage_dict.items()
        for positive in pools[0]
        for negative in pools[1]
    ]

In [None]:
def get_rel_doc(df,evaluate_queries, Answers_dict):
    """Map each evaluation query to the other rows that share its 'set_answer'.

    Args:
        df: DataFrame with a 'set_answer' column; its index labels are the row ids.
        evaluate_queries: Iterable of row ids to look up.
        Answers_dict: Mapping row id -> that row's 'set_answer' value.

    Returns:
        dict: query id -> one-element list holding the list of matching row
        ids, with the query itself removed.
    """
    by_answer = df.groupby("set_answer")
    rel_dict = {}
    for query in evaluate_queries:
        group_rows = by_answer.get_group(Answers_dict[query]).index
        # Drop the query's own id from its relevant set.
        rel_dict[query] = [list(set(group_rows) - {query})]
    return rel_dict

In [None]:
# haven't done
# read the full data stored part of moshii
# Hold out 1000 random rows as evaluation queries; the rest form the evaluation corpus.
evaluate_queries = sample(list(Passage_dict.keys()),1000)
print("evaluate_queries",len(evaluate_queries))
evaluate_Corpus = list(set(range(len(Answers_dict)))-set(evaluate_queries))
# NOTE(review): evaluate_rel_doc is computed here but not written to disk in this cell.
evaluate_rel_doc = get_rel_doc(df_train_filtered,evaluate_queries,Answers_dict)

with open(f"{evaluate_data_folder}/evaluate_queries.json", 'w') as jsonfile:
    json.dump(evaluate_queries, jsonfile)

# The original path had no "/" after the folder and appended a loop index `i`
# that this cell never defines; write next to evaluate_queries.json instead.
with open(f"{evaluate_data_folder}/evaluate_Corpus.json", 'w') as jsonfile:
    json.dump(evaluate_Corpus, jsonfile)

# -------------------------------END---------------------------------------

In [None]:
import random
from random import sample
random.seed(1)


def generate_random_sampling_min_m_max_n(df,m=None,p=None,n=None):
    """Filter posts to well-supported answer sets and sample positives/negatives.

    Two posts count as having the same answer when their ``set_answer`` values
    are equal. Answer sets backed by fewer than ``m`` posts are dropped, the
    remaining rows are re-indexed from 0, and for every row a list of
    same-answer rows (positives, the row itself included) and a random list of
    other rows (negatives) is built.

    The original filtered ``df["answer"]`` with keys taken from the
    ``set_answer`` grouping, so the two columns were compared against each
    other and the filter did not select by answer set. Filtering and
    positive/negative grouping now both use ``set_answer``.

    Args:
        df: DataFrame with ``title``, ``answer`` and ``set_answer`` columns.
        m: Minimum number of posts an answer set needs to be kept.
        p: Maximum number of positives per row; larger groups are sampled.
        n: Number of negatives sampled per row.

    Returns:
        tuple: (filtered DataFrame, Corpus_dict idx -> title,
        Answers_dict idx -> raw ``answer`` value,
        Passage_dict idx -> [positives, negatives]).

    Raises:
        ValueError: from ``random.sample`` when a row has fewer than ``n``
            candidate negatives.
    """
    # Answer sets that occur at least m times.
    gps = df.groupby("set_answer")
    api_list = [k for k, v in gps if len(v) >= m]

    # Filter on the same column that produced api_list.
    df = df[df["set_answer"].isin(api_list)]
    df = df.reset_index(drop=True)

    print("len train",len(df))

    title_list = df["title"].to_list()
    answer_list = df["answer"].to_list()
    label_list = df["set_answer"].to_list()

    print("len title_list",len(title_list))
    print("len answer_list",len(answer_list))

    Corpus_dict = dict(enumerate(title_list))
    Answers_dict = dict(enumerate(answer_list))

    # Row positions per answer set, built once instead of one group lookup per
    # row. After reset_index the positions are the index labels.
    rows_by_label = {}
    for idx, label in enumerate(label_list):
        rows_by_label.setdefault(label, []).append(idx)

    Passage_dict = {}
    for idx, label in enumerate(label_list):
        if (idx + 1) % 10000 == 0: print(idx + 1)

        # Copy so the per-row lists never alias the shared lookup table.
        same_api_idx_gp = list(rows_by_label[label])
        diff_api_idx_gp = list(set(range(len(label_list)))-set(same_api_idx_gp))

        if len(same_api_idx_gp)>p:
            same_api_idx_gp=sample(same_api_idx_gp,p)

        diff_api_idx_gp=sample(diff_api_idx_gp,n)

        Passage_dict[idx] = [same_api_idx_gp,diff_api_idx_gp]

    print("len Corpus_dict",len(Corpus_dict))
    print("len Answers_dict",len(Answers_dict))
    print("len Passage_dict",len(Passage_dict))
    return df,Corpus_dict,Answers_dict,Passage_dict

def get_triplets(Passage_dict):
    """Return every [query, positive, negative] combination found in Passage_dict.

    Each value is read as [positives, negatives]; positives vary slowest within
    a query, negatives fastest.
    """
    triplets = []
    for query, pools in Passage_dict.items():
        for positive in pools[0]:
            triplets.extend([query, positive, negative] for negative in pools[1])
    return triplets

def get_rel_doc(df,evaluate_queries, Answers_dict):
    """Collect, for each evaluation query, the other rows that share its 'answer'.

    Args:
        df: DataFrame with an 'answer' column; its index labels are the row ids.
        evaluate_queries: Iterable of row ids to look up.
        Answers_dict: Mapping row id -> that row's 'answer' value.

    Returns:
        dict: query id -> one-element list holding the list of matching row
        ids, with the query itself removed.
    """
    by_answer = df.groupby("answer")
    rel_dict = {}
    for query in evaluate_queries:
        matching_rows = set(by_answer.get_group(Answers_dict[query]).index)
        # The query must not count as its own relevant document.
        matching_rows -= {query}
        rel_dict[query] = [list(matching_rows)]
    return rel_dict

In [None]:
p=10
n=10
df_train_filtered,Corpus_dict,Answers_dict,Passage_dict = generate_random_sampling_min_m_max_n(df_train,m=5,p=p,n=n)

In [None]:
# test
# Number the distinct raw answer strings of df_train and dump them as JSON.
df_train_answer = list(set(df_train["answer"].to_list()))
df_train_answer_dict = dict(enumerate(df_train_answer))

with open(evaluate_data_folder+'/df_train_answer.json', 'w') as jsonfile:
    json.dump(df_train_answer_dict, jsonfile)
print("df_train_answer.json has been created")

## 2.1 Explore Answers_dict

In [None]:
Answers_dict_json = {}
collection_filepath = os.path.join(processed_data_folder, 'Answers_dict.json')
with open(collection_filepath, 'r', encoding='utf8') as fIn:
  Answers_dict_json = json.load(fIn)

In [None]:
len(Answers_dict_json)

In [None]:
Answers_dict_json

In [None]:
# not use :D
# convert strings to lists
# NOTE(review): this iterates the in-memory Answers_dict, not the
# Answers_dict_json loaded earlier in this section - confirm which is intended.
# ast.literal_eval parses the stringified lists as literals only, so the data
# cannot execute code the way eval() would allow.
parsed_answers_dict = {k: ast.literal_eval(v) for k, v in Answers_dict.items()}
parsed_answers_dict

In [None]:
# count number of unique answer
# NOTE(review): this counts values of the in-memory Answers_dict, not of
# Answers_dict_json - confirm which is intended.
seen_answers = set(Answers_dict.values())
unique_answers_cnt = len(seen_answers)
print(unique_answers_cnt)

# 3. Process data

## 3.1 Requirements

In [None]:
import random
from random import sample
random.seed(1)

## 3.2 Generate random sampling

In [None]:
def generate_random_sampling(qa_dataframe, minPositive=None, noPositive=None, noNegative=None):
  """Keep well-supported answers and sample positives/negatives for every post.

  Rows are grouped by their 'answer' value. Answers with fewer than
  `minPositive` rows are dropped, the remaining rows are re-indexed from 0,
  and for each row a list of same-answer row indices (positives, the row
  itself included) and a random list of other row indices (negatives) is built.

  Args:
    qa_dataframe: DataFrame with 'title' and 'answer' columns.
    minPositive: Minimum number of rows an answer needs to be kept.
    noPositive: Cap on positives per row; larger groups are randomly sampled.
    noNegative: Number of negatives sampled per row.

  Returns:
    (filtered DataFrame, Corpus_dict idx -> title, Answers_dict idx -> answer,
    Passage_dict idx -> [positives, negatives]).

  Raises:
    ValueError: from random.sample when a row has fewer than `noNegative`
      candidate negatives.
  """
  valid_api=[]
  Passage_dict = {}

  gps_by_answer = qa_dataframe.groupby('answer')

  for k,v in gps_by_answer:
    if len(v)>=minPositive:
      valid_api.append(k)
      print(k, ':', len(v)) # size of each answer group that passes the threshold

  print('number of valid api:', len(valid_api))

  qa_dataframe_filtered = qa_dataframe[qa_dataframe['answer'].isin(valid_api)].reset_index(drop=True)
  display(qa_dataframe_filtered)

  gps_by_answer_filtered = qa_dataframe_filtered.groupby('answer')

  title_list = qa_dataframe_filtered['title'].to_list()
  answer_list = qa_dataframe_filtered['answer'].to_list()
  print('len title_list:', len(title_list))
  print('len answer_list:', len(answer_list))

  Corpus_dict = dict(enumerate(title_list))
  Answers_dict = dict(enumerate(answer_list))

  for idx, label in enumerate(answer_list):
    gp_of_label = gps_by_answer_filtered.get_group(label)

    # Use the group's row index. Iterating the DataFrame itself yields its
    # column names, which made the positives wrong and left true positives
    # inside the negative pool.
    same_api_idx_gp = list(gp_of_label.index)
    diff_api_idx_gp = list(set(range(len(answer_list)))-set(same_api_idx_gp))

    if len(same_api_idx_gp)>noPositive:
      same_api_idx_gp = sample(same_api_idx_gp, noPositive)

    diff_api_idx_gp = sample(diff_api_idx_gp, noNegative)

    Passage_dict[idx] = [same_api_idx_gp, diff_api_idx_gp]

  print('len Corpus_dict', len(Corpus_dict))
  print('len Answers_dict', len(Answers_dict))
  print('len Passage_dict', len(Passage_dict))
  return qa_dataframe_filtered, Corpus_dict, Answers_dict, Passage_dict


In [None]:
# test generate_random_sampling func
df_train1, Corpus_dict1, Answers_dict1, Passage_dict1 = generate_random_sampling(pair_dataset_filtered, 10, 10, 10)

In [None]:
# test generate_random_sampling func
df_train, Corpus_dict, Answers_dict, Passage_dict = generate_random_sampling(pair_dataset_filtered, 5, 10, 10)

=> only 955 APIs are valid (i.e. each has more than 4 queries about it)

=> Answers_dict.json also has 955 unique answers. Interesting!

In [None]:
print(len(list(pair_dataset_filtered['answer'].unique())))

## 3.3 Get triplets

In [None]:
def get_triplets(Passage_dict):
    """Build the list of [query, positive, negative] triplets from Passage_dict.

    Every value is read as [positives, negatives]; each positive of a query is
    combined with each of that query's negatives.
    """
    return [
        [anchor, positive, negative]
        for anchor, candidates in Passage_dict.items()
        for positive in candidates[0]
        for negative in candidates[1]
    ]

## 3.4 Get relevant documents for each query

Get all the relevant documents (posts with the same answer) for each query in the evaluation set

In [None]:
def get_rel_doc(df,evaluate_queries, Answers_dict):
    """Find the relevant documents (rows with the same 'answer') for each query.

    Args:
        df: DataFrame with an 'answer' column; its index labels are the row ids.
        evaluate_queries: Iterable of row ids forming the evaluation set.
        Answers_dict: Mapping row id -> that row's 'answer' value.

    Returns:
        dict: query id -> one-element list holding the list of row ids with
        the same answer, excluding the query itself.
    """
    answer_groups = df.groupby("answer")
    rel_dict = {}
    for query in evaluate_queries:
        # All rows whose answer equals the query's answer ...
        same_answer_rows = set(answer_groups.get_group(Answers_dict[query]).index)
        # ... minus the query's own row.
        rel_dict[query] = [list(same_answer_rows - {query})]
    return rel_dict

In [None]:
# test
print(set(range(5))-set())

## 3.5 Create evaluate set

In [None]:
# Answers_dict and Passage_dict are keyed by the row index of the frame that
# the same generate_random_sampling call returned, which section 3.2 binds to
# df_train. get_rel_doc matches those keys against the frame's index, so it
# must be given df_train rather than df_train_filtered from section 2.0.
# This first draw is kept so the random stream feeding the loop is unchanged.
evaluate_queries = sample(list(Passage_dict.keys()),1000)
print("evaluate_queries",len(evaluate_queries))
evaluate_Corpus = list(set(range(len(Answers_dict)))-set(evaluate_queries))
evaluate_rel_doc = get_rel_doc(df_train,evaluate_queries,Answers_dict)

folder = "random_dis_query"
# Creates the folder if needed and is a no-op when it already exists.
os.makedirs(folder, exist_ok=True)
data_foler = folder+"/"

# Ten independent random splits: 1000 held-out queries each, the rest as corpus.
for i in range(10):
    evaluate_queries = sample(list(Passage_dict.keys()),1000)
    print("evaluate_queries",len(evaluate_queries))
    evaluate_Corpus = list(set(range(len(Answers_dict)))-set(evaluate_queries))
    # NOTE(review): evaluate_rel_doc is recomputed per split but never written out.
    evaluate_rel_doc = get_rel_doc(df_train,evaluate_queries,Answers_dict)
    with open(data_foler+'evaluate_queries'+str(i)+'.json', 'w') as jsonfile:
        json.dump(evaluate_queries, jsonfile)

    with open(data_foler+'evaluate_Corpus'+str(i)+'.json', 'w') as jsonfile:
        json.dump(evaluate_Corpus, jsonfile)


In [None]:
# moshi code, paste here just to see and verify with my own code
def generate_random_sampling_min_m_max_n(df,m=None,p=None,n=None):
    """Filter posts to well-supported answers and sample positives/negatives.

    Rows are grouped by their ``answer`` value. Answers with fewer than ``m``
    rows are dropped, the remaining rows are re-indexed from 0, and for each
    row a list of same-answer rows (positives, the row itself included) and a
    random list of other rows (negatives) is built.

    Args:
        df: DataFrame with ``title`` and ``answer`` columns.
        m: Minimum number of rows an answer needs to be kept.
        p: Maximum number of positives per row; larger groups are sampled.
        n: Number of negatives sampled per row.

    Returns:
        tuple: (filtered DataFrame, Corpus_dict idx -> title,
        Answers_dict idx -> answer, Passage_dict idx -> [positives, negatives]).

    Raises:
        ValueError: from ``random.sample`` when a row has fewer than ``n``
            candidate negatives.
    """
    Passage_dict = {}

    gps = df.groupby("answer")

    # Answers that occur at least m times.
    api_list = [k for k, v in gps if len(v) >= m]
    df = df[df["answer"].isin(api_list)]
    df = df.reset_index(drop=True)

    # Report the size of the filtered frame handled here; the original printed
    # the length of the notebook-level df_train instead.
    print("len train",len(df))

    gps = df.groupby("answer")

    title_list = df["title"].to_list()
    answer_list = df["answer"].to_list()

    print("len title_list",len(title_list))
    print("len answer_list",len(answer_list))

    Corpus_dict = dict(enumerate(title_list))
    Answers_dict = dict(enumerate(answer_list))

    for idx, label in enumerate(answer_list):
        if (idx + 1) % 10000 == 0: print(idx + 1)

        gp = gps.get_group(label)

        same_api_idx_gp = list(gp.index)
        diff_api_idx_gp = list(set(range(len(answer_list)))-set(same_api_idx_gp))

        if len(same_api_idx_gp)>p:
            same_api_idx_gp=sample(same_api_idx_gp,p)

        diff_api_idx_gp=sample(diff_api_idx_gp,n)

        Passage_dict[idx] = [same_api_idx_gp,diff_api_idx_gp]

    print("len Corpus_dict",len(Corpus_dict))
    print("len Answers_dict",len(Answers_dict))
    print("len Passage_dict",len(Passage_dict))
    return df,Corpus_dict,Answers_dict,Passage_dict