# Loading wikipedia bios

## loading raw identities

In [1]:
import pandas as pd
from ast import literal_eval


# Read the Wikipedia dataframe and turn each stringified identity list
# back into a real Python list.
df = pd.read_csv('FinalDataFrame5.csv')
df['identities'] = df['identities'].apply(literal_eval)
bios = list(df['identities'])

# Peek at the first five bios.
bios[:5]

In [None]:
# The remaining steps are the same as in the Twitter-bios pipeline below.

# Loading twitter bios

## load all raw data

In [1]:
import pickle
# Load the raw Twitter bios from a pickle stored at a hard-coded absolute path.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/user/smadani/navid/data/pis2020.pkl', 'rb') as f:
    bios = pickle.load(f)
    
# Show a small slice of the loaded bios.
bios[100:110]

## PI frequency and top PIs

In [40]:
from collections import Counter
# Count how many times each element of a bio occurs across all bios.
cntr = Counter()

for bio in bios:
    cntr.update(bio)

# Number of bios that were counted.
print(len(bios))

In [41]:
import numpy as np

# Summarise the frequency distribution: 99th percentile and mean of the counts.
freqs = list(cntr.values())
print(f"percentile freq: {np.percentile(freqs, 99)} mean freq: {np.mean(freqs)}")

In [42]:
# Keep only the PIs that occur more than 500 times (hard-coded threshold).
most_frequent_pis = {k:v for k,v in cntr.items() if v > 500}
# Number of PIs that passed the threshold.
len(most_frequent_pis)

In [43]:
import pandas as pd

# One row per frequent PI with its count, ordered from most to least frequent.
df = pd.DataFrame(
    {
        'pi': list(most_frequent_pis),
        'cnt': [most_frequent_pis[pi] for pi in most_frequent_pis],
    }
).sort_values(by=['cnt'], ascending=False)

In [44]:
# Persist the frequent-PI table (columns 'pi' and 'cnt') without the index.
df.to_csv('twitter_most_frequent_pis.csv', index=False, header=True)

### most frequent neighbors

In [45]:
from tqdm import tqdm
# For every frequent PI, count how often each other frequent PI shares a bio with it.
neighbor_cnt = {}

for bio in tqdm(bios):
    # Filter the bio once instead of re-filtering it for every PI it contains.
    frequent_in_bio = [b for b in bio if b in most_frequent_pis]
    for pi in frequent_in_bio:
        if pi not in neighbor_cnt:
            neighbor_cnt[pi] = Counter()
        neighbor_cnt[pi].update(b for b in frequent_in_bio if b != pi)

print(len(neighbor_cnt))

# Post-processing: drop PIs that have fewer than two distinct neighbors.
# Iterate over a copy because entries are removed from the dict during the loop.
for pi, adj in neighbor_cnt.copy().items():
    if len(adj) < 2:
        neighbor_cnt.pop(pi)

print(f"size after pruning: {len(neighbor_cnt)}")

### calculating tf-idf

In [46]:
# Divide each co-occurrence count by the neighbor's overall frequency.
# NOTE(review): this mutates neighbor_cnt in place, so re-running the cell
# divides the already-normalised values again.
for pi, neighs in tqdm(neighbor_cnt.items()):
    for phrase in neighs.keys():
        neighs[phrase] /= most_frequent_pis[phrase]
        

### calculating using the bipartite method

In [23]:
from scipy import sparse, io

# Give each frequent PI a dense integer id, in dictionary order.
pi_idx = {pi: position for position, pi in enumerate(most_frequent_pis)}

# Rows of the bipartite matrix: the frequent-PI ids of each user.
# Users with fewer than two frequent PIs are dropped.
usr_pi = []

for bio in bios:
    cur_usr_pis = [pi_idx[pi] for pi in bio if pi in most_frequent_pis]
    if len(cur_usr_pis) >= 2:
        usr_pi.append(cur_usr_pis)

print(f"original users: {len(bios)}")
print(f"no of users after pruning: {len(usr_pi)}")


In [None]:
# Flatten usr_pi into parallel COO triplets: (row = user id, col = PI id, value = 1).
usrs = []
pis = []
scores = []

# The loop variable must not be named `pis`: rebinding that name would replace the
# column list and make the inner loop append to the very list it is iterating
# (which never terminates).
for uid, user_pis in enumerate(usr_pi):
    for pi in user_pis:
        usrs.append(uid)
        pis.append(pi)
        scores.append(1)

In [None]:
%% time

BP_MATRIX_FILENAME = "./bipartite_pi.mtx"
output_matrix = sparse.coo_matrix((scores, (usrs, pis)))
io.mmwrite(BP_MATRIX_FILENAME, output_matrix)

In [None]:
!du -hs ./bipartite_pi.mtx

In [3]:
from scipy import io

# mmread takes only the source path and returns the matrix (displayed as the cell output).
# NOTE(review): BP_MATRIX_FILENAME must already be defined in the kernel.
io.mmread(BP_MATRIX_FILENAME)

In [14]:
# Preview only the first entries instead of dumping the whole list.
scores[:10]

In [15]:
!rm ./bipartite_pi.mtx.gz
!gzip ./bipartite_pi.mtx
!ls

In [20]:
import sys

# Make the scoring package importable; the path must be appended before the import.
sys.path.append("../bipartite-pairs/python-scoring/")
import score_data

BP_SCORING_OUTPUT = './bipartite_output.csv.gz'
# Run score_only on the gzipped matrix with the 'weighted_corr_exp' entry and
# BP_SCORING_OUTPUT as the third argument.
# NOTE(review): argument meanings are inferred from the names used here — confirm
# against score_data.score_only.
score_data.score_only(
    BP_MATRIX_FILENAME+".gz",
    ['weighted_corr_exp'],
    BP_SCORING_OUTPUT,
)

In [None]:
!zcat ./bipartite_output.csv.gz

In [2]:
import pandas as pd

# Read the gzipped scoring CSV back into a dataframe and preview it.
# NOTE(review): this rebinds the global `df`.
BP_SCORING_OUTPUT = './bipartite_output.csv.gz'
df = pd.read_csv(BP_SCORING_OUTPUT)
df.head()

In [None]:
# recreate the neighboring dictionary



### save 

In [47]:
import pandas as pd

# Build, for every PI, up to 5 positives (its top-scoring neighbors) and up to
# 20 negatives (frequent PIs never seen alongside it), then save them as CSV.
pis = []
positives = []
negatives = []

# The loop variable is `neighs` (not `cntr`) so the global frequency Counter
# is not overwritten.
for pi, neighs in tqdm(neighbor_cnt.items()):
    # `pi` is never a key of its own neighbor counter, so it must be excluded
    # explicitly or it could be sampled as its own negative.
    cur_neg = [x for x in most_frequent_pis if x != pi and x not in neighs]
    if len(cur_neg) > 20:
        # .tolist() yields plain Python strings, which keeps the saved list
        # parseable with ast.literal_eval regardless of the numpy version.
        cur_neg = np.random.choice(cur_neg, size=20, replace=False).tolist()
    cur_pos = [x[0] for x in neighs.most_common(5)]
    if len(cur_neg) < 4 or len(neighs) < 2:
        print(f"PASSING PI: {pi}")
        continue
    positives.append(cur_pos)
    pis.append(pi)
    negatives.append(cur_neg)

print(f"saving {len(pis)} pis")
df = pd.DataFrame({'pis': pis, 'positives': positives, 'negatives': negatives})
df.to_csv('twitter_pi_with_neighbors_tfidf.csv', index=False, header=True)

In [77]:
!wc -l wiki_pi_with_neighbors_standard.csv

In [78]:
!head wiki_pi_with_neighbors_standard.csv

## create test dataset

In [80]:
from sklearn.model_selection import train_test_split

# Hold out 1% of the bios as a test set; a fixed random_state makes the split reproducible.
train, test = train_test_split(bios, test_size=0.01, shuffle=True, random_state=42)

In [81]:
import pickle

# Persist both splits as pickles under ./data.
# NOTE(review): the ./data directory is assumed to exist — open() does not create it.
with open('./data/test_bios.pkl', 'wb') as f:
    pickle.dump(test, f)
    
with open('./data/train_bios.pkl', 'wb') as f:
    pickle.dump(train, f)

# Cleaning data

## phrase cleaning

In [None]:
import pickle

# Reload the training split; from here on `bios` refers to the training bios only.
with open('./data/train_bios.pkl', 'rb') as f:
    bios = pickle.load(f)

In [9]:
# build a vocabulary of phrases
# `tqdm_notebook` is a deprecated alias; `tqdm.notebook.tqdm` is the supported import.
from tqdm.notebook import tqdm
from collections import Counter

# Occurrence count of every phrase across the bios.
pi_cnt = Counter()
for bio in tqdm(bios):
    pi_cnt.update(bio)

# Vocabulary size (number of distinct phrases).
len(pi_cnt)

In [None]:
# The ten most frequent phrases.
pi_cnt.most_common(10)

In [None]:
# The five least frequent phrases (tail of the full descending ranking).
pi_cnt.most_common()[-5:]

## cleaning each bio

In [None]:
from tqdm import tqdm

# Filtering rules:
#   - phrases of length at least 2
#   - pis that have been repeated at least 10 times in the dataset
#   - profiles with at least 2 remaining phrases

def clean_pis(all_pis):
    """Filter every bio and keep only the bios that still have 2+ distinct phrases.

    A phrase is kept when it has at least 2 characters and its count in the
    global `pi_cnt` is at least 10. Duplicates within a bio are collapsed.

    Args:
        all_pis: iterable of bios, each an iterable of phrases.

    Returns:
        List of bios, each a list of the distinct phrases that were kept.
    """
    result = []
    for pis in tqdm(all_pis):
        kept = {pi for pi in pis if len(pi) >= 2 and pi_cnt[pi] >= 10}
        if len(kept) > 1:
            result.append(list(kept))
    return result
            
# Apply the cleaning rules to the loaded bios and report how many bios survive.
cleaned_bios = clean_pis(bios)
print(len(cleaned_bios))

# Contrastive learning

## generating positive negative samples

In [None]:
# from each person's bio I create at most K triplets
from random import randint
import numpy as np

def pair_in_list(current_pair, l):
    """Return True when some entry of `l` contains both elements of `current_pair`.

    The check ignores order: ['a', 'b'] matches an entry ['b', 'a'].
    """
    return any(
        current_pair[0] in pair and current_pair[1] in pair
        for pair in l
    )

def generate_triplets(bios, k=3, max_neg_tries=10):
    """Create (positive, positive, negative) phrase triplets from bios.

    For each bio, up to `k` distinct unordered pairs of its phrases are drawn as
    positives. Each pair is combined with a negative phrase drawn from a random
    bio, and the negative is re-drawn if it also occurs in the anchor bio.

    Args:
        bios: list of bios, each a list of hashable phrases.
        k: maximum number of triplets generated per bio.
        max_neg_tries: maximum number of draws for a negative that is not part
            of the anchor bio; the triplet is skipped if none is found.

    Returns:
        List of [pos1, pos2, neg_sample] lists.
    """
    samples = []
    for idx, bio in tqdm(enumerate(bios), total=len(bios)):
        # Work on distinct phrases: with duplicates there may be fewer distinct
        # pairs than iterations, and the uniqueness loop below would never end.
        unique_pis = list(dict.fromkeys(bio))
        own_pis = set(unique_pis)
        iters = min(len(unique_pis) - 1, k)
        chosen_pis = []
        for i in range(iters):
            pos1, pos2 = np.random.choice(unique_pis, size=2, replace=False)
            while pair_in_list([pos1, pos2], chosen_pis):
                pos1, pos2 = np.random.choice(unique_pis, size=2, replace=False)
            chosen_pis.append([pos1, pos2])

            # A negative that also appears in the anchor bio would be a false negative.
            neg_sample = None
            for _ in range(max_neg_tries):
                neg_idx = randint(0, len(bios) - 1)
                candidate = np.random.choice(bios[neg_idx], size=1)[0]
                if candidate not in own_pis:
                    neg_sample = candidate
                    break
            if neg_sample is None:
                continue
            samples.append([pos1, pos2, neg_sample])
    return samples
            
# Generate at most 5 triplets per cleaned bio.
triplets = generate_triplets(cleaned_bios, k=5)

In [None]:
# Total number of generated triplets.
print(len(triplets))

## save dataset

In [None]:
import pickle 

# Persist the triplets to a hard-coded absolute path.
# NOTE(review): the triplets are later read from the relative path 'data/triplets.pkl';
# confirm both paths point at the same file.
with open('/user/smadani/navid/data/triplets.pkl', 'wb') as f:
    pickle.dump(triplets, f)

In [10]:
import pickle 

# Reload the triplets from a path relative to the working directory.
# NOTE(review): they were written to '/user/smadani/navid/data/triplets.pkl';
# confirm the working directory makes these the same file.
with open('data/triplets.pkl', 'rb') as f:
    triplets = pickle.load(f)

In [14]:
from sklearn.model_selection import train_test_split

# Carve out 1% for validation, then 1% of the remainder for test.
# Fixed random_state values make both splits reproducible across runs.
train_set, valid_set = train_test_split(triplets, test_size=0.01, shuffle=True, random_state=42)
train_set, test_set = train_test_split(train_set, test_size=0.01, shuffle=True, random_state=42)

print(f"train size: {len(train_set)}, validation size: {len(valid_set)}, test size: {len(test_set)}")

In [20]:
import csv

import pickle

# Write the validation and training triplets as tab-separated files.
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, rows can gain extra blank lines on some platforms.
with open('data/valid.csv', 'w', newline='') as f:
    write = csv.writer(f, delimiter='\t')
    write.writerows(valid_set)

with open('data/train.csv', 'w', newline='') as f:
    write = csv.writer(f, delimiter='\t')
    write.writerows(train_set)

# The test triplets are kept as a pickle instead of a TSV.
with open('data/test.pckl', 'wb') as f:
    pickle.dump(test_set, f)

## finetuning

### building the dataset


In [8]:
import csv
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader

# Each TSV row holds three phrases; it yields one pair labelled 1.0 (columns 0 and 1)
# and one pair labelled 0.0 (columns 0 and 2).
with open('data/train.csv', newline='') as f:
    train_examples = []
    # Use the default quoting so the reader mirrors the csv.writer defaults that
    # produced this file; QUOTE_NONE would keep the writer's quote characters as
    # literal text and split rows on embedded newlines.
    reader = csv.reader(f, delimiter='\t')
    for row in tqdm(reader):
        train_examples.append(InputExample(texts=[row[0], row[1]], label=1.0))
        train_examples.append(InputExample(texts=[row[0], row[2]], label=0.0))
    

### loading evaluation dataset

In [9]:

from sentence_transformers import evaluation

# Turn every validation row into two sentence pairs: columns (0, 1) scored 1.0
# and columns (0, 2) scored 0.0.
with open('data/valid.csv', newline='') as f:
    sent1s = []
    sent2s = []
    scores = []
    # Default quoting mirrors the csv.writer defaults that produced this file.
    reader = csv.reader(f, delimiter='\t')
    for row in tqdm(reader):
        sent1s.extend([row[0], row[0]])
        sent2s.extend([row[1], row[2]])
        scores.extend([1.0, 0.0])
evaluator = evaluation.EmbeddingSimilarityEvaluator(sent1s, sent2s, scores)


### creating data loaders

In [20]:
from sentence_transformers import SentenceTransformer, models
from torch import nn

# Start from the pretrained checkpoint named below.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
# Extra Dense layer with Tanh activation mapping the sentence embedding to 256
# dimensions, registered on the model under the module name '3'.
dense_model = models.Dense(in_features=model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())
model.add_module('3', dense_model)

# Shuffled batches of 128 training examples, trained with CosineSimilarityLoss.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=128)
train_loss = losses.CosineSimilarityLoss(model)

# Display the resulting model.
model

### training

In [21]:
# Directory passed to fit() as output_path.
output_model_path = 'models/miniLM-L6-finetuned'

# Fine-tune for 5 epochs with 10000 warm-up steps, running the evaluator
# every 10000 steps.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=5,
          evaluation_steps=10000,
          warmup_steps=10000,
          output_path=output_model_path)

## evaluation

### loading saved model

In [1]:
%env CUDA_VISIBLE_DEVICES=1

In [10]:
from sentence_transformers import SentenceTransformer, models

# Load the fine-tuned model from disk; this rebinds the global `model`.
model = SentenceTransformer('./models/miniLM-L6-finetuned/')

# Display the loaded model.
model

### calculating encodings for all phrases

In [11]:
# Collect the distinct phrases that occur in the cleaned bios.
# NOTE(review): `cleaned_bios` must already exist in the kernel; it is not created in this section.
pis = set()

for bio in cleaned_bios:
    pis.update(bio)

# Freeze into a list so positions line up with the rows of `embeddings`.
pis = list(pis)
print(len(pis))

# Encode every phrase with the fine-tuned model.
embeddings = model.encode(pis, convert_to_tensor=True)
        

In [25]:
def most_similar(pi, all_pis, all_pi_embs, model, k=11):
    """Return the phrases whose embeddings are closest to `pi` by cosine similarity.

    Args:
        pi: query phrase, encoded with `model`.
        all_pis: candidate phrases, aligned with the rows of `all_pi_embs`.
        all_pi_embs: embeddings of `all_pis`.
        model: encoder used for the query.
        k: number of top-scoring candidates considered.

    Returns:
        List of (phrase, score) tuples in ascending score order. The query
        itself is dropped, so fewer than `k` items may be returned.
    """
    query_emb = model.encode(pi, convert_to_tensor=True)
    sims = util.cos_sim(query_emb, all_pi_embs).detach().cpu().numpy()[0]
    top_idx = np.argsort(sims)[-k:]
    neighbours = []
    for j in top_idx:
        if pi != all_pis[j]:
            neighbours.append((all_pis[j], sims[j]))
    return neighbours

# Nearest phrases to 'mima' among the top 50 candidates, using the fine-tuned embeddings.
most_similar('mima', pis, embeddings, model, k=50)

In [27]:
def get_similarity(w1, w2, model=model):
    """Return the cosine similarity between the embeddings of `w1` and `w2`.

    `model` defaults to the global `model` as it was bound when this function
    was defined.
    """
    emb1, emb2 = (model.encode(word, convert_to_tensor=True) for word in (w1, w2))
    return util.cos_sim(emb1, emb2)

# Compare each personality type with 'man' and 'woman'.
# NOTE(review): the last two calls pass `model=model`, which is also the default,
# so they repeat the first two — presumably a base model was meant for comparison.
for mbti in ('isfj', 'intj', 'entj'):
    print(
        get_similarity(mbti, 'man'),
        get_similarity(mbti, 'woman'),
        get_similarity(mbti, 'man', model=model),
        get_similarity(mbti, 'woman', model=model),
    )


In [183]:
# Cosine similarity between 'intp' and 'esfj' under base_model.
# NOTE(review): base_model is only created in a cell further down the notebook,
# and the mom_/dad_ names do not match the encoded strings.
mom_emb = base_model.encode('intp', convert_to_tensor=True)
dad_emb = base_model.encode('esfj', convert_to_tensor=True)
util.cos_sim(mom_emb, dad_emb)

### loading the base (not fine-tuned) model and repeating the analysis

In [29]:
from sentence_transformers import SentenceTransformer, models

# Load the pretrained (not fine-tuned) model and encode the same phrase list.
base_model = SentenceTransformer('all-MiniLM-L6-v2')
base_embs = base_model.encode(pis, convert_to_tensor=True)
# NOTE(review): this builds the full len(pis) x len(pis) similarity matrix, and
# `base_cosine_scores` is reassigned in the personality-analysis cell — check
# whether this computation is needed.
base_cosine_scores = util.cos_sim(base_embs, base_embs).detach().cpu().numpy()

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Nearest phrases to the target phrase under the fine-tuned and the base model.
target_word = 'build the wall'
df1 = pd.DataFrame(most_similar(target_word, pis, embeddings, model), columns=['identifier', 'similarity'])
df1['model'] = 'fine-tuned-sentence-bert'
df2 = pd.DataFrame(most_similar(target_word, pis, base_embs, base_model), columns=['identifier', 'similarity'])
df2['model'] = 'original-sentence-bert'

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15,6))

# One scatter panel per model, formatted identically.
for panel, frame, panel_title in (
    (ax1, df1, 'fine-tuned-sentence-bert'),
    (ax2, df2, 'original-sentence-bert'),
):
    panel.scatter(x=frame['identifier'], y=frame['similarity'])
    panel.tick_params(axis='x', rotation=-60)
    panel.set_xlabel('phrase')
    panel.set_ylabel('similarity')
    panel.set_title(panel_title)

plt.show()

### analyzing personalities

In [139]:
# Report whether each personality type (upper- or lower-case) occurs in `pis`.
for personality in ['ESTJ', 'ENTJ', 'ESFJ', 'ENFJ', 'ISTJ', 'ISFJ', 'INTJ', 'INFJ', 'ESTP', 'ESFP', 'ENTP', 'ENFP', 'ISTP', 'ISFP', 'INTP', 'INFP']:
    found = personality in pis or personality.lower() in pis
    print(f"{personality}: {found}")

In [179]:
# The sixteen personality type codes, lower-cased before encoding.
personalities = ['ESTJ', 'ENTJ', 'ESFJ', 'ENFJ', 'ISTJ', 'ISFJ', 'INTJ', 'INFJ', 'ESTP', 'ESFP', 'ENTP', 'ENFP', 'ISTP', 'ISFP', 'INTP', 'INFP']
personalities = [p.lower() for p in personalities]

# Encode the codes with `model` and with `base_model`.
pers_emb = model.encode(personalities, convert_to_tensor=True)
pers_emb_base = base_model.encode(personalities, convert_to_tensor=True)

# Pairwise cosine-similarity matrices as numpy arrays (rebinds base_cosine_scores).
base_cosine_scores = util.cos_sim(pers_emb_base, pers_emb_base).detach().cpu().numpy()
cosine_scores = util.cos_sim(pers_emb, pers_emb).detach().cpu().numpy()

In [180]:
# Heatmap of the pairwise similarities computed with `model`,
# with both axes labelled by personality code.
plt.figure(figsize=(10,10))
ax = sns.heatmap(cosine_scores)
ax.set_xticklabels(personalities, rotation=90)
ax.set_yticklabels(personalities, rotation=0)

# plt.xticks(ticks=personalities)
# plt.yticks(ticks=personalities)

In [181]:
# Same heatmap for the similarities computed with `base_model`.
plt.figure(figsize=(10,10))

ax = sns.heatmap(base_cosine_scores)
ax.set_xticklabels(personalities, rotation=90)
ax.set_yticklabels(personalities, rotation=0)

### comparing in gensim vocab

In [210]:
import gensim.downloader as api

# Load the "glove-wiki-gigaword-50" vectors through the gensim downloader
# and sanity-check them with a nearest-neighbor query.
w2v = api.load("glove-wiki-gigaword-50")
w2v.most_similar("glass")

In [214]:
# Use the loaded vectors' vocabulary as the candidate phrase list.
vocab = list(w2v.key_to_index.keys())

# Encode every vocabulary entry with `model`.
vocab_embs = model.encode(vocab, convert_to_tensor=True)


In [215]:
# Nearest vocabulary entries to 'vaccine' among the top 10 candidates.
most_similar('vaccine', vocab, vocab_embs, model, k=10)

In [220]:
# Nearest vocabulary entries to 'blm' among the top 50 candidates.
most_similar('blm', vocab, vocab_embs, model, k=50)

# Word2vec embedding

In [None]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

import pickle

# Reload the raw Twitter bios from the hard-coded absolute path (rebinds `bios`).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/user/smadani/navid/data/pis2020.pkl', 'rb') as f:
    bios = pickle.load(f)
    
# Show a small slice of the loaded bios.
bios[100:110]

In [2]:

# Number of bios used for Word2Vec training.
print(len(bios))

class Callback(CallbackAny2Vec):
    """Print the per-epoch training loss at the end of every epoch.

    The value returned by `get_latest_training_loss()` is treated as a running
    total, so the previous reading is subtracted to obtain this epoch's loss.
    """

    def __init__(self):
        # Index of the epoch that is about to finish.
        self.epoch = 0
        # Loss reading taken at the end of the previous epoch.
        self.loss_to_be_subed = 0

    def on_epoch_end(self, model):
        """Report the loss accumulated since the previous epoch ended."""
        cumulative_loss = model.get_latest_training_loss()
        epoch_loss = cumulative_loss - self.loss_to_be_subed
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        self.loss_to_be_subed = cumulative_loss
        self.epoch += 1

# Train Word2Vec on the bios (each bio is one "sentence"), printing the loss per epoch.
# NOTE(review): this rebinds the global `model`, which earlier cells use for the
# SentenceTransformer.
monitor = Callback()
model = Word2Vec(bios, vector_size=256, window=5, min_count=1,
                 negative=10, workers=30, epochs=100, callbacks=[monitor],
                 compute_loss=True)

# Persist the trained model.
model.save('./models/w2v.model')

In [1]:
# Reload the saved Word2Vec model (rebinds the global `model`).
model = Word2Vec.load("./models/w2v.model")

In [3]:
# The 15 nearest tokens to 'he' in the Word2Vec space.
model.wv.most_similar('he', topn=15)

# Downstream tasks

## hold-one-out prediction of PIs

### clean data

In [82]:
import pickle

# Load both bio splits written earlier and concatenate them.
with open('data/test_bios.pkl', 'rb') as f:
    test_bios = pickle.load(f)

with open('data/train_bios.pkl', 'rb') as f:
    train_bios = pickle.load(f)
    
# Train bios first, then test bios.
all_bios = train_bios + test_bios
print(len(all_bios))

In [83]:
# build a vocabulary of phrases
# `tqdm_notebook` is a deprecated alias; `tqdm.notebook.tqdm` is the supported import.
from tqdm.notebook import tqdm
from collections import Counter

# Occurrence count of every phrase across train and test bios together.
pi_cnt = Counter()
for bio in tqdm(all_bios):
    pi_cnt.update(bio)

# Vocabulary size (number of distinct phrases).
len(pi_cnt)

In [84]:

# Filtering rules:
#   - phrases of length at least 2
#   - pis that have been repeated at least 10 times in the dataset
#   - profiles with at least 2 remaining phrases
# NOTE(review): this repeats the clean_pis definition from the "Cleaning data" section.

def clean_pis(all_pis):
    """Filter every bio and keep only the bios that still have 2+ distinct phrases.

    A phrase is kept when it has at least 2 characters and its count in the
    global `pi_cnt` is at least 10. Duplicates within a bio are collapsed.

    Args:
        all_pis: iterable of bios, each an iterable of phrases.

    Returns:
        List of bios, each a list of the distinct phrases that were kept.
    """
    result = []
    for pis in tqdm(all_pis):
        kept = {pi for pi in pis if len(pi) >= 2 and pi_cnt[pi] >= 10}
        if len(kept) > 1:
            result.append(list(kept))
    return result
            
# Clean the combined bios and the test bios; both use the same global pi_cnt.
cleaned_all_bios = clean_pis(all_bios)
cleaned_test_bios = clean_pis(test_bios)


In [85]:
# Preview the first three cleaned bios.
cleaned_all_bios[:3]

### create dataset

In [86]:
import numpy as np

# Hold-one-out dataset: for each cleaned test bio, one randomly chosen phrase is
# the target and the rest are joined with ', ' to form the input text.
test_ds = []

for bio in cleaned_test_bios:
    hold_out_idx = np.random.randint(0, len(bio))
    target = bio[hold_out_idx]
    remaining = ', '.join(bio[:hold_out_idx] + bio[hold_out_idx + 1:])
    test_ds.append((remaining, target))

print(len(test_ds))

In [87]:
# Distinct phrases across all cleaned bios, frozen into a list so that
# positions can serve as indices.
all_pis = list({pi for bio in cleaned_all_bios for pi in bio})

In [88]:
# Number of distinct candidate phrases.
print(len(all_pis))

In [89]:
from collections import OrderedDict

# Map every phrase to its position in all_pis (the phrases are distinct).
pi_dict = OrderedDict((p, position) for position, p in enumerate(all_pis))

In [90]:
# Number of phrases in the index.
print(len(pi_dict))

### load original sentence bert and embeddings

In [91]:
# Split the (input text, target phrase) tuples into two parallel tuples.
bio_x, bio_y = zip(*test_ds)

In [92]:
%%time

from sentence_transformers import SentenceTransformer, models, util

# Load the pretrained (not fine-tuned) model.
orig_model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the hold-out inputs and every candidate phrase.
orig_emb_x = orig_model.encode(bio_x, convert_to_tensor=True)
orig_emb_all = orig_model.encode(all_pis, convert_to_tensor=True)


### calculate rank score

In [None]:

# Rank of each held-out phrase among all candidates (1 = most similar).
# `ranks` must be created here; the loop would otherwise fail with NameError.
ranks = []

for x, y in tqdm(zip(orig_emb_x, bio_y), total=len(bio_y)):
    cosine_scores = util.cos_sim(x, orig_emb_all).detach().cpu().numpy()[0]
    target_idx = pi_dict[y]
    # argsort is ascending, so the best candidate sits at the last position.
    argsort = np.argsort(cosine_scores)
    hits = np.flatnonzero(argsort == target_idx)
    if hits.size == 0:
        print(y)
        print("element not available in dict!")
        continue
    ranks.append(int(len(argsort) - hits[0]))
    
    

In [109]:
def calculate_rank(x_emb, y, orig_emb_all):
    """Return the rank of target phrase `y` among all candidates for one query.

    Candidates are ordered by cosine similarity to `x_emb`; rank 1 is the most
    similar candidate. The target's column index comes from the global `pi_dict`.

    Args:
        x_emb: embedding of the query text.
        y: held-out target phrase.
        orig_emb_all: embeddings of all candidate phrases, aligned with `pi_dict`.

    Returns:
        1-based rank of `y` as a Python int.

    Raises:
        KeyError: if `y` is not a key of `pi_dict`.
        ValueError: if the index of `y` is not a valid candidate column.
    """
    cosine_scores = util.cos_sim(x_emb, orig_emb_all)[0].detach().cpu().numpy()
    target_idx = pi_dict[y]
    # argsort is ascending, so the best candidate sits at the last position.
    argsort = np.argsort(cosine_scores)
    hits = np.flatnonzero(argsort == target_idx)
    if hits.size == 0:
        raise ValueError(
            f"index {target_idx} of target {y!r} is outside the {len(argsort)} candidates"
        )
    return int(len(argsort) - hits[0])
    

In [122]:
# Move the embeddings to CPU numpy arrays before handing them to the parallel workers.
orig_emb_all_np = orig_emb_all.detach().cpu().numpy()
orig_x_emb_np = orig_emb_x.detach().cpu().numpy()
# Alias for the targets used in the parallel call.
bio_y_ = bio_y

In [123]:
%%time

from joblib import Parallel, delayed

# Compute the rank of every held-out phrase using 30 parallel jobs;
# `res` holds the ranks obtained with the original-model embeddings.
res = Parallel(n_jobs=30)(delayed(calculate_rank)(x_emb, y, orig_emb_all_np) for x_emb, y in zip(orig_x_emb_np, bio_y_))

In [125]:
# (number of targets ranked better than 20, total number of targets) for the original model.
len([r for r in res if r < 20]), len(res)

### load finetuned sentence bert and embeddings

In [126]:
# Split the (input text, target phrase) tuples into two parallel tuples.
bio_x, bio_y = zip(*test_ds)

In [127]:
%%time

from sentence_transformers import SentenceTransformer, models, util

# Load the fine-tuned model from disk.
fint_model = SentenceTransformer('./models/miniLM-L6-finetuned/')

# Embed the hold-out inputs and every candidate phrase.
fint_emb_x = fint_model.encode(bio_x, convert_to_tensor=True)
fint_emb_all = fint_model.encode(all_pis, convert_to_tensor=True)


### calculate rank score

In [None]:

# Rank of each held-out phrase among all candidates (1 = most similar).
# Start from a fresh list so these ranks are not appended to ranks computed
# for another model (and so the loop does not fail with NameError).
ranks = []

for x, y in tqdm(zip(fint_emb_x, bio_y), total=len(bio_y)):
    cosine_scores = util.cos_sim(x, fint_emb_all).detach().cpu().numpy()[0]
    target_idx = pi_dict[y]
    # argsort is ascending, so the best candidate sits at the last position.
    argsort = np.argsort(cosine_scores)
    hits = np.flatnonzero(argsort == target_idx)
    if hits.size == 0:
        print(y)
        print("element not available in dict!")
        continue
    ranks.append(int(len(argsort) - hits[0]))
    
    

In [128]:
def calculate_rank(x_emb, y, fint_emb_all):
    """Return the rank of target phrase `y` among all candidates for one query.

    Candidates are ordered by cosine similarity to `x_emb`; rank 1 is the most
    similar candidate. The target's column index comes from the global `pi_dict`.
    NOTE(review): this redefines the calculate_rank from the original-model
    section; only the third parameter's name differs.

    Args:
        x_emb: embedding of the query text.
        y: held-out target phrase.
        fint_emb_all: embeddings of all candidate phrases, aligned with `pi_dict`.

    Returns:
        1-based rank of `y` as a Python int.

    Raises:
        KeyError: if `y` is not a key of `pi_dict`.
        ValueError: if the index of `y` is not a valid candidate column.
    """
    cosine_scores = util.cos_sim(x_emb, fint_emb_all)[0].detach().cpu().numpy()
    target_idx = pi_dict[y]
    # argsort is ascending, so the best candidate sits at the last position.
    argsort = np.argsort(cosine_scores)
    hits = np.flatnonzero(argsort == target_idx)
    if hits.size == 0:
        raise ValueError(
            f"index {target_idx} of target {y!r} is outside the {len(argsort)} candidates"
        )
    return int(len(argsort) - hits[0])
    

In [129]:
# Move the embeddings to CPU numpy arrays before handing them to the parallel workers.
fint_emb_all_np = fint_emb_all.detach().cpu().numpy()
fint_emb_x_np = fint_emb_x.detach().cpu().numpy()
# Alias for the targets used in the parallel call.
bio_y_ = bio_y

In [132]:
%%time

from joblib import Parallel, delayed

# Compute the rank of every held-out phrase with the fine-tuned embeddings, using 30 parallel jobs.
fin_ranks = Parallel(n_jobs=30)(delayed(calculate_rank)(x_emb, y, fint_emb_all_np) for x_emb, y in zip(fint_emb_x_np, bio_y_))

In [139]:
# (number of targets ranked better than 1000, total) for the fine-tuned model.
len([r for r in fin_ranks if r < 1000]), len(fin_ranks)

In [140]:
# (number of targets ranked better than 1000, total) for the original model;
# the total is taken from the same list that is being counted.
len([r for r in res if r < 1000]), len(res)

In [135]:
# Median rank: fine-tuned model first, original model second.
np.median(fin_ranks), np.median(res)

# build survey questions

## neighborhood score + negative sampling choices

In [48]:
import pandas as pd
from ast import literal_eval

# Load the saved PI / positives / negatives table (rebinds the global `df`).
df = pd.read_csv('twitter_pi_with_neighbors_tfidf.csv')
print(df.head())
# The list columns are stored as strings; parse them back into Python lists.
df.positives = df.positives.apply(literal_eval)
df.negatives = df.negatives.apply(literal_eval)

In [49]:
from tqdm import tqdm
from random import randint

import numpy as np

pis = df['pis']
positives = df['positives']
negatives = df['negatives']

sample_cnt = 500
questions = []
targets = []
other_choices = []

# Draw distinct rows so the same question cannot appear twice in the survey;
# the sample size is capped by the number of available rows.
sample_idices = np.random.choice(len(df), size=min(sample_cnt, len(df)), replace=False)

for qid in sample_idices:
    q = pis[qid]
    cur_pos = positives[qid]
    cur_neg = negatives[qid]
    # Correct answer: one of the positives, picked uniformly at random.
    target = cur_pos[randint(0,len(cur_pos)-1)]
    targets.append(target)
    questions.append(q)
    # Three distinct distractors; .tolist() stores them as a plain Python list
    # so the CSV cell is written as a list literal rather than a numpy array repr.
    other_choices.append(np.random.choice(cur_neg, size=3, replace=False).tolist())

res = pd.DataFrame({'question_pi': questions, 'ans_pi': targets, 'other_choices': other_choices})
res.to_csv('surrvey_tfidf_twitter.csv', index=False, header=True)

In [50]:
!head surrvey_tfidf_twitter.csv -n 100

## model based question generation

In [4]:
from sentence_transformers import SentenceTransformer, models, util

# Load the fine-tuned model from disk.
fint_model = SentenceTransformer('./models/miniLM-L6-finetuned/')

In [5]:
from tqdm import tqdm
from random import randint, sample


all_pis = df['pis']
# NOTE(review): `sims` is not used in the visible cells, and the CSV written in
# the "save" section has no 'similars' column — confirm which dataframe `df` is here.
sims = df['similars']
sample_cnt = 500

targets = []
other_choices = []

# Pick distinct question PIs so the same question cannot appear twice;
# the sample size is capped by the number of available PIs.
question_idxs = sample(range(len(all_pis)), min(sample_cnt, len(all_pis)))
questions = [all_pis[idx] for idx in question_idxs]



In [6]:
# Embed the sampled questions and all candidate PIs with the fine-tuned model.
fint_emb_x = fint_model.encode(questions, convert_to_tensor=True)
fint_emb_all = fint_model.encode(all_pis, convert_to_tensor=True)


In [8]:
import numpy as np

# For each question: answer = a random one of positions -6..-2 of the ascending
# similarity ranking; distractors = three distinct items from the lower half.
targets = []
other_choices = []

for x in tqdm(fint_emb_x):
    cosine_scores = util.cos_sim(x, fint_emb_all).detach().cpu().numpy()[0]
    argsort = np.argsort(cosine_scores)
    # Skips the single highest-scoring item — presumably the question itself; verify.
    best_k = argsort[-6:-1]
    worst_k = argsort[:len(argsort)//2]
    
    target_idx = np.random.choice(best_k)
    targets.append(all_pis[target_idx])
    
    other_idxs = np.random.choice(worst_k, size=3, replace=False)
    # The comprehension's `x` is local to the comprehension and does not affect the loop's `x`.
    other_choices.append([all_pis[x] for x in other_idxs])
    
    
# Save the questions with their answer and distractors.
res = pd.DataFrame({'question_pi': questions, 'ans_pi': targets, 'other_choices': other_choices})
res.to_csv('modelbased-selection.csv', index=False, header=True)
    

In [9]:
!head modelbased-selection.csv