# Loading wikipedia bios

## loading raw identities

In [None]:
import pandas as pd
from ast import literal_eval


# The `identities` column stores each list as its string repr; parse it back
# into real Python lists before extracting the bios.
df = pd.read_csv('FinalDataFrame5.csv')
df['identities'] = df['identities'].apply(literal_eval)
bios = df['identities'].tolist()

# Peek at the first few parsed bios.
bios[:5]

In [None]:
# The rest is similar to twitter bios approach

# Loading twitter bios

## load all raw data

In [None]:
import pickle
# Load the pickled bios; this rebinds `bios`, replacing whatever an earlier cell assigned.
# NOTE(review): absolute, user-specific path — only resolvable on the original machine.
# pickle.load can execute arbitrary code; only load files you trust.
with open('/user/smadani/navid/data/pis2020.pkl', 'rb') as f:
    bios = pickle.load(f)
    
bios[100:110]

# PI frequency and top PIs

In [None]:
from collections import Counter

# Corpus-wide occurrence count of every personal identifier (PI).
# assumes each bio is a list of phrase strings — TODO confirm
cntr = Counter(pi for bio in bios for pi in bio)

print(len(bios))

In [None]:
import numpy as np

# 90th percentile and mean of the per-PI occurrence counts.
freqs = [count for count in cntr.values()]
print(f"percentile freq: {np.percentile(freqs, 90)} mean freq: {np.mean(freqs)}")

In [None]:
# Keep only the PIs that occur more than 5 times in the corpus.
most_frequent_pis = dict(
    (phrase, count) for phrase, count in cntr.items() if count > 5
)
len(most_frequent_pis)

In [None]:
import pandas as pd

# Two-column table (pi, cnt), most frequent PI first.
df = (
    pd.DataFrame(list(most_frequent_pis.items()), columns=['pi', 'cnt'])
    .sort_values(by=['cnt'], ascending=False)
)

In [None]:
# Persist the (pi, cnt) table built above, without the index column.
# NOTE(review): the file name says "wiki" although `bios` was last loaded in the
# "Loading twitter bios" section — confirm which corpus this is meant to describe.
df.to_csv('wiki_most_frequent_pis.csv', index=False, header=True)

### most frequent neighbors

In [None]:
from tqdm import tqdm

# For every frequent PI, count how often each *other* frequent PI appears in
# the same bio.
neighbor_cnt = {}

for bio in tqdm(bios):
    # Only frequent PIs take part, both as anchor and as neighbour.
    frequent_in_bio = [b for b in bio if b in most_frequent_pis]
    for pi in frequent_in_bio:
        co_occurring = [b for b in frequent_in_bio if b != pi]
        neighbor_cnt.setdefault(pi, Counter()).update(co_occurring)

print(len(neighbor_cnt))

# Post-processing: drop PIs with fewer than two distinct neighbours.
sparse_pis = [pi for pi, adj in neighbor_cnt.items() if len(adj) < 2]
for pi in sparse_pis:
    del neighbor_cnt[pi]

print(f"size after pruning: {len(neighbor_cnt)}")

### calculating tf-idf

In [None]:
# Divide every co-occurrence count by the neighbour's corpus frequency.
# NOTE: this rewrites neighbor_cnt in place, so running the cell twice divides twice.
for neighs in tqdm(neighbor_cnt.values()):
    for phrase in list(neighs):
        neighs[phrase] = neighs[phrase] / most_frequent_pis[phrase]
        

### calculating using bi-partite method

In [None]:
from scipy import sparse, io

# Give every frequent PI a dense column index (dict iteration order).
pi_idx = {pi: col for col, pi in enumerate(most_frequent_pis)}

# Rows of the bipartite matrix: one list of PI column indices per user.
# Users with fewer than two frequent PIs are dropped.
usr_pi = []
for bio in bios:
    cur_usr_pis = [pi_idx[pi] for pi in bio if pi in most_frequent_pis]
    if len(cur_usr_pis) >= 2:
        usr_pi.append(cur_usr_pis)

print(f"original users: {len(bios)}")
print(f"no of users after pruning: {len(usr_pi)}")


In [None]:
# Flatten usr_pi into COO triplets: row = user id, column = PI index, value = 1.
usrs = []
pis = []
scores = []

# The inner list gets its own name (`usr_pis`). Previously the loop variable was
# also called `pis`, which rebound the output list to the list being iterated,
# so `pis.append(pi)` grew that list during iteration and the loop never ended.
for uid, usr_pis in enumerate(usr_pi):
    for pi in usr_pis:
        usrs.append(uid)
        pis.append(pi)
        scores.append(1)

In [None]:
%% time

BP_MATRIX_FILENAME = "./bipartite_pi.mtx"
output_matrix = sparse.coo_matrix((scores, (usrs, pis)))
io.mmwrite(BP_MATRIX_FILENAME, output_matrix)

In [None]:
# Show the on-disk size of the Matrix Market file written above.
!du -hs ./bipartite_pi.mtx

In [None]:
from scipy import io

# Read the matrix back to check the file round-trips. mmread takes only the
# source path/file; the previous second positional argument raised a TypeError.
io.mmread(BP_MATRIX_FILENAME)

In [None]:
# `scores` has one entry per (user, PI) pair; show its size and a short prefix
# instead of dumping the whole list into the notebook output.
len(scores), scores[:10]

In [None]:
!rm ./bipartite_pi.mtx.gz
!gzip ./bipartite_pi.mtx
!ls

In [None]:
import sys

# Make the scoring package importable from a sibling checkout.
# NOTE(review): relative path — only resolves when the notebook runs from its own directory.
sys.path.append("../bipartite-pairs/python-scoring/")
import score_data

# Score the gzipped matrix with the 'weighted_corr_exp' method and write the
# result to BP_SCORING_OUTPUT. Requires BP_MATRIX_FILENAME from the matrix-writing cell.
BP_SCORING_OUTPUT = './bipartite_output.csv.gz'
score_data.score_only(
    BP_MATRIX_FILENAME+".gz",
    ['weighted_corr_exp'],
    BP_SCORING_OUTPUT,
)

In [None]:
!zcat ./bipartite_output.csv.gz

In [None]:
import pandas as pd

# Load the gzipped scoring output; the path is re-declared so this cell does
# not depend on the scoring cell having run in the same session.
# Note: rebinds `df`.
BP_SCORING_OUTPUT = './bipartite_output.csv.gz'
df = pd.read_csv(BP_SCORING_OUTPUT)
df.head()

In [None]:
# recreate the neighboring dictionary



### save 

In [None]:
import pandas as pd

# For every PI keep its 5 strongest neighbours (positives) and up to 20
# frequent PIs it never co-occurs with (negatives).
pis = []
positives = []
negatives = []

# The loop variable is `neighbours`, not `cntr`, so the corpus-wide frequency
# Counter of that name is not clobbered.
for pi, neighbours in tqdm(neighbor_cnt.items()):
    # A PI is never in its own neighbour counter, so exclude it explicitly;
    # otherwise it could be sampled as a negative for itself.
    cur_neg = [x for x in most_frequent_pis if x != pi and x not in neighbours]
    if len(cur_neg) > 20:
        # .tolist() yields plain `str` rather than numpy string scalars, whose
        # repr under NumPy 2 (np.str_('...')) cannot be read back with literal_eval.
        cur_neg = np.random.choice(cur_neg, size=20, replace=False).tolist()
    cur_pos = [x[0] for x in neighbours.most_common(5)]
    if len(cur_neg) < 4 or len(neighbours) < 2:
        print(f"PASSING PI: {pi}")
        continue
    positives.append(cur_pos)
    pis.append(pi)
    negatives.append(cur_neg)

print(f"saving {len(pis)} pis")
df = pd.DataFrame({'pis': pis, 'positives': positives, 'negatives': negatives})
df.to_csv('twitter_pi_with_neighbors_tfidf.csv', index=False, header=True)

In [None]:
# Row count of a neighbours file.
# NOTE(review): no cell visible in this notebook writes this file name — confirm it exists.
!wc -l wiki_pi_with_neighbors_standard.csv

In [None]:
# First rows of the same neighbours file (see the review note on its origin in the cell above).
!head wiki_pi_with_neighbors_standard.csv

## create test dataset

In [None]:
from sklearn.model_selection import train_test_split

# Lower-case every PI so that case variants collapse into one phrase.
lower_bios = [[pi.lower() for pi in bio] for bio in bios]

# Split the lower-cased bios. The split previously used the raw `bios`, which
# left `lower_bios` computed but never used.
train, test = train_test_split(lower_bios, test_size=0.2, shuffle=True)
print(len(train), len(test))



In [None]:
import pickle

# Persist both splits; the test split is written first, then the train split.
for split_path, split in (
    ('./data/wiki_test_bios.pkl', test),
    ('./data/wiki_train_bios.pkl', train),
):
    with open(split_path, 'wb') as f:
        pickle.dump(split, f)

# Cleaning data

## phrase cleaning

In [None]:
import pickle

# From here on `bios` is the training split only (rebinds the name).
with open('./data/wiki_train_bios.pkl', 'rb') as f:
    bios = pickle.load(f)

In [None]:
# build a vocabulary of phrases
# `tqdm.tqdm_notebook` is deprecated; `tqdm.notebook.tqdm` is its replacement.
from tqdm.notebook import tqdm
from collections import Counter

# Occurrence count of every phrase in the training bios.
pi_cnt = Counter()
for bio in tqdm(bios):
    pi_cnt.update(bio)

len(pi_cnt)

In [None]:
from tqdm import tqdm

# Co-occurrence counts between phrases that share a bio.
neighbor_cnt = {}

for bio in tqdm(bios):
    # Restrict to phrases present in the vocabulary.
    known = [b for b in bio if b in pi_cnt]
    for pi in known:
        neighbor_cnt.setdefault(pi, Counter()).update(
            b for b in known if b != pi
        )

print(len(neighbor_cnt))


In [None]:
# Ten most frequent phrases with their counts.
pi_cnt.most_common(10)

In [None]:
# Five least frequent phrases: the tail of the full descending-count ranking.
pi_cnt.most_common()[-5:]

## cleaning each bio

In [None]:
from tqdm import tqdm

# Cleaning rules (all configurable):
#   - phrases of length at least `min_len`
#   - profiles with at least `min_pis` phrases
#   - PIs repeated at least `min_count` times in the dataset

def clean_pis(all_pis, min_len=2, min_count=1, min_pis=2):
    """De-duplicate and filter the phrases of every bio.

    Args:
        all_pis: iterable of bios, each a sequence of phrase strings.
        min_len: minimum number of characters a phrase must have.
        min_count: minimum corpus frequency (looked up in the global `pi_cnt`).
        min_pis: minimum number of surviving phrases for a bio to be kept.

    Returns:
        A list of bios; each bio is a list of unique phrases in first-seen order.
        The defaults reproduce the thresholds that used to be hard-coded.
    """
    result = []
    for pis in tqdm(all_pis):
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output is reproducible across runs (iteration order of a set of
        # strings changes with hash randomisation).
        current_pi = list(dict.fromkeys(
            pi for pi in pis
            if len(pi) >= min_len and pi_cnt[pi] >= min_count
        ))
        if len(current_pi) >= min_pis:
            result.append(current_pi)

    return result

cleaned_bios = clean_pis(bios)
print(len(cleaned_bios), len(bios))

# Contrastive learning

## generating positive negative samples

In [None]:
import seaborn as sns

# Distribution of the number of phrases per bio. `distplot` is deprecated;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot([len(b) for b in bios], kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
from random import randint
import numpy as np

def pair_in_list(current_pair, l):
    """Return True if some entry of `l` contains both members of `current_pair`.

    The check is order-insensitive: ['a', 'b'] matches a stored ['b', 'a'].
    """
    first, second = current_pair[0], current_pair[1]
    return any(first in pair and second in pair for pair in l)


def generate_triplets(bios, k=3):
    """Build (positive, positive, negative) phrase triplets.

    For every bio up to `k` distinct phrase pairs are drawn as positives
    (never more than len(bio) - 1). For each pair one phrase is drawn from the
    global vocabulary `pi_cnt` as the negative; it must not co-occur with
    either positive according to the global `neighbor_cnt`.

    Args:
        bios: list of bios, each a list of phrases.
        k: maximum number of triplets per bio.

    Returns:
        List of [pos1, pos2, negative] lists of plain Python strings.
    """
    samples = []
    pi_set = list(pi_cnt.keys())
    for bio in tqdm(bios, total=len(bios)):
        iters = min(len(bio)-1, k)
        chosen_pis = []
        for _ in range(iters):
            # .tolist() turns the numpy string scalars into plain `str`, so the
            # triplets pickle and print without numpy types.
            pos1, pos2 = np.random.choice(bio, size=2, replace=False).tolist()
            # Re-draw until the (unordered) pair is new for this bio.
            while pair_in_list([pos1, pos2], chosen_pis):
                pos1, pos2 = np.random.choice(bio, size=2, replace=False).tolist()
            chosen_pis.append([pos1, pos2])

            negative = pi_set[randint(0, len(pi_set)-1)]
            # A phrase is not its own neighbour in neighbor_cnt, so the
            # positives must be excluded explicitly; otherwise the "negative"
            # could be identical to pos1 or pos2.
            while (negative in (pos1, pos2)
                   or negative in neighbor_cnt[pos1]
                   or negative in neighbor_cnt[pos2]):
                negative = pi_set[randint(0, len(pi_set)-1)]
            samples.append([pos1, pos2, negative])
    return samples

triplets = generate_triplets(cleaned_bios, k=3)

In [None]:
# Total number of generated triplets.
print(len(triplets))

## save dataset

In [None]:
import pickle 

# Cache the generated triplets so the sampling step does not have to be repeated.
with open('data/triplets.pkl', 'wb') as f:
    pickle.dump(triplets, f)

In [None]:
import pickle 

# Reload the cached triplets (lets the notebook resume from this point).
with open('data/triplets.pkl', 'rb') as f:
    triplets = pickle.load(f)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 1% of the triplets for testing, then 1% of the rest for validation.
non_test_set, test_set = train_test_split(triplets, test_size=0.01, shuffle=True)
train_set, valid_set = train_test_split(non_test_set, test_size=0.01, shuffle=True)

print(f"train size: {len(train_set)}, validation size: {len(valid_set)}, test size: {len(test_set)}")

In [None]:
import csv

# writing the data into the file
# The csv module expects files opened with newline='' so that it controls the
# line endings itself (otherwise extra blank rows can appear on some platforms).
with open('data/valid.csv', 'w', newline='') as f:
    write = csv.writer(f, delimiter='\t')
    write.writerows(valid_set)

with open('data/train.csv', 'w', newline='') as f:
    write = csv.writer(f, delimiter='\t')
    write.writerows(train_set)

# The test triplets are kept as a pickle rather than a TSV.
with open('data/test.pckl', 'wb') as f:
    pickle.dump(test_set, f)

## finetuning

### building the dataset


In [35]:
import csv
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from torch.utils.data import DataLoader

# Every triplet row (pos1, pos2, neg) becomes two labelled pairs:
# (pos1, pos2) -> 1.0 and (pos1, neg) -> 0.0.
train_examples = []
with open('data/train.csv', newline='') as f:
    # Use the reader's default quoting, which matches how csv.writer produced
    # the file. With QUOTE_NONE, phrases that the writer had to quote (those
    # containing a quote character or a tab) were read back with the escaping
    # still in them.
    reader = csv.reader(f, delimiter='\t')
    for row in tqdm(reader):
        train_examples.append(InputExample(texts=[row[0], row[1]], label=1.0))
        train_examples.append(InputExample(texts=[row[0], row[2]], label=0.0))
    

381450it [00:02, 133765.67it/s]


### loading evaluation dataset

In [36]:

from sentence_transformers import evaluation

# Expand every validation triplet into a positive pair (score 1.0) and a
# negative pair (score 0.0) for the similarity evaluator.
sent1s = []
sent2s = []
scores = []
with open('data/valid.csv', newline='') as f:
    # Default quoting matches how csv.writer produced the file; QUOTE_NONE would
    # keep the writer's escaping inside the phrases. (The unused row counter
    # was dropped.)
    reader = csv.reader(f, delimiter='\t')
    for row in tqdm(reader):
        sent1s.append(row[0])
        sent1s.append(row[0])
        sent2s.append(row[1])
        sent2s.append(row[2])
        scores.append(1.0)
        scores.append(0.0)
evaluator = evaluation.EmbeddingSimilarityEvaluator(sent1s, sent2s, scores)


3854it [00:00, 518120.70it/s]


### creating data loaders

In [38]:
from sentence_transformers import SentenceTransformer, models
from torch import nn

# Pretrained checkpoint that will be fine-tuned.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
# Optional dense projection head, currently disabled:
# dense_model = models.Dense(in_features=model.get_sentence_embedding_dimension(), out_features=100, activation_function=nn.Tanh())
# model.add_module('3', dense_model)

# train_examples holds labelled phrase pairs (1.0 for positives, 0.0 for negatives);
# the loss is built on the cosine similarity of the two phrase embeddings.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=128)
train_loss = losses.CosineSimilarityLoss(model)

# Display the module stack of the loaded model.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

### training

In [None]:
# Directory where the fine-tuned model is written.
# NOTE(review): this path ends in "-wiki2" while the evaluation cells load
# './models/miniLM-L6-finetuned-wiki/' — confirm which checkpoint is intended.
output_model_path = 'models/miniLM-L6-finetuned-wiki2'

# Fine-tune for 5 epochs, running the validation evaluator every 2500 steps.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=5,
          evaluation_steps=2500,
          warmup_steps=5000,
          output_path=output_model_path)



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5961 [00:00<?, ?it/s]

## evaluation

### loading saved model

In [None]:
# Restrict this process to physical GPU 1.
# NOTE(review): presumably this must run before CUDA is first initialised to take
# effect; also, later cells address 'cuda:1', which would not exist if only one
# device is visible — confirm.
%env CUDA_VISIBLE_DEVICES=1

In [None]:
from sentence_transformers import SentenceTransformer, models

# Load a previously saved fine-tuned checkpoint (rebinds `model`).
model = SentenceTransformer('./models/miniLM-L6-finetuned-wiki/')

# Display the module stack of the loaded model.
model

### calculating encodings for all phrases

In [None]:
# Vocabulary = every distinct phrase that survived cleaning.
pis = list({pi for bio in cleaned_bios for pi in bio})
print(len(pis))

# One embedding row per phrase, in the same order as `pis`.
embeddings = model.encode(pis, convert_to_tensor=True)
        

In [None]:
def most_similar(pi, all_pis, all_pi_embs, model, k=11):
    """Return the phrases most similar to `pi` under `model`.

    The k highest-scoring entries of `all_pis` are selected by cosine
    similarity against `all_pi_embs`; `pi` itself is dropped if present, so at
    most k (phrase, score) tuples come back, ordered by increasing similarity.
    """
    query_emb = model.encode(pi, convert_to_tensor=True)
    sims = util.cos_sim(query_emb, all_pi_embs).detach().cpu().numpy()[0]
    top_idx = np.argsort(sims)[-k:]

    neighbours = []
    for j in top_idx:
        if all_pis[j] != pi:
            neighbours.append((all_pis[j], sims[j]))
    return neighbours

most_similar('mima', pis, embeddings, model, k=50)

In [None]:
def get_similarity(w1, w2, model=model):
    """Cosine similarity between the embeddings of two phrases under `model`.

    The default for `model` is bound when this cell runs, i.e. to whatever
    `model` refers to at definition time.
    """
    embs = [model.encode(w, convert_to_tensor=True) for w in (w1, w2)]
    return util.cos_sim(embs[0], embs[1])

# NOTE(review): the last two values of each row pass `model=model`, which is the
# same model as the default — presumably a comparison against another model was
# intended; confirm.
for mbti in ('isfj', 'intj', 'entj'):
    print(
        get_similarity(mbti, 'man'),
        get_similarity(mbti, 'woman'),
        get_similarity(mbti, 'man', model=model),
        get_similarity(mbti, 'woman', model=model),
    )


In [None]:
# `base_model` is otherwise first created in a later cell, so on a fresh
# top-to-bottom run this cell raised NameError. Load it here.
base_model = SentenceTransformer('all-MiniLM-L6-v2')

# Similarity of two personality-type labels under the untuned model.
intp_emb = base_model.encode('intp', convert_to_tensor=True)
esfj_emb = base_model.encode('esfj', convert_to_tensor=True)
util.cos_sim(intp_emb, esfj_emb)

### loading not tuned model and doing the same thing

In [None]:
from sentence_transformers import SentenceTransformer, models

# Untuned reference model and its embeddings for the whole vocabulary.
base_model = SentenceTransformer('all-MiniLM-L6-v2')
base_embs = base_model.encode(pis, convert_to_tensor=True)
# The full vocabulary-by-vocabulary similarity matrix that used to be computed
# here was never read (the name is reassigned in the personality analysis) and
# grows quadratically with the vocabulary size, so it is no longer built.

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Nearest neighbours of one phrase under the fine-tuned and the original model.
target_word = 'build the wall'
neighbour_columns = ['identifier', 'similarity']

df1 = pd.DataFrame(
    most_similar(target_word, pis, embeddings, model), columns=neighbour_columns
).assign(model='fine-tuned-sentence-bert')
df2 = pd.DataFrame(
    most_similar(target_word, pis, base_embs, base_model), columns=neighbour_columns
).assign(model='original-sentence-bert')

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15,6))

# One panel per model, identical styling.
for ax, frame, title in (
    (ax1, df1, 'fine-tuned-sentence-bert'),
    (ax2, df2, 'original-sentence-bert'),
):
    ax.scatter(x=frame['identifier'], y=frame['similarity'])
    ax.tick_params(axis='x', rotation=-60)
    ax.set_xlabel('phrase')
    ax.set_ylabel('similarity')
    ax.set_title(title)

plt.show()

### analyzing personalities

In [None]:
# Report, for each of the 16 type labels, whether it (or its lower-case form)
# is in the vocabulary.
mbti_types = ['ESTJ', 'ENTJ', 'ESFJ', 'ENFJ', 'ISTJ', 'ISFJ', 'INTJ', 'INFJ', 'ESTP', 'ESFP', 'ENTP', 'ENFP', 'ISTP', 'ISFP', 'INTP', 'INFP']
for personality in mbti_types:
    found = personality in pis or personality.lower() in pis
    print(f"{personality}: True" if found else f"{personality}: False")

In [None]:
# Lower-cased type labels, embedded with both models.
personalities = [
    label.lower()
    for label in ['ESTJ', 'ENTJ', 'ESFJ', 'ENFJ', 'ISTJ', 'ISFJ', 'INTJ', 'INFJ', 'ESTP', 'ESFP', 'ENTP', 'ENFP', 'ISTP', 'ISFP', 'INTP', 'INFP']
]

pers_emb = model.encode(personalities, convert_to_tensor=True)
pers_emb_base = base_model.encode(personalities, convert_to_tensor=True)

def _pairwise_cosine(embs):
    """All-pairs cosine similarity as a NumPy array."""
    return util.cos_sim(embs, embs).detach().cpu().numpy()

base_cosine_scores = _pairwise_cosine(pers_emb_base)
cosine_scores = _pairwise_cosine(pers_emb)

In [None]:
# Pairwise similarity of the type labels, computed from `model`'s embeddings.
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(cosine_scores, ax=ax)
ax.set_xticklabels(personalities, rotation=90)
ax.set_yticklabels(personalities, rotation=0)

# plt.xticks(ticks=personalities)
# plt.yticks(ticks=personalities)

In [None]:
# Pairwise similarity of the type labels, computed from `base_model`'s embeddings.
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(base_cosine_scores, ax=ax)
ax.set_xticklabels(personalities, rotation=90)
ax.set_yticklabels(personalities, rotation=0)

### comparing in gensim vocab

In [None]:
import gensim.downloader as api

# Load the "glove-wiki-gigaword-50" vectors through the gensim downloader and
# sanity-check them with a nearest-neighbour query.
w2v = api.load("glove-wiki-gigaword-50")
w2v.most_similar("glass")

In [None]:
# Embed every word of the GloVe vocabulary with the sentence model.
vocab = list(w2v.key_to_index)

vocab_embs = model.encode(vocab, convert_to_tensor=True)


In [None]:
# Nearest GloVe-vocabulary words to 'vaccine' under `model`.
most_similar('vaccine', vocab, vocab_embs, model, k=10)

In [None]:
# Nearest GloVe-vocabulary words to 'blm' under `model`.
most_similar('blm', vocab, vocab_embs, model, k=50)

# Word2vec embedding

In [None]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

import pickle

# Reload the pickled bios for Word2Vec training (rebinds `bios`).
# NOTE(review): absolute, user-specific path; pickle.load must only be used on trusted files.
with open('/user/smadani/navid/data/pis2020.pkl', 'rb') as f:
    bios = pickle.load(f)
    
bios[100:110]

In [None]:

print(len(bios))

class Callback(CallbackAny2Vec):
    """Print the loss attributable to each epoch at the end of that epoch."""

    def __init__(self):
        # Loss value reported at the end of the previous epoch.
        self.loss_to_be_subed = 0
        # Zero-based index of the epoch being reported.
        self.epoch = 0

    def on_epoch_end(self, model):
        # The reported loss is treated as a running total, so the per-epoch
        # figure is the difference from the previous reading.
        running_loss = model.get_latest_training_loss()
        print('Loss after epoch {}: {}'.format(self.epoch, running_loss - self.loss_to_be_subed))
        self.loss_to_be_subed = running_loss
        self.epoch += 1

# Train Word2Vec on the bios, treating each bio as one "sentence" of phrases.
# compute_loss=True is needed for the callback's per-epoch loss report.
# Note: rebinds `model`, which earlier cells use for the sentence-transformer.
monitor = Callback()
model = Word2Vec(bios, vector_size=256, window=5, min_count=1,
                 negative=10, workers=30, epochs=100, callbacks=[monitor],
                 compute_loss=True)

model.save('./models/w2v.model')

In [None]:
# Reload the saved Word2Vec model (rebinds `model`).
model = Word2Vec.load("./models/w2v.model")

In [None]:
# Fifteen nearest neighbours of the token 'he' in the Word2Vec space.
model.wv.most_similar('he', topn=15)

# Downstream tasks

## hold-one-out prediction of PIs

### clean data

In [1]:
import pickle

# Load both saved splits.
with open('data/wiki_test_bios.pkl', 'rb') as f:
    test_bios = pickle.load(f)

with open('data/wiki_train_bios.pkl', 'rb') as f:
    train_bios = pickle.load(f)
    
# Train bios first, then test bios.
all_bios = train_bios + test_bios
print(len(all_bios))

985429


In [2]:
# build a vocabulary of phrases
# `tqdm.tqdm_notebook` is deprecated (it emits a TqdmDeprecationWarning);
# `tqdm.notebook.tqdm` is its replacement.
from tqdm.notebook import tqdm
from collections import Counter

# Occurrence count of every phrase over train + test bios.
pi_cnt = Counter()
for bio in tqdm(all_bios):
    pi_cnt.update(bio)

len(pi_cnt)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for bio in tqdm(all_bios):


  0%|          | 0/985429 [00:00<?, ?it/s]

55009

In [3]:

# Cleaning rules (all configurable; defaults equal the values the code used):
#   - phrases of length at least `min_len` (2)
#   - profiles with at least `min_pis` phrases (2)
#   - PIs repeated at least `min_count` times in the dataset (1)
# NOTE(review): the earlier comment promised a frequency threshold of 10 while
# the code applied 1; pass min_count=10 if that was the intent.

def clean_pis(all_pis, min_len=2, min_count=1, min_pis=2):
    """De-duplicate and filter the phrases of every bio.

    Args:
        all_pis: iterable of bios, each a sequence of phrase strings.
        min_len: minimum number of characters a phrase must have.
        min_count: minimum corpus frequency (looked up in the global `pi_cnt`).
        min_pis: minimum number of surviving phrases for a bio to be kept.

    Returns:
        A list of bios; each bio is a list of unique phrases in first-seen order.
    """
    result = []
    for pis in tqdm(all_pis):
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output (and the hold-out index drawn from it later) is reproducible;
        # iteration order of a set of strings changes with hash randomisation.
        current_pi = list(dict.fromkeys(
            pi for pi in pis
            if len(pi) >= min_len and pi_cnt[pi] >= min_count
        ))
        if len(current_pi) >= min_pis:
            result.append(current_pi)

    return result

cleaned_all_bios = clean_pis(all_bios)
cleaned_test_bios = clean_pis(test_bios)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for pis in tqdm(all_pis):


  0%|          | 0/985429 [00:00<?, ?it/s]

  0%|          | 0/197086 [00:00<?, ?it/s]

In [4]:
# First three cleaned bios.
cleaned_all_bios[:3]

[['actress', 'bondage model'],
 ['volleyball player', 'gold medalist'],
 ['singer', 'guitarist']]

### create dataset

In [5]:
import numpy as np

# Hold-one-out examples: (comma-joined remaining phrases, held-out phrase).
test_ds = []

for bio in cleaned_test_bios:
    # Pick one position uniformly at random; that phrase becomes the target.
    hold_out_idx = np.random.randint(0, len(bio))
    target = bio[hold_out_idx]
    remaining = ', '.join(bio[:hold_out_idx] + bio[hold_out_idx + 1:])

    test_ds.append((remaining, target))

print(len(test_ds))

62250


In [6]:
# Candidate vocabulary: every distinct phrase in the cleaned bios.
all_pis = list({pi for bio in cleaned_all_bios for pi in bio})

In [7]:
# Number of candidate phrases.
print(len(all_pis))

46189


In [8]:
from collections import OrderedDict

# Phrase -> position in `all_pis` (entries are unique, so positions are 0..n-1).
pi_dict = OrderedDict((p, position) for position, p in enumerate(all_pis))

In [9]:
# Should equal len(all_pis).
print(len(pi_dict))

46189


In [10]:
# Separate model inputs (remaining phrases) from targets (held-out phrase).
bio_x, bio_y = zip(*test_ds)
print(len(bio_y), len(bio_x))

62250 62250


In [14]:
from tqdm import tqdm
import torch 

def get_results_batched(model, tokenizer, str_l, bs=256, average_k_layers=1):
    """Embed strings in batches using the first-token hidden state.

    For each string, the first-token vector of each of the last
    `average_k_layers` entries of the model's `hidden_states` is taken and
    averaged.

    Args:
        model: transformer that returns `hidden_states` (must be loaded with
            output_hidden_states=True).
        tokenizer: matching tokenizer; called with padding and truncation.
        str_l: sequence of input strings.
        bs: batch size.
        average_k_layers: number of trailing hidden-state layers to average.

    Returns:
        CPU tensor with one row per input string.
    """
    # Follow the model's own device instead of a notebook-level `device`
    # variable: other cells reassign that variable (e.g. to 'cpu' for scoring),
    # which would send the tokens to the wrong device on a later call.
    model_device = next(model.parameters()).device

    result = []
    # The context manager closes the progress bar even if a batch fails.
    with tqdm(total=len(str_l)) as pbar, torch.no_grad():
        for start in range(0, len(str_l), bs):
            batch = list(str_l[start:start+bs])
            tokens = tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(model_device)
            hidden_states = model(**tokens).hidden_states

            # First-token embedding of each selected layer.
            first_token_per_layer = [
                hidden_states[k][:, 0, :] for k in range(-average_k_layers, 0)
            ]
            average_embs = torch.stack(first_token_per_layer, dim=1).mean(dim=1)

            result.append(average_embs.detach().cpu())
            # Advance by the real batch size so the bar ends exactly at total
            # (the last batch is usually shorter than `bs`).
            pbar.update(len(batch))
    return torch.cat(result, dim=0)


### load original bert and embeddings

In [15]:
%%time

from transformers import BertTokenizer, BertModel, BertConfig
import torch

# NOTE(review): hard-coded GPU index; requires at least two visible CUDA devices.
device = 'cuda:1'

# Plain BERT baseline; hidden states are requested because get_results_batched
# reads `hidden_states` from the model output.
config = BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True)
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased", config=config)
bert_model = bert_model.to(device)
bert_model.eval()


# Embed the hold-out contexts and every candidate phrase (last 3 layers averaged).
bert_emb_x = get_results_batched(bert_model, bert_tokenizer, bio_x, average_k_layers=3)
bert_emb_all = get_results_batched(bert_model, bert_tokenizer, all_pis, average_k_layers=3)

print(bert_emb_x.shape, bert_emb_all.shape)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

  0%|          | 0/62250 [00:00<?, ?it/s][A
  1%|          | 512/62250 [00:00<00:15, 4094.50it/s][A
  2%|▏         | 1024/62250 [00:00<00:15, 3982.84it/

 35%|███▍      | 16128/46189 [00:02<00:04, 6149.22it/s][A
 37%|███▋      | 16896/46189 [00:02<00:04, 6117.35it/s][A
 38%|███▊      | 17664/46189 [00:02<00:04, 6066.24it/s][A
 40%|███▉      | 18432/46189 [00:02<00:04, 6049.76it/s][A
 42%|████▏     | 19200/46189 [00:03<00:04, 6062.07it/s][A
 43%|████▎     | 19968/46189 [00:03<00:04, 6047.83it/s][A
 45%|████▍     | 20736/46189 [00:03<00:04, 6118.20it/s][A
 47%|████▋     | 21504/46189 [00:03<00:04, 6091.20it/s][A
 48%|████▊     | 22272/46189 [00:03<00:03, 6028.85it/s][A
 50%|████▉     | 23040/46189 [00:03<00:03, 6087.49it/s][A
 52%|█████▏    | 23808/46189 [00:03<00:03, 6040.97it/s][A
 53%|█████▎    | 24576/46189 [00:03<00:03, 6045.94it/s][A
 55%|█████▍    | 25344/46189 [00:04<00:03, 5988.44it/s][A
 57%|█████▋    | 26112/46189 [00:04<00:03, 5994.74it/s][A
 58%|█████▊    | 26880/46189 [00:04<00:03, 6008.05it/s][A
 60%|█████▉    | 27648/46189 [00:04<00:03, 5770.18it/s][A
 62%|██████▏   | 28416/46189 [00:04<00:03, 5824.28it/s]

torch.Size([62250, 768]) torch.Size([46189, 768])
CPU times: user 20.3 s, sys: 1.92 s, total: 22.2 s
Wall time: 31.8 s





### calculate rank score

In [16]:
%%time
import torch
from sentence_transformers import SentenceTransformer, models, util

device = 'cpu'
cosine_scores = util.cos_sim(bert_emb_x.to(device), bert_emb_all.to(device))
# Inner argsort: candidates ordered by decreasing similarity.
# Outer argsort: inverts that permutation, giving every candidate's rank (0 = best).
ranks = torch.argsort(torch.argsort(cosine_scores, dim=1, descending=True), dim=1)
target_idxs = torch.tensor(list(map(pi_dict.__getitem__, bio_y)), dtype=torch.int64)
print(target_idxs.shape)
# Rank of the held-out phrase for every test example, as a float column vector.
target_ranks = ranks.gather(1, target_idxs.view(-1, 1)).float()




torch.Size([62250])
CPU times: user 9min 17s, sys: 33.2 s, total: 9min 50s
Wall time: 4min 19s


In [17]:
# (number of targets ranked in the top 100, mean target rank)
int((target_ranks < 100).sum()), torch.mean(target_ranks)

(2072, tensor(14324.7344))

### load original sentence bert and embeddings

In [18]:
%%time

from sentence_transformers import SentenceTransformer, models, util
from transformers import AutoTokenizer, AutoModel

# NOTE(review): hard-coded GPU index; requires at least two visible CUDA devices.
device = 'cuda:1'
# Earlier variant that used the sentence-transformers encode() API, kept for reference:
# orig_model = SentenceTransformer('all-MiniLM-L6-v2')
# orig_model = orig_model.to(device)

#orig_emb_x = orig_model.encode(bio_x, convert_to_tensor=True)
#orig_emb_all = orig_model.encode(all_pis, convert_to_tensor=True)

# Load the same checkpoint through transformers so get_results_batched can read
# its hidden states.
sbert_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
sbert_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2', output_hidden_states=True)
sbert_model = sbert_model.to(device)
sbert_model.eval()

# Embed the hold-out contexts and every candidate phrase (last 3 layers averaged).
sbert_emb_x = get_results_batched(sbert_model, sbert_tokenizer, bio_x, bs=256, average_k_layers=3)
sbert_emb_all = get_results_batched(sbert_model, sbert_tokenizer, all_pis, bs=256, average_k_layers=3)



  0%|          | 0/62250 [00:00<?, ?it/s][A
  3%|▎         | 2048/62250 [00:00<00:03, 18965.64it/s][A
  7%|▋         | 4096/62250 [00:00<00:03, 18974.41it/s][A
 11%|█         | 6656/62250 [00:00<00:02, 21869.84it/s][A
 16%|█▌        | 9728/62250 [00:00<00:02, 24447.43it/s][A
 20%|██        | 12544/62250 [00:00<00:01, 25709.87it/s][A
 25%|██▌       | 15616/62250 [00:00<00:01, 26951.42it/s][A
 30%|███       | 18688/62250 [00:00<00:01, 27497.28it/s][A
 35%|███▍      | 21504/62250 [00:00<00:01, 27238.28it/s][A
 39%|███▉      | 24576/62250 [00:00<00:01, 27515.59it/s][A
 44%|████▍     | 27648/62250 [00:01<00:01, 27983.68it/s][A
 49%|████▉     | 30464/62250 [00:01<00:01, 27811.36it/s][A
 54%|█████▍    | 33536/62250 [00:01<00:01, 28409.32it/s][A
 59%|█████▉    | 36608/62250 [00:01<00:00, 28282.45it/s][A
 64%|██████▎   | 39680/62250 [00:01<00:00, 27983.13it/s][A
 68%|██████▊   | 42496/62250 [00:01<00:00, 27957.07it/s][A
 73%|███████▎  | 45568/62250 [00:01<00:00, 28177.46it/s][

CPU times: user 8.63 s, sys: 3.18 s, total: 11.8 s
Wall time: 5.07 s





### calculate rank score

In [21]:
%%time
import torch

device='cpu'
cosine_scores = util.cos_sim(sbert_emb_x, sbert_emb_all)
# Double argsort turns similarities into per-row ranks (0 = most similar).
ranks = torch.argsort(torch.argsort(cosine_scores.to(device), dim=1, descending=True), dim=1)
target_idxs = torch.tensor(list(map(pi_dict.__getitem__, bio_y)), dtype=torch.int64)
# Rank of the held-out phrase for every test example, as a float column vector.
target_ranks = ranks.gather(1, target_idxs.view(-1, 1)).float()


CPU times: user 9min 1s, sys: 27.5 s, total: 9min 28s
Wall time: 4min 15s


In [22]:
# (number of targets ranked in the top 100, mean target rank)
int((target_ranks < 100).sum()), torch.mean(target_ranks)

(4406, tensor(7061.2729))

### load finetuned sentence bert and embeddings

In [30]:
%%time

from sentence_transformers import SentenceTransformer, models, util
from transformers import AutoTokenizer, AutoModel

# This model is run on the CPU.
device = 'cpu'

# Load the fine-tuned checkpoint through transformers so get_results_batched can
# read its hidden states.
# NOTE(review): get_results_batched pools the first token, whereas the
# sentence-transformers model shown earlier uses mean pooling followed by
# normalisation — confirm this is the intended way to evaluate it.
fint_tokenizer = AutoTokenizer.from_pretrained('./models/miniLM-L6-finetuned-wiki/')
fint_model = AutoModel.from_pretrained('./models/miniLM-L6-finetuned-wiki/', output_hidden_states=True)
fint_model = fint_model.to(device)
fint_model.eval()

# Embed the hold-out contexts and every candidate phrase (last 3 layers averaged).
fint_emb_x = get_results_batched(fint_model, fint_tokenizer, bio_x, bs=256, average_k_layers=3)
fint_emb_all = get_results_batched(fint_model, fint_tokenizer, all_pis, bs=256, average_k_layers=3)

CPU times: user 467 ms, sys: 34.4 ms, total: 502 ms
Wall time: 232 ms


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

### calculate rank score

In [24]:
%%time
import torch

device='cpu'
cosine_scores_fint = util.cos_sim(fint_emb_x, fint_emb_all)
# Double argsort turns similarities into per-row ranks (0 = most similar).
ranks_fint = torch.argsort(torch.argsort(cosine_scores_fint.to(device), dim=1, descending=True), dim=1)
target_idxs = torch.tensor(list(map(pi_dict.__getitem__, bio_y)), dtype=torch.int64)
# Rank of the held-out phrase for every test example, as a float column vector.
target_ranks_fint = ranks_fint.gather(1, target_idxs.view(-1, 1)).float()


CPU times: user 8min 59s, sys: 27.4 s, total: 9min 26s
Wall time: 4min 15s


In [25]:
# (number of targets ranked in the top 100, mean target rank)
int((target_ranks_fint < 100).sum()), torch.mean(target_ranks_fint)

(17930, tensor(8684.3525))

# build survey questions

## neighborhood score + negative sampling choices

In [None]:
import pandas as pd
from ast import literal_eval

# The list columns come back from CSV as strings; print the raw frame first,
# then parse them into Python lists.
df = pd.read_csv('twitter_pi_with_neighbors_tfidf.csv')
print(df.head())
for list_column in ('positives', 'negatives'):
    df[list_column] = df[list_column].apply(literal_eval)

In [None]:
from tqdm import tqdm
from random import randint

# Build multiple-choice questions: one PI, one correct neighbour, three distractors.
pis = df['pis']
positives = df['positives']
negatives = df['negatives']

sample_cnt = 500
questions = []
targets = []
other_choices = []

# Draw the question rows without replacement so that no PI is asked twice
# (randint sampling could repeat rows); capped at the number of available rows.
sample_idices = np.random.choice(len(df), size=min(sample_cnt, len(df)), replace=False)

for qid in sample_idices:
    cur_pos = positives[qid]
    cur_neg = negatives[qid]
    # Correct answer: one of the stored positive neighbours, chosen uniformly.
    targets.append(cur_pos[randint(0, len(cur_pos)-1)])
    questions.append(pis[qid])
    # .tolist() stores a plain list of str; a NumPy array would be written to
    # the CSV as "['a' 'b' 'c']", which cannot be parsed back.
    other_choices.append(np.random.choice(cur_neg, size=3, replace=False).tolist())

res = pd.DataFrame({'question_pi': questions, 'ans_pi': targets, 'other_choices': other_choices})
res.to_csv('surrvey_tfidf_twitter.csv', index=False, header=True)

In [None]:
# Preview the first 100 lines of the generated survey file.
!head surrvey_tfidf_twitter.csv -n 100

## model based question generation

In [None]:
from sentence_transformers import SentenceTransformer, models, util

# Load a fine-tuned sentence-transformer for model-based question generation (rebinds `fint_model`).
# NOTE(review): this directory name differs from the '-wiki' checkpoint used in
# the evaluation section — confirm it is the intended model.
fint_model = SentenceTransformer('./models/miniLM-L6-finetuned/')

In [None]:
from tqdm import tqdm
from random import randint


# Candidate phrases as a plain list, so positional indexing and encode() behave
# the same as for any list of strings.
# The former `df['similars']` lookup was removed: the neighbours CSV only has
# the columns pis / positives / negatives, and the value was never used.
all_pis = df['pis'].tolist()
sample_cnt = 500

targets = []
other_choices = []

# Draw `sample_cnt` question phrases uniformly at random (repeats possible).
questions = [all_pis[randint(0, len(all_pis)-1)] for _ in range(sample_cnt)]



In [None]:
# Embed the sampled questions and the full candidate list (rebinds both names).
fint_emb_x = fint_model.encode(questions, convert_to_tensor=True)
fint_emb_all = fint_model.encode(all_pis, convert_to_tensor=True)


In [None]:
import numpy as np

targets = []
other_choices = []

for query_emb in tqdm(fint_emb_x):
    sims = util.cos_sim(query_emb, fint_emb_all).detach().cpu().numpy()[0]
    ranking = np.argsort(sims)  # candidate indices by increasing similarity
    # Five most similar candidates, skipping the single best match
    # (presumably the query phrase itself).
    nearest = ranking[-6:-1]
    # The less similar half of the candidates supplies the distractors.
    farthest_half = ranking[:len(ranking)//2]

    targets.append(all_pis[np.random.choice(nearest)])

    distractor_idxs = np.random.choice(farthest_half, size=3, replace=False)
    other_choices.append([all_pis[j] for j in distractor_idxs])


res = pd.DataFrame({'question_pi': questions, 'ans_pi': targets, 'other_choices': other_choices})
res.to_csv('modelbased-selection.csv', index=False, header=True)
    

In [None]:
# Preview the model-based survey file.
!head modelbased-selection.csv