# Loading twitter bios

In [1]:
import pickle
# Load the pre-extracted bios. Judging by the preview printed below, this is a
# list with one list of phrase strings per profile.
# NOTE(review): pickle.load can execute arbitrary code; only open
# 'twitter_bios.pkl' if it comes from a trusted source.
with open('twitter_bios.pkl', 'rb') as f:
    bios = pickle.load(f)
    
# Preview ten bios.
bios[100:110]

[['"', 'economia', 'team manager'],
 ['socilogo', 'hincha de', 'sanlorenzo'],
 ['twitch affiliate',
  'k-pop',
  'taylor swift',
  'disney',
  'she',
  'her',
  'stan account'],
 ['business owner', 'atheist', 'music', 'vinyl junkie', 'caffeine addict'],
 ['god', 'freedom', 'justice', 'si vis pacem', 'para bellum'],
 ['investigator', 'wife', 'broncos', 'cyclones'],
 ['actor',
  'traveller',
  'writer',
  'producer',
  'director',
  'grampa',
  'photographer',
  'actuallyautistic'],
 ['documentary filmmaker', 'audio producer', 'editor', 'columbiajourn'],
 ['proud', 'vegan', 'novaccinepassports', 'ffs'],
 ['levi', 'devote', 'eng']]

In [5]:
# Total number of bios loaded.
print(len(bios))

3997690


# Cleaning data

## phrase cleaning

In [16]:
# Phrase vocabulary: how often each phrase occurs across all bios.

from collections import Counter

pi_cnt = Counter(phrase for phrases in bios for phrase in phrases)

# Vocabulary size (number of distinct phrases).
len(pi_cnt)

37945

In [18]:
# The five most frequent phrases with their counts.
pi_cnt.most_common(5)

[('she', 511401),
 ('her', 434333),
 ('he', 261794),
 ('him', 215057),
 ('they', 163877)]

In [19]:
# The five least frequent phrases: tail of the frequency-sorted vocabulary.
pi_cnt.most_common()[-5:]

[('mo paz', 2),
 ('fala tu', 2),
 ('no bio yet', 2),
 ('lovely content', 1),
 ('skip for now', 1)]

## cleaning each bio

In [58]:
from tqdm import tqdm

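# keep phrases that are at least 2 characters long and occur at least 10 times in the corpus
# keep profiles that still have at least 2 distinct phrases after filtering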

def clean_pis(all_pis, min_len=2, min_count=10, min_phrases=2, counts=None):
    """Filter the phrases of every bio and drop bios left with too few phrases.

    A phrase is kept when it has at least ``min_len`` characters and occurs at
    least ``min_count`` times according to ``counts``. Duplicate phrases within
    a bio are collapsed to their first occurrence, so the surviving phrases
    keep their input order and the result is the same on every run (going
    through a ``set`` would make the order depend on string hashing).

    Args:
        all_pis: iterable of bios, each an iterable of phrase strings.
        min_len: minimum number of characters for a phrase to be kept.
        min_count: minimum frequency for a phrase to be kept.
        min_phrases: minimum number of distinct surviving phrases for a bio
            to be kept.
        counts: mapping from phrase to frequency; phrases missing from it
            count as 0. Defaults to the notebook-level ``pi_cnt``.

    Returns:
        A list with one list of phrases per surviving bio.
    """
    if counts is None:
        counts = pi_cnt

    result = []
    for pis in tqdm(all_pis):
        # dict keys de-duplicate like a set but remember insertion order
        kept = dict.fromkeys(
            pi for pi in pis
            if len(pi) >= min_len and counts.get(pi, 0) >= min_count
        )
        if len(kept) >= min_phrases:
            result.append(list(kept))

    return result
            
# Apply the filter to every bio and report how many bios survive.
cleaned_bios = clean_pis(bios)
print(len(cleaned_bios))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3997690/3997690 [00:13<00:00, 299336.17it/s]

3971632





# Contrastive learning

## generating positive negative samples

In [68]:
# from each person's bio I create at most K triplets
from random import randint
import numpy as np

def pair_in_list(current_pair, l):
    """Tell whether both members of ``current_pair`` appear together in an entry of ``l``.

    Membership is tested per entry, so the order of the two members inside an
    entry does not matter. Returns False when ``l`` is empty.
    """
    return any(
        current_pair[0] in seen and current_pair[1] in seen
        for seen in l
    )

def generate_triplets(bios, k=3, max_tries=100):
    """Sample ``[pos1, pos2, neg_sample]`` phrase triplets from a list of bios.

    For each bio, ``min(len(bio) - 1, k)`` rounds are attempted. Every round
    draws two phrases from the bio (a pair not already used for that bio,
    regardless of order) and completes it with a phrase drawn from a randomly
    chosen bio.

    A negative candidate is rejected when it also occurs in the current bio.
    Drawing blindly can return one of the bio's own phrases, even the anchor
    or the positive itself, which yields a contradictory triplet.

    Args:
        bios: sequence of bios, each a sequence of phrase strings.
        k: maximum number of triplets generated per bio.
        max_tries: cap on redraws, both for finding an unused pair and for
            finding a valid negative, so the function always terminates.
            If no unused pair is found the bio is abandoned; if no valid
            negative is found that round contributes no triplet.

    Returns:
        A list of ``[pos1, pos2, neg_sample]`` lists. The phrases are the
        values returned by ``np.random.choice``.
    """
    samples = []
    n_bios = len(bios)
    for bio in tqdm(bios, total=n_bios):
        own_phrases = set(bio)
        used_pairs = set()
        for _ in range(min(len(bio) - 1, k)):
            # Draw a pair of phrases that has not been used for this bio yet.
            for _ in range(max_tries):
                pos1, pos2 = np.random.choice(bio, size=2, replace=False)
                pair_key = frozenset((pos1, pos2))
                if pair_key not in used_pairs:
                    break
            else:
                # No unused pair left (e.g. the bio repeats one phrase).
                break
            used_pairs.add(pair_key)

            # Draw a negative phrase that does not belong to this bio.
            for _ in range(max_tries):
                source = bios[randint(0, n_bios - 1)]
                if len(source) == 0:
                    # np.random.choice raises on an empty sequence
                    continue
                neg_sample = np.random.choice(source, size=1)[0]
                if neg_sample not in own_phrases:
                    samples.append([pos1, pos2, neg_sample])
                    break
    return samples
            
# Build up to three triplets per cleaned bio (the recorded run took about 7.5 minutes).
triplets = generate_triplets(cleaned_bios, k=3)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3971632/3971632 [07:26<00:00, 8904.14it/s]


## save dataset

In [69]:
import pickle 

# Cache the generated triplets on disk so they can be reloaded without re-sampling.
with open('triplets.pkl', 'wb') as f:
    pickle.dump(triplets, f)

In [73]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the triplets for validation. A fixed random_state makes the
# shuffled split repeatable, so re-running this cell on the saved triplets
# produces the same train/validation sets.
SPLIT_SEED = 42
train_set, valid_set = train_test_split(
    triplets, test_size=0.3, shuffle=True, random_state=SPLIT_SEED)

print(f"train size: {len(train_set)}, test size: {len(valid_set)}")

train size: 7171899, test size: 3073672


In [79]:
!mkdir data

In [84]:
!mkdir models

In [82]:
import csv

# Write each split as a tab-separated file, one triplet per row.
# newline='' is what the csv module asks for on files handed to csv.writer
# (it stops the writer's own line endings from being translated a second
# time), and the explicit UTF-8 encoding keeps non-ASCII phrases from
# depending on the platform's default encoding.
for path, rows in (('data/valid.csv', valid_set), ('data/train.csv', train_set)):
    with open(path, 'w', newline='', encoding='utf-8') as f:
        write = csv.writer(f, delimiter='\t')
        write.writerows(rows)


## load dataset

In [None]:
import pickle 

# Restore the triplets from 'triplets.pkl'.
# NOTE(review): pickle.load can execute arbitrary code; only load a file you trust.
with open('triplets.pkl', 'rb') as f:
    triplets = pickle.load(f)

## finetuning

In [1]:

from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models
from torch.utils.data import DataLoader
from sentence_transformers.readers import TripletReader
from sentence_transformers.evaluation import TripletEvaluator
from datetime import datetime

import torch
import csv, os
import logging
import json

import logging
# Lower the root logger's threshold to DEBUG.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")  # smoke test: confirms DEBUG messages are emitted


# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())


# Name of the pretrained SentenceTransformer to fine-tune, and the number of training epochs.
model_name = 'bert-base-nli-stsb-mean-tokens'
num_epochs = 1



DEBUG:root:test


True


## Fine tuning on the samples

In [2]:

### Create a torch.DataLoader that passes training batch instances to our model
train_batch_size = 16
# Reader for the tab-separated triplet files written to ./data: columns 0, 1
# and 2 hold the three sentences of a triplet and the files have no header row.
triplet_reader = TripletReader(
    './data',
    s1_col_idx=0, 
    s2_col_idx=1, 
    s3_col_idx=2, 
    delimiter='\t', 
    quoting=csv.QUOTE_MINIMAL, 
    has_header=False)


# Load the pretrained model named by model_name.
model = SentenceTransformer(model_name)
# File names passed to triplet_reader.get_examples; presumably resolved
# relative to the reader's './data' folder (confirm against TripletReader).
valid_data_file = 'valid.csv'
train_data_file = 'train.csv'
# Passed to model.fit as output_path.
output_model_path = 'models'

print("Read Triplet train dataset")
# NOTE(review): max_examples=0 presumably means "no limit" (confirm against TripletReader).
train_reader = triplet_reader.get_examples(train_data_file, max_examples=0)


INFO:root:Load pretrained SentenceTransformer: bert-base-nli-stsb-mean-tokens
INFO:root:Did not find a '/' or '\' in the name. Assume to download model from server.
INFO:root:Load SentenceTransformer from folder: /nas/home/madani/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-base-nli-stsb-mean-tokens.zip
Some weights of the model checkpoint at /nas/home/madani/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-base-nli-stsb-mean-tokens.zip/0_BERT were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to b

Read Triplet train dataset


In [3]:
# Build the training dataset from the triplet examples for `model`, with
# parallel tokenization across up to 10 processes.
train_dataset = SentencesDataset(
    examples=train_reader,
    model=model,
    parallel_tokenization = True,
    max_processes = 10,
    chunk_size = 100000
)
# Shuffled batches of train_batch_size examples, loaded by 10 worker processes.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size, num_workers=10)


INFO:root:Start tokenization
INFO:root:Use multi-process tokenization with 10 processes
INFO:root:Num sentences: 7171899
INFO:root:Sentences 0 longer than max_seqence_length: 0
INFO:root:Sentences 1 longer than max_seqence_length: 0
INFO:root:Sentences 2 longer than max_seqence_length: 0


In [15]:
import pickle 

# Cache the tokenized dataset on disk so tokenization need not be repeated.
with open('train_dataset.pkl', 'wb') as f:
    pickle.dump(train_dataset, f)

In [None]:
import pickle 

# Restore the cached tokenized dataset.
# NOTE(review): this rebinds train_dataset only. A train_dataloader built
# before this cell still wraps the previous object, and after a kernel restart
# it does not exist at all, so rebuild the DataLoader after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load a file you trust.
with open('train_dataset.pkl', 'rb') as f:
    train_dataset = pickle.load(f)

In [8]:
# Triplet objective attached to the model being fine-tuned.
train_loss = losses.TripletLoss(model=model)

print("Read Triplet dev dataset")
# Evaluator built from the validation triplets; handed to model.fit further down.
evaluator = TripletEvaluator.from_input_examples(
    triplet_reader.get_examples(valid_data_file, max_examples=0), name='dev')


# Total number of batches (examples * epochs / batch size), of which 10% are warm-up steps.
warmup_steps = int(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data


Read Triplet dev dataset


In [12]:
!pip install pytorch==1.5.0

[31mERROR: Could not find a version that satisfies the requirement pytorch==1.5.0 (from versions: 0.1.2, 1.0.2)[0m
[31mERROR: No matching distribution found for pytorch==1.5.0[0m


In [13]:
# Show which Python executable the shell resolves to.
!which python

/nas/home/madani/miniconda3/envs/gen/bin/python


In [10]:
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=output_model_path)

print('done')



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]



Iteration:   0%|          | 0/448244 [00:00<?, ?it/s]



ValueError: not enough values to unpack (expected 2, got 1)

# Word2vec embedding

In [None]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

# NOTE(review): `sentences` is not defined in any cell shown in this notebook;
# it has to be created (presumably from the cleaned bios; confirm) before
# this cell can run on a fresh kernel.
print(len(sentences))

class Callback(CallbackAny2Vec):
    '''Print the training loss of every epoch when that epoch finishes.

    The value read from the model at each epoch end is differenced against the
    value read at the previous epoch end (presumably because the model reports
    a running total; confirm against the gensim docs).
    '''

    def __init__(self):
        # index of the epoch about to be reported
        self.epoch = 0
        # loss value read at the previous epoch end
        self.loss_to_be_subed = 0

    def on_epoch_end(self, model):
        '''Print this epoch's share of the loss and advance the epoch counter.'''
        running_total = model.get_latest_training_loss()
        epoch_loss = running_total - self.loss_to_be_subed
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        self.loss_to_be_subed = running_total
        self.epoch += 1

# Train a Word2Vec model over `sentences` (50-dimensional vectors, 50 epochs),
# with compute_loss=True so the callback can print a loss after each epoch,
# then save it to './w2v.model'.
# NOTE(review): this rebinds `model`, the name the fine-tuning cells above use
# for the SentenceTransformer.
monitor = Callback()
model = Word2Vec(sentences, vector_size=50, window=5, min_count=1,
                 negative=10, workers=30, epochs=50, callbacks=[monitor],
                 compute_loss=True)

model.save('./w2v.model')

In [7]:
# Reload the Word2Vec model saved at './w2v.model'.
model = Word2Vec.load("./w2v.model")

In [8]:
# The 15 vocabulary entries most similar to 'he', with their similarity scores.
model.wv.most_similar('he', topn=15)

[('pronouns: he', 0.9228585958480835),
 ('pronouns he', 0.8569645881652832),
 ('cis he', 0.8101291656494141),
 ('pronouns are he', 0.7875774502754211),
 ('them or he', 0.7698878645896912),
 ('my pronouns are he', 0.7616384625434875),
 ('them he', 0.7570477724075317),
 ('21 he', 0.6931623220443726),
 ('18 he', 0.6897026300430298),
 ('24 he', 0.6832929849624634),
 ('blm he', 0.6671035289764404),
 ('19 he', 0.6558746695518494),
 ('22 he', 0.627196192741394),
 ('23 he', 0.6264557838439941),
 ('20 he', 0.6185081005096436)]