In [None]:
# installing huggingface transformer for BERT
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.3 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 70.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 42.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.1


In [None]:
# relevant packages needed for BERT embedding generation
import numpy as np
import pandas as pd
import torch
import transformers

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [None]:
# Load the pretrained BERT tokenizer and encoder.
# The checkpoint name and the two Hugging Face classes are kept in separate
# variables so another checkpoint can be swapped in by editing one line.
name = 'bert-base-uncased'
model_class = transformers.BertModel
tokenizer_class = transformers.BertTokenizer

tokenizer = tokenizer_class.from_pretrained(name)  # text -> token ids
model = model_class.from_pretrained(name)          # token ids -> hidden states

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# load dataset
# CHOOSE THE GLUE CSV IN DRIVE
import io
import os

csv_name = 'GLUE-Quora.csv'  # REPLACE WITH YOUR CSV NAME

if os.path.exists(csv_name):
  # The file is already in the working directory (e.g. this runtime was used
  # before and the upload was saved there), so skip the slow upload.
  df = pd.read_csv(csv_name)
else:
  # First run on this runtime: ask the user to upload the CSV.
  from google.colab import files  # Colab-only module, imported only when needed
  uploaded = files.upload()  # upload the quora csv here, might take ~10 minutes
  if csv_name not in uploaded:
    raise KeyError(
        "expected an upload named " + repr(csv_name)
        + " but received " + repr(sorted(uploaded))
    )
  df = pd.read_csv(io.BytesIO(uploaded[csv_name]))

# Quora dataset is now stored in a Pandas Dataframe

Saving GLUE-Quora.csv to GLUE-Quora.csv


In [None]:
# Peek at the first few rows of the dataframe to confirm the CSV loaded with
# the expected columns (question1, question2, is_duplicate, ...).
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0
1,402555,536040,536041,How do I control my horny emotions?,How do you control your horniness?,1
2,360472,364011,490273,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0
3,150662,155721,7256,What can one do after MBBS?,What do i do after my MBBS ?,1
4,183004,279958,279959,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0


In [None]:
# Check the distribution of labels (duplicates vs non-duplicates) for general
# understanding. A single vectorized comparison replaces the row-by-row loop.
# As before, every row whose label is not exactly 1 counts as "not duplicate".
duplicateCount = int((df["is_duplicate"] == 1).sum())
notDuplicateCount = len(df) - duplicateCount

print("duplicate: " + str(duplicateCount))
print("not duplicate: " + str(notDuplicateCount))

duplicate: 134378
not duplicate: 229468


In [None]:
# Tokenize both question columns into lists of BERT token ids.
# add_special_tokens=True is passed to tokenizer.encode for every question.
# NOTE: the text columns are overwritten in place, so reload df before
# running this cell a second time.
for question_col in ("question1", "question2"):
  df[question_col] = df[question_col].apply(
      lambda text: tokenizer.encode(text, add_special_tokens=True)
  )

In [None]:
df.head()  # the question columns now hold lists of integer token ids instead of text

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222,"[101, 2129, 2003, 1996, 2166, 1997, 1037, 8785...","[101, 2029, 2504, 1997, 17463, 8156, 2003, 243...",0
1,402555,536040,536041,"[101, 2129, 2079, 1045, 2491, 2026, 7109, 2100...","[101, 2129, 2079, 2017, 2491, 2115, 7109, 9961...",1
2,360472,364011,490273,"[101, 2054, 5320, 14708, 3609, 2000, 2689, 200...","[101, 2054, 2064, 3426, 14708, 2000, 2272, 204...",0
3,150662,155721,7256,"[101, 2054, 2064, 2028, 2079, 2044, 16914, 591...","[101, 2054, 2079, 1045, 2079, 2044, 2026, 1691...",1
4,183004,279958,279959,"[101, 2073, 2064, 1045, 2424, 1037, 2373, 1330...","[101, 2052, 1037, 2117, 3199, 1999, 3994, 1010...",0


In [None]:
# Pad the tokenized input so that every sentence has the same length.
question_cols = ["question1", "question2"]

# Longest token-id list across both columns (0 if the dataframe is empty).
max_len = max((len(ids) for col in question_cols for ids in df[col]), default=0)

print("max length of input: " + str(max_len)) # 286 for this dataset; BERT accepts at most 512 tokens

# Right-pad every list with 0, the id that the attention-mask cell treats as
# padding. The padded lists are written back to the dataframe explicitly; the
# previous version assigned the None returned by list.extend to a temporary
# row copy and only worked because extend also mutated the list in place.
for col in question_cols:
  df[col] = df[col].apply(lambda ids: ids + [0] * (max_len - len(ids)))

max length of input: 286


In [None]:
# The display truncates long lists ("..."), so trailing padding zeros are not
# visible in this preview.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222,"[101, 2129, 2003, 1996, 2166, 1997, 1037, 8785...","[101, 2029, 2504, 1997, 17463, 8156, 2003, 243...",0
1,402555,536040,536041,"[101, 2129, 2079, 1045, 2491, 2026, 7109, 2100...","[101, 2129, 2079, 2017, 2491, 2115, 7109, 9961...",1
2,360472,364011,490273,"[101, 2054, 5320, 14708, 3609, 2000, 2689, 200...","[101, 2054, 2064, 3426, 14708, 2000, 2272, 204...",0
3,150662,155721,7256,"[101, 2054, 2064, 2028, 2079, 2044, 16914, 591...","[101, 2054, 2079, 1045, 2079, 2044, 2026, 1691...",1
4,183004,279958,279959,"[101, 2073, 2064, 1045, 2424, 1037, 2373, 1330...","[101, 2052, 1037, 2117, 3199, 1999, 3994, 1010...",0


In [None]:
# Pull each question column out as an (n_rows, 1) object array whose single
# cell per row holds that row's token-id list.
# The per-row loop that used to follow re-assigned every cell to a copy of
# itself (a no-op), so it has been removed.
q1 = df[["question1"]].to_numpy()
q2 = df[["question2"]].to_numpy()

In [None]:
# Copy the per-row token-id lists into dense (n_rows, max_len) matrices.
# np.empty defaults to float64; the ids are cast to int64 just before they are
# fed to the model.
q1arr = np.empty((len(q1), max_len))
for row_idx, cell in enumerate(q1):
  q1arr[row_idx] = np.array(cell[0])  # cell[0] is this row's token-id list

q2arr = np.empty((len(q2), max_len))
for row_idx, cell in enumerate(q2):
  q2arr[row_idx] = np.array(cell[0])

In [None]:
# sanity check: q1arr was allocated as (number of rows, max_len)
q1arr.shape

(363846, 286)

In [None]:
# Build the attention masks: 1 where the token id is non-zero, 0 where the id
# is the 0 used for padding.
is_real_token1 = q1arr != 0
is_real_token2 = q2arr != 0
attention_mask1 = np.where(is_real_token1, 1, 0)
attention_mask2 = np.where(is_real_token2, 1, 0)
attention_mask1.shape

(363846, 286)

In [None]:
from datetime import datetime

In [None]:
start = 0    # MODIFY THIS: first row of your share (inclusive)
stop = 5000  # MODIFY THIS: last row of your share (exclusive)
inc = 1      # rows per forward pass; larger values batch several rows together

# The dataset has 363846 rows in total; it is a lot faster if everyone runs a
# separate part and the results are put together afterwards.

### things to take note of and/or edit are in capital letters
# ONCE IT REACHES THIS CELL YOU CAN JUST LEAVE IT TO RUN, ONLY NEED TO COME BACK AND DOWNLOAD RESULT ONCE ITS DONE
# Suggested split, written as start (inclusive) - stop (exclusive) so that no
# row is skipped or embedded twice:
# 0-60000: XZ
# 60000-120000: Desmond
# 120000-180000: Gordon
# 180000-240000: Li Xue
# 240000-300000: Chi Sern
# 300000-363846: Wee Han

stop = min(stop, len(q1arr))  # never slice past the end of the data
model.eval()                  # make sure dropout is disabled for inference


def _embed_cls(ids, mask):
  """Return the first-token ([CLS]) embedding for a batch of padded token ids.

  ids:  (rows, max_len) array of token ids, right-padded with 0.
  mask: (rows, max_len) array holding 1 for real tokens and 0 for padding.
  Returns a float32 numpy array of shape (rows, hidden size).
  """
  # Columns that are padding for every row of the batch are masked out of
  # attention anyway, so dropping them saves compute. The result should match
  # the untrimmed call up to floating-point rounding.
  width = max(int(mask.sum(axis=1).max()), 1)
  input_ids = torch.tensor(ids[:, :width]).to(torch.int64)
  attention = torch.tensor(mask[:, :width]).to(torch.int64)

  # the following code is computationally expensive
  with torch.no_grad():  # inference only: skip gradient tracking to save memory and time
    outputs = model(input_ids, attention_mask=attention)

  # outputs[0] is the last hidden state; position 0 of every row is [CLS]
  return outputs[0][:, 0, :].numpy()


# Collect per-batch results in lists and concatenate once at the end;
# np.append inside the loop re-copies the whole result array on every iteration.
chunks1 = []
chunks2 = []

# Looping on `start < stop` embeds rows start .. stop-1. The previous
# `while end < 5000` stopped one row early (4999 rows for a 0_5000 file).
while start < stop:
  end = min(start + inc, stop)  # the last batch may be shorter than inc

  if (end%100 == 0):
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S") # in UTC for reference
    print("Current Time =", current_time)
    print("running the " + str(end / inc) + "-th loop")

  # generate embedding
  features1 = _embed_cls(q1arr[start : end], attention_mask1[start : end])
  features2 = _embed_cls(q2arr[start : end], attention_mask2[start : end])
  chunks1.append(features1)
  chunks2.append(features2)

  start = end

# Fall back to an empty (0, hidden size) array when no rows were processed.
empty_result = np.empty((0, model.config.hidden_size), np.float32)
res1 = np.concatenate(chunks1, axis=0) if chunks1 else empty_result
res2 = np.concatenate(chunks2, axis=0) if chunks2 else empty_result.copy()

Current Time = 03:37:16
running the 100.0-th loop
Current Time = 03:41:24
running the 200.0-th loop
Current Time = 03:45:30
running the 300.0-th loop
Current Time = 03:49:36
running the 400.0-th loop
Current Time = 03:53:42
running the 500.0-th loop
Current Time = 03:57:48
running the 600.0-th loop
Current Time = 04:01:55
running the 700.0-th loop
Current Time = 04:06:02
running the 800.0-th loop
Current Time = 04:10:09
running the 900.0-th loop
Current Time = 04:14:17
running the 1000.0-th loop
Current Time = 04:18:25
running the 1100.0-th loop
Current Time = 04:22:33
running the 1200.0-th loop
Current Time = 04:26:41
running the 1300.0-th loop
Current Time = 04:30:50
running the 1400.0-th loop
Current Time = 04:34:57
running the 1500.0-th loop
Current Time = 04:39:06
running the 1600.0-th loop
Current Time = 04:43:15
running the 1700.0-th loop
Current Time = 04:47:25
running the 1800.0-th loop
Current Time = 04:51:35
running the 1900.0-th loop
Current Time = 04:55:44
running the 2000

In [None]:
# sanity check: (number of embedded rows, embedding width)
res1.shape

(4999, 768)

In [None]:
# features1 holds only the most recent batch; it is overwritten on every loop iteration
features1

In [None]:
# shape of the most recent batch of question1 embeddings
features1.shape

In [None]:
# Write the question1 ("left") and question2 ("right") embeddings to CSV,
# one row per question and one column per embedding dimension.
from numpy import asarray, savetxt

arr1 = asarray(res1)
arr2 = asarray(res2)

savetxt('0_5000_left.csv', arr1, delimiter=',')   # UPDATE FILE NAME ACCORDINGLY (start_end_left.csv)
savetxt('0_5000_right.csv', arr2, delimiter=',')  # UPDATE FILE NAME ACCORDINGLY (start_end_right.csv)
# PLEASE DOUBLE CHECK THE NAMING CONVENTION && THANK YOU
# OPEN THE FILE FOLDER ON THE LEFT BAR, RIGHT CLICK THE TWO FILES, AND DOWNLOAD THEM
# (for chrome you might need to enable multiple downloads, you will get a pop-up telling you to do so)