In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import TFBertModel, AutoTokenizer
from sentence_transformers import SentenceTransformer

### Read the data set

In [24]:
df_tweets = pd.read_csv("train.csv")

### Label 0 == "anxious" and Label 3 == "Lonely"
### All the models are confused between anxious and lonely
df_tweets = df_tweets[df_tweets['labels'].isin(["Anxious","Lonely"])]
df_tweets.shape

(15176, 2)

### We will use the fine tuned version of the BERT model to generate embeddings

In [42]:

tuned_model_path = r"C:\Users\raoms_y121yee\Downloads\tuned-model"
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased",padding_side="left",truncation_side="right")
bert_model = TFBertModel.from_pretrained(tuned_model_path)


def num_batches(total,batch_size):
    if total % batch_size == 0:
        return total // batch_size
    else:
        return total // batch_size + 1


    
    
def generate_embeddings(texts,batch_size,tokenizer=tokenizer,model=bert_model):
    total = len(texts)
    batches = num_batches(total,batch_size)
    embeddings = np.zeros((total,768))
    for b in range(batches):
        if total % batch_size == 0 or b < batches -1 :
            batch_texts = texts[b*batch_size:b*batch_size + batch_size]
            tokens = tokenizer(batch_texts,return_tensors="tf",max_length=20,padding=True,truncation=True)
            e = model(tokens).last_hidden_state[:,0,:].numpy()
            embeddings[b*batch_size:b*batch_size+batch_size, :] = e
        else:
            batch_texts = texts[b*batch_size:]
            tokens = tokenizer(batch_texts,return_tensors="tf",max_length=20,padding=True,truncation=True)
            e = model(tokens).last_hidden_state[:,0,:].numpy()
            embeddings[b*batch_size:, :] = e
            
    return embeddings


##### Generate Embeddings here
anxious = df_tweets.loc[df_tweets['labels'] == 'Anxious','tweets'].tolist()
lonely = df_tweets.loc[df_tweets['labels'] == 'Lonely','tweets'].tolist()
anxious_emb = generate_embeddings(anxious,150)
lonely_emb = generate_embeddings(lonely,150)

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at C:\Users\raoms_y121yee\Downloads\tuned-model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [45]:
from sklearn.metrics import pairwise_distances

sm = 1 - pairwise_distances(anxious_emb,lonely_emb,metric="cosine")
sm

array([[0.66682042, 0.70910269, 0.62089508, ..., 0.68441209, 0.62252213,
        0.59645158],
       [0.64943828, 0.71428288, 0.60574092, ..., 0.62951312, 0.64138027,
        0.70752305],
       [0.64712961, 0.70815988, 0.65281628, ..., 0.65333793, 0.55360437,
        0.6880034 ],
       ...,
       [0.57163513, 0.59779753, 0.64126983, ..., 0.54625021, 0.57558718,
        0.68623517],
       [0.58568226, 0.71666109, 0.57453597, ..., 0.69081999, 0.53234898,
        0.57697925],
       [0.7400193 , 0.70203316, 0.61149947, ..., 0.64758908, 0.67233236,
        0.68118443]])

In [47]:
most_similar_index = np.argmax(sm,axis=1)
most_sim_score = np.max(sm,axis=1)

In [48]:
most_sim_score

array([1.        , 0.84670154, 0.86240447, ..., 0.83400858, 1.        ,
       1.        ])

In [52]:
lon = pd.Series(lonely)
lon = lon.iloc[most_similar_index]

In [55]:
res_df = pd.DataFrame({
    "Anxious": anxious,
    "Lonely": lon,
    "cosine_score": most_sim_score
})

final = res_df[res_df['cosine_score'] < 1]
final.to_csv("Similar.csv",index=False)

In [57]:
final.shape

(2120, 3)