In [1]:
import pandas as pd
data = pd.read_csv('LinkedIn_posts_moez_10sample.csv')
data.head()

Unnamed: 0,Post_ID,Text
0,1,Normalization is a technique often applied as ...
1,2,Training a Machine Learning Model in PyCaret e...
2,3,If an AI algorithm turns the copyrighted work ...
3,4,Do you know you can write nested functions in ...
4,5,Have you still not used PyCaret? Maybe it's ti...


# OpenAI

In [8]:
from openai import OpenAI
OPENAI_API_TOKEN = " "
client = OpenAI(api_key=OPENAI_API_TOKEN)

In [9]:
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [10]:
data['embedding'] = data['Text'].astype(str).apply(get_embedding)
data.head()

Unnamed: 0,Post_ID,Text,embedding
0,1,Normalization is a technique often applied as ...,"[0.04812345653772354, 0.050626229494810104, 0...."
1,2,Training a Machine Learning Model in PyCaret e...,"[-0.0033819079399108887, 0.007453019730746746,..."
2,3,If an AI algorithm turns the copyrighted work ...,"[0.03031008318066597, -0.03753546252846718, -0..."
3,4,Do you know you can write nested functions in ...,"[-0.0032918560318648815, -0.003921740222722292..."
4,5,Have you still not used PyCaret? Maybe it's ti...,"[0.0199144184589386, 0.009592597372829914, 0.0..."


In [11]:
data['embedding'][0]

[0.04812345653772354,
 0.050626229494810104,
 0.017442557960748672,
 0.006844200659543276,
 0.015345938503742218,
 -0.006553308572620153,
 0.004989076405763626,
 0.03077969327569008,
 -0.02300793118774891,
 0.0192757286131382,
 -0.0004912237054668367,
 -0.032514069229364395,
 -0.0471135675907135,
 0.046586669981479645,
 0.06687228381633759,
 0.025313114747405052,
 0.03420453891158104,
 0.053041182458400726,
 -0.05918833985924721,
 0.03152613341808319,
 0.029374629259109497,
 0.02487403154373169,
 -0.003482474246993661,
 -0.018792737275362015,
 0.033523958176374435,
 -0.03207498788833618,
 0.049616336822509766,
 0.002317533129826188,
 0.03549983352422714,
 -0.05343635752797127,
 -0.012645579874515533,
 -0.012173566035926342,
 -0.04807955026626587,
 -0.03657558560371399,
 0.04548896104097366,
 -0.067618727684021,
 -0.00452803960070014,
 0.04177871346473694,
 0.022470055148005486,
 0.027464618906378746,
 -0.014489727094769478,
 -0.03391913324594498,
 -0.009303063154220581,
 0.050538413226

In [12]:
len(data['embedding'][0])

1536

In [13]:
import numpy as np
np.array(data['embedding'][0])

array([ 0.04812346,  0.05062623,  0.01744256, ..., -0.00450334,
       -0.02252494, -0.00150386])

In [14]:
np.array(data['embedding'][0]).shape

(1536,)

# HuggingFace

In [15]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Define a function to get embeddings for a piece of text using BERT
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding="max_length")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs['last_hidden_state'][:, 0, :].squeeze().numpy()

# Get embeddings for each text
data['embedding_hf'] = data['Text'].apply(get_embedding)

data.head()



Unnamed: 0,Post_ID,Text,embedding,embedding_hf
0,1,Normalization is a technique often applied as ...,"[0.04812345653772354, 0.050626229494810104, 0....","[-0.074475475, -0.536094, -0.17853524, -0.0028..."


In [19]:
data['embedding_hf'][0]

array([-7.44754747e-02, -5.36094010e-01, -1.78535238e-01, -2.80777179e-03,
        6.99440390e-03, -4.80155796e-01, -4.02519107e-01, -4.73386869e-02,
        1.59390852e-01, -4.79776651e-01, -4.57841098e-01,  4.74882662e-01,
       -8.19539905e-01,  7.39466622e-02, -7.52716064e-01,  1.12865619e-01,
        1.70508519e-01,  2.26892799e-01, -5.45695662e-01,  3.28502730e-02,
       -3.83926779e-02, -8.15122366e-01,  1.60340354e-01, -3.16676617e-01,
       -7.44325370e-02,  9.36131030e-02,  7.75560811e-02, -2.55703062e-01,
        3.00134718e-02, -8.60095769e-02, -3.48552287e-01,  3.46851438e-01,
        7.26142079e-02, -7.22434640e-01,  4.18813109e-01, -6.34171814e-03,
        3.90033215e-01, -1.49674922e-01,  2.43479893e-01,  6.21370003e-02,
       -5.32966375e-01,  6.23367727e-02,  2.66115010e-01,  1.20965026e-01,
       -4.00713176e-01,  1.77999943e-01, -3.52272797e+00, -5.51032424e-02,
       -6.67095333e-02, -5.25501907e-01, -3.57367814e-01,  3.61472666e-01,
       -7.19314575e-01,  

In [20]:
len(data['embedding_hf'][0])

768