In [None]:
%pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.0


In [None]:
#import pandas as pd
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm  # Import tqdm for progress bar
import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = SentenceTransformer("dangvantuan/sentence-camembert-base", device=device)

# Load the entire labelled dataset and keep only the rows with labels == 1.
# .copy() detaches the filtered frame from the one read from disk, so the
# column assignments below do not write into a slice (SettingWithCopyWarning).
df = pd.read_csv("data/entire_dataset_labeled_final.csv")
df = df[df['labels'] == 1].copy()

df['created_at'] = pd.to_datetime(df['created_at'])  # Convert 'created_at' to datetime
# Year plus week-of-year. %U counts weeks from Sunday; days before the first
# Sunday of a year fall into week 00.
df['week'] = df['created_at'].dt.strftime('%Y-%U')

# Split each tweet into fixed-size chunks and embed them.
# NOTE: chunk_size counts *characters*, not tokens, so a chunk may still be
# truncated by the model if its token count exceeds the model's own limit.
chunk_size = 512
embeddings = []

with torch.no_grad():  # Ensure that no gradients are calculated for better performance
    # Wrapping the iterable lets tqdm advance and close the bar by itself,
    # including when the loop is interrupted by an exception.
    for tweet in tqdm(df['tweet_clean0'], total=len(df), desc="Processing Tweets",
                      position=0, leave=True):
        # Missing values (NaN) and other non-string entries are treated as
        # empty text instead of crashing on len().
        text = tweet if isinstance(tweet, str) else ""

        # Character-based chunks of at most chunk_size characters
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

        if chunks:
            # Encode all chunks of this tweet in one batched call, then average
            # them into a single embedding for the entire tweet.
            chunk_embeddings = model.encode(chunks)
            embeddings.append(chunk_embeddings.mean(axis=0))
        else:
            # Empty tweet: keep the empty-list placeholder so the list stays
            # aligned with the rows of df.
            embeddings.append([])

# Assign the embeddings to the DataFrame (one entry per row, in row order)
df['embeddings'] = embeddings

In [None]:
# Preview the first rows of the DataFrame as a quick sanity check.
df.head()

In [None]:
# Persist the DataFrame, including its 'embeddings' column, as a pickle file.
df.to_pickle("data/df_sentenceembeddings.pkl")