In [1]:
import csv
import torch
from transformers import BertTokenizer, BertModel

In [2]:
# Load the BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:

# Define a function to tokenize and encode the input text
def encode_text(text):
    # Tokenize the text
    tokens = tokenizer.encode(text, add_special_tokens=True, max_length=512)
    # Truncate or pad the tokens to a fixed length
    tokens = tokens[:512] + [0] * (512 - len(tokens))
    # Convert the token IDs to a tensor
    tokens_tensor = torch.tensor(tokens).unsqueeze(0)
    # Return the input features
    return {'input_ids': tokens_tensor}

In [4]:
# Open the CSV file and read the tweets
with open('../../go/files/#SSC_CGL_AGE_RECKONING_1_JAN.csv', 'r', encoding="UTF-8") as csvfile:
    reader = csv.reader(csvfile)
    tweets = [row[0] for row in reader]

In [5]:
# Create a list to store the embeddings
embeddings = []

# Generate embeddings for each tweet
for tweet in tweets:
    # Encode the tweet
    input_features = encode_text(tweet)
    # Pass the input features through the BERT model
    with torch.no_grad():
        output = model(**input_features)
        # Get the embeddings from the second-to-last layer
        embeddings.append(output.hidden_states[-2][0][0].tolist())

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Save the embeddings to a file
with open('embeddings.txt', 'w') as file:
    for embedding in embeddings:
        file.write(','.join(str(x) for x in embedding) + '\n')