In [4]:
import csv
import torch
from transformers import AutoTokenizer, AutoModel

In [5]:
# Load pre-trained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Define a function to create embeddings for each tweet
def create_avg_embeddings(tweet):
    # Tokenize the tweet and convert tokens to IDs
    inputs = tokenizer.encode_plus(tweet, add_special_tokens=True, return_tensors='pt')

    # Pass the input IDs through the model to get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()

    # Convert embeddings to a numpy array and return it
    return embeddings.mean(dim=0).numpy()

In [1]:
# Define the path to the folder containing the text files
data_folder = 'data/'

In [2]:
import os

In [8]:
# Loop through each text file in the data folder and create average embeddings for it
for file_name in os.listdir(data_folder):
    # Check if the path is a file and continue only if it is
    file_path = os.path.join(data_folder, file_name)
    if not os.path.isfile(file_path):
        continue

    # Create average embeddings for the current text file
    avg_embeddings = create_avg_embeddings(file_path)
    print(avg_embeddings)

-0.015125579
-0.0128026055
-0.014571284


In [16]:
import numpy as np

In [20]:
with open('../../go/files/#SSC_CGL_AGE_RECKONING_1_JAN.csv', 'r', encoding="UTF-8") as csvfile:
    reader = csv.reader(csvfile)
    tweets = [row[0] for row in reader]
    embeddings = []
    for tweet in tweets:
        ans = create_avg_embeddings(tweet)
        embeddings.append(ans.tolist())

In [28]:
import glob

In [39]:
import os
import csv

# Loop through all the text files in the directory
directory = '../../go/files/'
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Read in the tweets from the file
        with open(os.path.join(directory, filename), 'r', encoding="UTF-8") as file:
            tweets = [line.strip() for line in file.readlines()]

        # Create embeddings for each tweet and store in a list
        embeddings = [create_avg_embeddings(tweet) for tweet in tweets]

        # Save the embeddings in a CSV file with the same name as the text file
        output_filename = os.path.join('../../src/python', 'embeddings_' + os.path.splitext(filename)[0] + '.csv')
        with open(os.path.join(directory, output_filename), 'w', newline='', encoding='UTF-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows([embeddings])
