# Generate Embeddings
This notebook loads labelled data from S3, generates sentence embeddings for it, and uploads the embeddings back to S3.

## Update PYTHONPATH
Until this project has a `setup.py`, append the parent directory to `sys.path` so that modules located there can be imported.

In [2]:
import sys

# There is no setup.py yet, so the parent directory is exposed to the import
# system by hand. The membership guard keeps this cell idempotent: re-running
# it does not pile duplicate entries onto sys.path.
PARENT_DIR = "../"
if PARENT_DIR not in sys.path:
    sys.path.append(PARENT_DIR)

## Load labelled data from S3
Load previously generated data from an S3 bucket.

In [24]:
import io

import boto3
import pandas as pd

# Fetch the labelled CSV object from S3, read its body fully into an
# in-memory buffer, and parse it with the first column as the index.
s3 = boto3.client("s3")
obj = s3.get_object(
    Bucket="ds-rg271",
    Key="data/labelled/mebank_tweets_1_year_labelled.csv",
)
csv_buffer = io.BytesIO(obj["Body"].read())
df = pd.read_csv(csv_buffer, index_col=0)

## Download and load a pretrained model
Use an already trained Sentence Transformer model to embed sentences.

In [26]:
from sentence_transformers import SentenceTransformer

# Identifier of the pretrained Sentence Transformer checkpoint to instantiate.
model_name = "paraphrase-mpnet-base-v2"
model = SentenceTransformer(model_name)

In [27]:
# Pull the "content" column out as a plain Python list and encode every
# entry with the model in a single call.
sentences = df["content"].values.tolist()
embeddings = model.encode(sentences)

## Save embeddings to S3
Save the embeddings to our S3 bucket.

In [30]:
import pickle

# Serialise the embeddings with pickle straight into an in-memory buffer
# (positioned at its start) and hand that buffer to S3 for upload, so no
# local file is written.
pickled_buffer = io.BytesIO(pickle.dumps(embeddings))
s3.upload_fileobj(
    pickled_buffer,
    "ds-rg271",
    "data/embeddings/mebank_1_year_tweets_embeddings.pkl",
)