# Sample Notebook

## Install dependencies

In [None]:
%pip install transformers[torch]
%pip install beautifulsoup4
%pip install boto3

## Define helper methods

In [3]:
import os
import boto3
import io
import json

from bs4 import BeautifulSoup

def extract_text_from_html(raw_html):
    """Return the text contained in the <section> elements of an HTML document.

    Args:
        raw_html: HTML markup to parse. ``None`` or an empty value is treated
            as a document without any sections.

    Returns:
        The text of each outermost ``<section>`` element joined with single
        spaces, or an empty string when there is no markup or no
        ``<section>`` element.
    """
    # Guard before parsing: a missing or null field should yield no text
    # instead of an error raised from inside the HTML parser.
    if not raw_html:
        return ''

    soup = BeautifulSoup(raw_html, 'html.parser')
    # get_text() on a section already includes the text of any section nested
    # inside it, so only sections without a <section> ancestor are collected;
    # otherwise nested content would be emitted twice.
    return ' '.join(
        tag.get_text()
        for tag in soup.find_all('section')
        if tag.find_parent('section') is None
    )

def extract_text_from_s3_file(bucket_name, file_name, output_filename):
    """Extract section text from a line-delimited JSON object in S3 and upload it.

    Every non-blank line of the S3 object is parsed as a JSON document. The
    HTML held in its ``RawHtml`` field is converted to text with
    ``extract_text_from_html`` and written to ``output_filename`` followed by
    a newline. The finished file is uploaded to the same bucket under the key
    ``output_filename`` and the local copy is then removed.

    Args:
        bucket_name: Name of the S3 bucket that holds the input object and
            receives the output file.
        file_name: Key of the input object inside ``bucket_name``.
        output_filename: Local path of the intermediate text file; also used
            as the S3 key of the uploaded result.

    Returns:
        None.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket_name, Key=file_name)

    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding (which may be unable to encode some extracted characters).
    with open(output_filename, 'w', encoding='utf-8') as outfile:
        # iter_lines() pulls the body from S3 in chunks, so the whole object
        # is never held in memory at once.
        for line in obj['Body'].iter_lines():
            # Blank lines (e.g. a trailing newline) carry no record.
            if not line.strip():
                continue
            item = json.loads(line.decode('utf-8'))
            # `or ''` also covers a key that is present but null.
            raw_html = item.get('RawHtml') or ''
            text = extract_text_from_html(raw_html)
            outfile.write(text + "\n")

    print(f"Processing complete. Text written to: {output_filename}")

    # Upload the output file to S3
    s3.upload_file(output_filename, bucket_name, output_filename)
    print(f"File uploaded to: {bucket_name}/{output_filename}")

    # Delete the local output file
    os.remove(output_filename)
    print(f"Local file deleted: {output_filename}")


## Prepare Data

### Extract text from the blogs JSON and upload to S3

In [4]:
# Location of the raw blog export in S3
bucket_name = "awsc.datascience.objecjstore"
file_name = "raw_data/allBlogList20230613Detailed.json"

# Convert the export to plain text; the result is uploaded to the same bucket.
# NOTE: extract_text_from_s3_file has no return statement, so text_list is None.
text_list = extract_text_from_s3_file(
    bucket_name=bucket_name,
    file_name=file_name,
    output_filename="rawtext.txt",
)

NameError: name 'json' is not defined

## Load Model

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

# Checkpoint name passed to both from_pretrained() calls below
model_name = 'bert-base-uncased'

# Tokenizer and model are loaded from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)



## Run Vector

In [None]:
# Define a sample text
text = "Hello, this is a test."

# Encode the text to get the input tensors ('pt' requests PyTorch tensors)
inputs = tokenizer(text, return_tensors='pt')

# Run the text through the model to get the embeddings.
# This is inference only, so gradient tracking is switched off to avoid
# building an autograd graph that is never used.
with torch.no_grad():
    outputs = model(**inputs)

# Use the average of the last hidden state as the text's embedding
# (mean over dim 1 -- presumably the token axis; confirm against the model output shape)
embeddings = outputs.last_hidden_state.mean(dim=1)

# Convert the tensor to a numpy array
vectors = embeddings.detach().numpy()

# Show only the first 50 columns to keep the output short
print(vectors[:, :50])