# Let's make embeddings from the descriptions we downloaded in Download311fromAPI.ipynb

In [5]:
!pip install -q keras-core 
!pip install -q --upgrade keras-nlp

In [6]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"  # or "jax" or "torch"; set before keras_core is imported below

import keras_nlp
import keras_core as keras
import pandas as pd
import numpy as np
import csv

In [9]:
# Single source of truth for the preset name, so the tokenizer and the
# backbone are always loaded from the same preset.
bert_preset = "bert_tiny_en_uncased"

# Initialize tokenizer and preprocessor
tokenizer = keras_nlp.models.BertTokenizer.from_preset(bert_preset)
preprocessor = keras_nlp.models.BertPreprocessor(tokenizer=tokenizer)

# Initialize the BERT backbone model from the same preset
bert_backbone = keras_nlp.models.BertBackbone.from_preset(bert_preset)

# Load the data
boston_311_data = pd.read_csv('./all_311_cases_api.csv')

# Define the output CSV file path
output_csv_path = './cls_and_pooled_embeddings_with_service_id.csv'

# Resume support: if a previous run already wrote embeddings, skip those rows.
# A zero-byte file (e.g. a run interrupted before the header was written) is
# treated like a missing file, because pd.read_csv raises EmptyDataError on it.
if os.path.exists(output_csv_path) and os.path.getsize(output_csv_path) > 0:
    final_embeddings_df = pd.read_csv(output_csv_path)

    # Keep only the requests whose ID is not in the output file yet, and
    # renumber the remaining rows from 0 (chained instead of inplace so the
    # filtered copy is never mutated in place).
    already_processed = boston_311_data['service_request_id'].isin(
        final_embeddings_df['service_request_id']
    )
    boston_311_data = boston_311_data[~already_processed].reset_index(drop=True)
else:
    # Nothing processed yet: start from an empty DataFrame
    final_embeddings_df = pd.DataFrame()

# Number of descriptions preprocessed and embedded per loop iteration
batch_size = 1000  # Adjust this based on your machine's capabilities

# The header is only needed when the output file does not exist yet or is empty
file_empty = not os.path.exists(output_csv_path) or os.path.getsize(output_csv_path) == 0

# Append to the output CSV so that earlier results are preserved across runs
with open(output_csv_path, 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Write the header to the CSV file if it is empty
    if file_empty:
        csv_writer.writerow(['service_request_id', 'cls_embedding', 'pooled_embedding'])

    # Loop through the data in batches
    for i in range(0, len(boston_311_data), batch_size):
        batch_data = boston_311_data.iloc[i:i+batch_size]

        # NOTE: astype(str) turns missing descriptions into the literal string "nan"
        preprocessed_data = preprocessor(batch_data['description'].astype(str))

        # Generate embeddings
        embeddings = bert_backbone.predict(preprocessed_data)

        # Extract [CLS] embeddings (first position of sequence_output) and pooled_output
        cls_embeddings = embeddings['sequence_output'][:, 0, :]
        pooled_output = embeddings['pooled_output']

        # Pull the ID column once per batch. Indexing `batch_data.iloc[j]` per
        # record builds a whole row Series each time, which is slow and coerces
        # the row's values to a common dtype.
        request_ids = batch_data['service_request_id'].tolist()

        # Write one row per request: the ID and the two embeddings as Python lists
        for request_id, cls_vector, pooled_vector in zip(request_ids, cls_embeddings, pooled_output):
            csv_writer.writerow([request_id, cls_vector.tolist(), pooled_vector.tolist()])

        # Push the finished batch to disk so an interruption during the next
        # predict call does not lose rows still sitting in the write buffer
        csv_file.flush()

# The data has been saved to the CSV file incrementally


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 178ms/step
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 172ms/step


In [8]:
#use jupyter ! notation to run wc -l on the file
!wc -l $output_csv_path

241115 ./cls_and_pooled_embeddings_with_service_id.csv
