# Let's make embeddings from the descriptions we downloaded in Download311fromAPI.ipynb

In [1]:
!pip install -q keras-core 
!pip install -q --upgrade keras-nlp

In [2]:
import os

# Choose the Keras Core backend. This is set ahead of the keras imports below so the
# selection is already in the environment when those packages are loaded.
os.environ["KERAS_BACKEND"] = "tensorflow"  # or "jax" or "torch"

import keras_nlp
import keras_core as keras
import pandas as pd
import numpy as np
import csv

Using TensorFlow backend


2023-10-02 20:00:05.057381: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-02 20:00:05.091486: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-02 20:00:05.091988: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Build BERT embeddings for every 311 description and stream them to a CSV file.
#
# Output layout: one header row, then one row per service request with the columns
#   service_request_id, cls_embedding, pooled_embedding
# where each embedding cell holds the text form of a Python list of floats.

# Initialize tokenizer and preprocessor
tokenizer = keras_nlp.models.BertTokenizer.from_preset("bert_tiny_en_uncased")
preprocessor = keras_nlp.models.BertPreprocessor(tokenizer=tokenizer)

# Initialize the BERT backbone model from the same preset as the tokenizer
bert_backbone = keras_nlp.models.BertBackbone.from_preset("bert_tiny_en_uncased")

# Load the data
boston_311_data = pd.read_csv('./boston_311_data_predict.csv')

# Define the output CSV file path
output_csv_path = './cls_and_pooled_embeddings_with_service_id.csv'

# Never populated in this cell (rows are streamed straight to disk instead);
# kept only so that any other cell referencing the name keeps working.
final_embeddings_df = pd.DataFrame()

# Number of rows preprocessed, embedded and written per loop iteration
batch_size = 1000  # Adjust this based on your machine's capabilities

# Create the CSV file, write the header, then append one chunk of rows at a time
# so the full set of embeddings never has to be held in memory at once.
with open(output_csv_path, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Write the header to the CSV file
    csv_writer.writerow(['service_request_id', 'cls_embedding', 'pooled_embedding'])

    # Loop through the data in batches
    for i in range(0, len(boston_311_data), batch_size):
        batch_data = boston_311_data.iloc[i:i+batch_size]

        # NOTE(review): astype(str) turns a missing description into the literal
        # text "nan", which is then embedded like any other string — confirm that
        # this is the intended treatment of empty descriptions.
        preprocessed_data = preprocessor(batch_data['description'].astype(str))

        # Generate embeddings
        embeddings = bert_backbone.predict(preprocessed_data)

        # Position 0 of the sequence output is the [CLS] token; pooled_output is
        # taken from the backbone as-is.
        cls_embeddings = embeddings['sequence_output'][:, 0, :]
        pooled_output = embeddings['pooled_output']

        # Write the whole batch column-wise. Pulling the id column once avoids
        # materialising a full row Series per record (slow, and able to change
        # the id's dtype when a row is collapsed to a single dtype).
        csv_writer.writerows(
            zip(
                batch_data['service_request_id'].tolist(),
                cls_embeddings.tolist(),
                pooled_output.tolist(),
            )
        )

# The data has been saved to the CSV file incrementally


2023-10-02 20:00:07.444182: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f2f0c0103a0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2023-10-02 20:00:07.444206: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2023-10-02 20:00:07.458849: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


[1m 1/32[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m40s[0m 1s/step

2023-10-02 20:00:07.844721: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 171ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 152ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 152ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 152ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 152ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 152ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 152ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 152ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 152ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 152ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 152ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 153ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 152ms/step
[1m32/32[0m [32m━━━━━━

In [4]:
#use jupyter ! notation to run wc -l on the file
!wc -l $output_csv_path

27269 ./cls_and_pooled_embeddings_with_service_id.csv
