# Let's make embeddings from the descriptions we downloaded in Download311fromAPI.ipynb

In [1]:
#!pip install -q keras-core 
#!pip install -q --upgrade keras-nlp

In [2]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"  # or "jax" or "torch"; set here, ahead of the keras_core / keras_nlp imports below

import keras_nlp
import keras_core as keras
import pandas as pd
import numpy as np
import csv

Using TensorFlow backend


2023-10-27 13:06:38.275215: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-27 13:06:38.314971: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-27 13:06:38.315696: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Initialize tokenizer and preprocessor
tokenizer = keras_nlp.models.BertTokenizer.from_preset("bert_tiny_en_uncased")
preprocessor = keras_nlp.models.BertPreprocessor(tokenizer=tokenizer)

# Initialize the BERT backbone model from a preset
bert_backbone = keras_nlp.models.BertBackbone.from_preset("bert_tiny_en_uncased")

# Load the data
boston_311_data = pd.read_csv('./all_311_cases_api.csv')

# Define the output CSV file path
output_csv_path = './cls_and_pooled_embeddings_with_three_cols.csv'

# A missing file and a zero-byte file are both treated as "nothing written yet".
# This is decided BEFORE trying to read the file, because pd.read_csv raises
# EmptyDataError on a zero-byte file.
file_empty = not os.path.exists(output_csv_path) or os.path.getsize(output_csv_path) == 0

if not file_empty:
    # Resume support: only the ID column is needed to know which requests are
    # already embedded, so the (very large) stringified embedding columns are
    # not parsed. NOTE: final_embeddings_df therefore holds IDs only.
    final_embeddings_df = pd.read_csv(output_csv_path, usecols=['service_request_id'])

    # Remove the rows that have already been processed and renumber the rest
    already_processed = boston_311_data['service_request_id'].isin(final_embeddings_df['service_request_id'])
    boston_311_data = boston_311_data[~already_processed].reset_index(drop=True)
else:
    # Nothing processed yet: keep the name defined, as an empty DataFrame
    final_embeddings_df = pd.DataFrame()

# Define the batch size (rows embedded and written per outer-loop iteration)
batch_size = 1000  # Adjust this based on your machine's capabilities

# Open in append mode so earlier results are kept; newline='' is what the csv
# module expects for files it writes.
with open(output_csv_path, 'a', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Write the header to the CSV file if it is empty
    if file_empty:
        csv_writer.writerow(['service_request_id', 'desc_cls_embedding', 'desc_pooled_embedding', 'name_cls_embedding', 'name_pooled_embedding', 'code_cls_embedding', 'code_pooled_embedding'])

    # Loop through the data in batches
    for i in range(0, len(boston_311_data), batch_size):
        batch_data = boston_311_data.iloc[i:i+batch_size]

        # One output row per request, starting with its ID; each embedded
        # column below appends two more cells (CLS vector, pooled vector).
        csv_rows = [[service_request_id] for service_request_id in batch_data['service_request_id']]

        #possible columns are service_request_id,status,service_name,service_code,description,requested_datetime,updated_datetime,address,lat,long,token
        #for now we will only use description, service_name, and service_code
        # The order here must match the header order above (desc, name, code).
        for column in ['description',  'service_name', 'service_code']:
            # NOTE(review): astype(str) stringifies missing values instead of
            # skipping them, so they are embedded as text - confirm this is intended.
            preprocessed_data = preprocessor(batch_data[column].astype(str))

            # Generate embeddings
            embeddings = bert_backbone.predict(preprocessed_data)

            # Extract [CLS] embeddings (position 0 of sequence_output) and pooled_output
            cls_embeddings = embeddings['sequence_output'][:, 0, :]
            pooled_output = embeddings['pooled_output']

            # Append this column's two vectors to each request's row
            for row, cls_vector, pooled_vector in zip(csv_rows, cls_embeddings, pooled_output):
                row.extend([cls_vector.tolist(), pooled_vector.tolist()])

        # Write the batch data to the CSV file and push it to disk, so an
        # interrupted run keeps every completed batch and can resume from there.
        csv_writer.writerows(csv_rows)
        csv_file.flush()
# The data has been saved to the CSV file incrementally


2023-10-27 13:07:02.226487: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fe4b0010140 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2023-10-27 13:07:02.226517: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2023-10-27 13:07:02.258696: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-10-27 13:07:02.726223: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 204ms/step
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 164ms/step
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 165ms/step


In [4]:
# Sanity check: shell out (IPython "!" syntax) and count the lines in the output CSV.
# $output_csv_path is interpolated from the Python variable of the same name.
!wc -l $output_csv_path

291560 ./cls_and_pooled_embeddings_with_three_cols.csv


In [5]:
#function to replace the header row in a csv file
def replace_header(file_name, header):
    """Replace the first row of a CSV file with ``header``.

    The file is streamed row by row into a temporary file in the same
    directory, which then atomically replaces the original. This keeps memory
    use constant for large files and leaves the original untouched if anything
    fails part-way through.

    Args:
        file_name: Path of the CSV file to rewrite.
        header: Sequence of column names to write as the new first row.

    Raises:
        FileNotFoundError: If ``file_name`` does not exist.

    An empty file ends up containing just the header row.
    """
    import tempfile  # local import: only this helper needs it

    target_dir = os.path.dirname(os.path.abspath(file_name))
    # Same directory as the target so os.replace stays on one filesystem.
    fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix='.tmp')
    try:
        # newline='' on both sides, as the csv module requires; otherwise
        # newlines embedded in quoted fields get translated.
        with os.fdopen(fd, 'w', newline='') as writeFile, \
                open(file_name, 'r', newline='') as readFile:
            reader = csv.reader(readFile)
            writer = csv.writer(writeFile)
            next(reader, None)        # discard the old header (if any)
            writer.writerow(header)
            writer.writerows(reader)  # stream the remaining rows unchanged
        # mkstemp creates the file owner-only; keep the original's permissions.
        os.chmod(tmp_path, os.stat(file_name).st_mode & 0o777)
        os.replace(tmp_path, file_name)
    except BaseException:
        # Do not leave a stray temp file behind on failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

# Same path as the embeddings CSV used earlier in this notebook; re-declared
# here, presumably so this cell can be run without re-running the embedding cell.
output_csv_path = './cls_and_pooled_embeddings_with_three_cols.csv'
# One-off repair, intentionally left disabled: uncomment to overwrite the first
# row of the CSV with the seven column names below.
#replace_header(output_csv_path, ['service_request_id', 'desc_cls_embedding', 'desc_pooled_embedding', 'name_cls_embedding', 'name_pooled_embedding', 'code_cls_embedding', 'code_pooled_embedding'])