<a href="https://www.kaggle.com/code/pavanbagli/fslm-inference-v2?scriptVersionId=177059520" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<a name='1'></a>
## 1 - Install and Imports

In [None]:
import os
import sys

In [None]:
#os.system('pip3 install --no-index --find-links ../input/httplib2v17 httplib2==0.17.4')
#os.system('pip3 install -U --no-index --find-links ../input/pyparsing247 pyparsing==2.4.7')
#os.system('pip3 install -U --no-index --find-links ../input/grpcio grpcio')

In [None]:
import httplib2
import pyparsing
import apache_beam as beam

# Report the versions of the pinned dependencies that were just imported.
print(f'httplib2 version: {httplib2.__version__}')
print(f'pyparsing version: {pyparsing.__version__}')
print(f'beam version: {beam.__version__}')

In [None]:
#os.system('pip3 install --no-index --find-links ../input/tfx1800/tfx1800wheel/kaggle/working tfx==1.8.0')
#os.system('pip3 install --no-index --find-links ../input/tensorflowtext2600/kaggle/working tensorflow-text==2.6.0')
# Install tensorflow-text from the local wheel directory only (--no-index).
# os.system() reports failure solely through its return value, so check it:
# otherwise a failed install only shows up later as an ImportError.
# A warning (not an exception) is used because the package may already be
# available in the environment.
pip_exit_status = os.system('pip3 install --no-index --find-links ../input/tftext/tensorflowtext2600wheel/kaggle/working tensorflow-text==2.6.0')
if pip_exit_status != 0:
    print(f'WARNING: pip3 install of tensorflow-text exited with status {pip_exit_status}', file=sys.stderr)
#!pip3 install --no-index --find-links ../input/tftext/tensorflowtext2600wheel/kaggle/working tensorflow-text==2.6.0



In [None]:
import csv
import pandas as pd
import numpy as np
import argparse
from typing import Dict, Text, Any, Tuple, List
from apache_beam.options.pipeline_options import PipelineOptions

import tensorflow as tf
import tensorflow_text
from tensorflow import keras
from tensorflow.core.example import example_pb2

import tfx_bsl
from tfx_bsl.public.beam import RunInference
from tfx_bsl.public import tfxio
from tfx_bsl.public.proto import model_spec_pb2

# Only let TensorFlow log messages of level ERROR and above.
tf.get_logger().setLevel('ERROR')

# Report the framework versions in use.
print(f'TensorFlow version: {tf.__version__}')
print(f'Beam version: {beam.__version__}')

In [None]:
# Directory of the exported model (loaded with Keras and passed to RunInference).
MODEL_DIR = '../input/fslm-basemodel-v2/model/6/Format-Serving'

# Input data files.
INPUT_SOURCE_DIR = '../input/foursquare-location-matching'
INPUT_SOURCE_TEST_DATA = f'{INPUT_SOURCE_DIR}/test.csv'
INPUT_SOURCE_PAIRS_DATA = f'{INPUT_SOURCE_DIR}/pairs.csv'

VALIDATE_DIR = './'
VALIDATE_DATA = f'{VALIDATE_DIR}/validate.csv'

# CSV of candidate pairs written by generate_pairs().
SERVING_DIR = './serving'
SERVING_DATA = f'{SERVING_DIR}/serve.csv'

# TFRecord staging directory and the prefix of the prediction output files.
TFRECORD_DIR = './tfrecords'
OUTPUT_RESULTS_DIR = './'
OUTPUT_RESULTS = f'{OUTPUT_RESULTS_DIR}/output-'

# Create the working directories. os.makedirs(exist_ok=True) is the portable
# equivalent of `mkdir -p` and, unlike the IPython `!` magic, is valid Python
# when the notebook is run as a plain script.
os.makedirs(VALIDATE_DIR, exist_ok=True)
os.makedirs(SERVING_DIR, exist_ok=True)
os.makedirs(TFRECORD_DIR, exist_ok=True)


<a name='2'></a>
## 2 - Load Saved Model

In [None]:
# Load the exported model from MODEL_DIR with Keras.
# NOTE(review): `model` is not referenced by any other cell visible here
# (inference passes MODEL_DIR to RunInference directly) - presumably it is kept
# for inspection via the commented-out summary below; confirm before removing.
model = tf.keras.models.load_model(MODEL_DIR)
#model.summary()

In [None]:
#model.get_layer('transform_features_layer') is model.tft_layer

<a name='3'></a>
## 3 - Load Data for Inference

In [None]:
def generate_pairs(df_1, df_2):
    """Cross-join two frames into candidate pairs and write them to SERVING_DATA.

    Every row of ``df_1`` is combined with every row of ``df_2``; overlapping
    column names receive the suffixes ``_1`` and ``_2``. Rows whose ``id_1``
    equals ``id_2`` are discarded. For each side, the address, city, state, zip
    and phone columns are merged into a single ``full_address_<side>`` text
    column and then removed.

    Args:
      df_1: frame providing the ``*_1`` side of each pair.
      df_2: frame providing the ``*_2`` side of each pair.

    Returns:
      None. The pairs are written as CSV (without the index) to SERVING_DATA.
    """
    candidates = df_1.merge(df_2, how='cross', suffixes=('_1', '_2'))
    not_self_pair = candidates['id_1'] != candidates['id_2']
    candidates = candidates[not_self_pair]

    merged_away = []
    for side in ('1', '2'):
        # Stringify each component so missing values can be concatenated too.
        address = candidates[f'address_{side}'].map(str)
        city = candidates[f'city_{side}'].map(str)
        state = candidates[f'state_{side}'].map(str)
        zip_code = candidates[f'zip_{side}'].map(str)
        phone = candidates[f'phone_{side}'].map(str)

        candidates[f'full_address_{side}'] = (
            address + " " + city + " " + state + " " + zip_code + " Phone: " + phone
        )
        merged_away += [
            f'address_{side}', f'city_{side}', f'state_{side}',
            f'zip_{side}', f'phone_{side}',
        ]

    candidates = candidates.drop(columns=merged_away)
    candidates.to_csv(SERVING_DATA, index=False)

In [None]:
#def csv_to_tfrecord(schema, csv_file, tfrecord_file):
def csv_to_tfrecord(csv_file, tfrecord_file):
    """Convert a CSV file of candidate pairs into a TFRecord of tf.Examples.

    Each CSV row becomes one ``Example``. The latitude/longitude columns are
    stored as float lists and the remaining listed columns as UTF-8 encoded
    byte lists; an empty cell produces an empty feature list.

    Args:
      csv_file (string) - file to convert to tfrecord
      tfrecord_file (string) - filename of tfrecord to create

    Returns:
      filename of tfrecord

    Raises:
      KeyError: if one of the expected columns is missing from the CSV header.
      ValueError: if a non-empty latitude/longitude cell is not a valid float.
    """
    float_features = ['latitude_1', 'latitude_2', 'longitude_1', 'longitude_2']
    byte_features = ['categories_1', 'categories_2', 'country_1', 'country_2',
                     'full_address_1', 'full_address_2', 'id_1', 'id_2',
                     'name_1', 'name_2', 'url_1', 'url_2']

    # Both files are managed by `with`, so the CSV handle is closed even when a
    # row fails to convert. newline='' is what the csv module expects for file
    # objects, and the encoding is explicit so reading does not depend on the
    # locale (the values are re-encoded as UTF-8 below).
    with open(csv_file, 'r', newline='', encoding='utf-8') as source, \
            tf.io.TFRecordWriter(tfrecord_file) as writer:
        # Each row is mapped as a dictionary keyed by the header names.
        for line in csv.DictReader(source):
            example = example_pb2.Example()

            for key in float_features:
                cell = line[key]
                example.features.feature[key].float_list.value[:] = (
                    [float(cell)] if cell else [])

            for key in byte_features:
                cell = line[key]
                example.features.feature[key].bytes_list.value[:] = (
                    [cell.encode('utf8')] if cell else [])

            # Write each example immediately instead of collecting the whole
            # file in memory first.
            writer.write(example.SerializeToString())

    return tfrecord_file

In [None]:
class ToCsvFn(beam.DoFn):
    """Beam DoFn that renders one prediction log as a comma-separated line."""

    def process(self, element):
        """Yield the parsed request features followed by the first prediction's outputs.

        The serialized example sent with the request is parsed with the
        module-level ``features`` spec; byte values are decoded as UTF-8.
        """
        predict_log = element.predict_log

        # Recover the example features that were passed in during prediction.
        serialized_examples = tf.make_ndarray(predict_log.request.inputs['examples'])
        parsed = tf.io.parse_single_example(serialized_examples[0], features)

        fields = []
        for tensor in parsed.values():
            value = tensor.numpy()
            fields.append(value.decode('utf-8') if isinstance(value, bytes) else value)

        scores = tf.make_ndarray(predict_log.response.outputs['outputs'])
        fields.extend(scores[0])

        yield ','.join(str(field) for field in fields)

In [None]:
def predictions():
    """Run batch inference over the staged TFRecords and write the results as text.

    Reads every ``*.tfrecord`` file in TFRECORD_DIR, runs the records through
    the SavedModel at MODEL_DIR with ``RunInference``, converts each prediction
    log to a CSV line with ``ToCsvFn`` and writes the lines, preceded by the
    header ``id_1,id_2,match``, to a single file prefixed with OUTPUT_RESULTS.

    Returns:
      None.
    """
    # Use the shared TFRECORD_DIR constant rather than repeating its value.
    tfexample_beam_record = tfx_bsl.public.tfxio.TFExampleRecord(
        file_pattern=f'{TFRECORD_DIR}/*.tfrecord')
    inference_spec = model_spec_pb2.InferenceSpecType(
        saved_model_spec=model_spec_pb2.SavedModelSpec(model_path=MODEL_DIR))

    with beam.Pipeline() as p:
        _ = (p | tfexample_beam_record.RawRecordBeamSource()
               | RunInference(inference_spec)
               | beam.ParDo(ToCsvFn())
               # Pin the output to one shard so the file name is always
               # `<prefix>-00000-of-00001`; left unset, the runner chooses the
               # shard count and the results could be split across files.
               | beam.io.WriteToText(OUTPUT_RESULTS, header='id_1,id_2,match', num_shards=1)
               | beam.Map(print)
            )


In [None]:
def create_output_dict(rows):
    """Group prediction rows by their first id into space-separated match strings.

    Args:
      rows: iterable of ``(id_1, id_2, match)`` rows. Rows sharing the same
        ``id_1`` are expected to be adjacent (e.g. sorted by ``id_1``); if an
        id reappears later, its later run replaces the earlier entry.

    Returns:
      dict mapping each ``id_1`` to a string that starts with ``id_1`` itself
      followed by every ``id_2`` whose ``match`` is greater than 0.5, separated
      by single spaces. Empty input gives an empty dict.
    """
    output = {}
    current_id = None
    # None means "no row seen yet". A real sentinel is needed here: a magic
    # string could collide with an actual id and would leak into the result
    # for empty input.
    match_ids = None

    for row in rows:
        row_id = row[0]
        if match_ids is None or row_id != current_id:
            if match_ids is not None:
                # A new id starts: flush the finished group.
                output[current_id] = ' '.join(match_ids)
            current_id = row_id
            # Every id matches itself, so it always leads its own list.
            match_ids = [row_id]

        if row[2] > 0.5:
            match_ids.append(row[1])

    if match_ids is not None:
        # Flush the last group.
        output[current_id] = ' '.join(match_ids)
    return output

In [None]:
def write_csv_file(output_dir, output_dict):
    """Append one ``key,value`` row per dictionary entry to ``submission.csv``.

    Args:
      output_dir: directory containing (or to contain) ``submission.csv``.
      output_dict: mapping whose items are written as two-column CSV rows, in
        iteration order.

    Returns:
      None. The file is opened in append mode, so existing rows are kept.
    """
    # Honour the output_dir argument instead of a module-level constant.
    submission_path = os.path.join(output_dir, 'submission.csv')
    # newline='' lets the csv module control the line endings itself.
    with open(submission_path, 'a', newline='') as csv_file:
        csv.writer(csv_file).writerows(output_dict.items())

In [None]:
# Start a fresh submission file that contains only the header row; mode 'w'
# truncates any existing file. newline='' is what the csv module expects for
# files it writes, so that it controls the line endings itself.
with open(f'{OUTPUT_RESULTS_DIR}/submission.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['id', 'matches'])

In [None]:

# Read the test set and discard fully duplicated rows.
inf_df = pd.read_csv(INPUT_SOURCE_TEST_DATA).drop_duplicates()

# Parsing spec used by ToCsvFn: only the two scalar string ids are read back
# from each serialized example.
features = {
    feature_name: tf.io.FixedLenFeature([], tf.string)
    for feature_name in ('id_1', 'id_2')
}

In [None]:
# Number of test rows expanded into candidate pairs per inference pass.
BATCH_SIZE = 10

for i in range(0, inf_df.shape[0], BATCH_SIZE):

    # Pair the current slice of rows against the whole test set.
    generate_pairs(inf_df[i:i + BATCH_SIZE], inf_df)

    # predictions() reads every *.tfrecord under ./tfrecords (TFRECORD_DIR), so
    # files left by earlier passes must be removed first. Otherwise each pass
    # would score all previous batches again and append their ids to
    # submission.csv a second time.
    for stale_name in os.listdir(TFRECORD_DIR):
        if stale_name.endswith('.tfrecord'):
            os.remove(os.path.join(TFRECORD_DIR, stale_name))

    # Create list of tfrecord files
    tfrecord_files = [
        csv_to_tfrecord(f'{SERVING_DIR}/{name}', f'{TFRECORD_DIR}/{i}.tfrecord')
        for name in os.listdir(SERVING_DIR)
    ]

    predictions()

    # Beam appends the shard suffix to the OUTPUT_RESULTS prefix; this resolves
    # to the same file as './output--00000-of-00001'.
    pred_df = pd.read_csv(f'{OUTPUT_RESULTS}-00000-of-00001')
    # create_output_dict() relies on rows with the same id_1 being adjacent.
    pred_df = pred_df.sort_values(['id_1', 'id_2', 'match'])
    rows = pred_df.values
    output_dict = create_output_dict(rows)
    write_csv_file(OUTPUT_RESULTS_DIR, output_dict)

    # Release the per-batch objects before the next pass allocates its own.
    del tfrecord_files
    del pred_df
    del rows
    del output_dict
    