In [1]:
import sys
import pandas as pd
from pyspark.sql import SparkSession,  DataFrame, functions as F
from pyspark.sql.types import DoubleType


In [2]:
# functions for building & parsing request/response

import csv
import io
def list_of_dicts_to_csv(data):
    """Serialize a list of dictionaries to CSV text with a header row.

    The header is taken from the keys of the first dictionary, in their
    insertion order; each dictionary is then written as one row under it.

    Args:
        data: Sequence of dictionaries. Keys missing from a later
            dictionary are written as empty fields.

    Returns:
        The CSV text, using the csv module's default line terminator,
        or an empty string when ``data`` is empty.

    Raises:
        ValueError: If a dictionary contains a key that is not in the
            header (csv.DictWriter's default ``extrasaction='raise'``).
    """
    # Nothing to serialize; indexing data[0] below would raise IndexError.
    if not data:
        return ""

    output = io.StringIO()
    dict_writer = csv.DictWriter(output, fieldnames=list(data[0].keys()))
    dict_writer.writeheader()
    dict_writer.writerows(data)
    return output.getvalue()

def build_model_api_payload_csv(dataset_context: str, table: str, columns: list) -> dict:
    '''
    Assemble the dictionary used as the payload for the LLM API request.

    The payload carries the dataset context under "context" and a
    one-element "tables" list holding the table name together with its
    columns rendered as CSV text.
    '''
    table_entry = {
        "name": table,
        "column_csv": list_of_dicts_to_csv(columns),
    }
    return {"context": dataset_context, "tables": [table_entry]}

def csv_to_json_array(csv_string):
    """Parse CSV text with a header row into a list of row dictionaries.

    The first record supplies the field names; every following record
    becomes a dict pairing those names with the record's values. Values
    beyond the number of field names are dropped, and a short record
    yields a dict with fewer keys (``zip`` semantics).

    Args:
        csv_string: CSV text whose first record is the header.

    Returns:
        A list with one dict per data record; an empty list when the
        text contains no header record.
    """
    # Let the csv reader find record boundaries itself. Pre-splitting with
    # str.splitlines() tears apart quoted fields that contain line breaks
    # and also splits on characters such as form feed.
    csv_reader = csv.reader(io.StringIO(csv_string, newline=""))

    field_names = next(csv_reader, None)
    if field_names is None:
        # Empty input: no header, hence no rows.
        return []

    return [dict(zip(field_names, row)) for row in csv_reader]

import requests
import json

def post_json_request(url, payload, additional_headers=None, timeout=120):
    """POST ``payload`` as a JSON document to ``url`` and return the response.

    Args:
        url: Endpoint to send the request to.
        payload: JSON-serializable object used as the request body.
        additional_headers: Optional mapping merged over the default
            ``Content-Type`` / ``Accept`` JSON headers; its entries win
            on conflict.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.post``. Pass ``None`` to wait
            indefinitely.

    Returns:
        The ``requests.Response`` object. The HTTP status is not checked
        here; callers inspect it themselves.

    Raises:
        requests.exceptions.Timeout: If the server does not answer
            within ``timeout``.
    """
    # Default headers for a JSON request/response exchange.
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json',
    }

    # Caller-supplied headers override the defaults.
    if additional_headers:
        headers.update(additional_headers)

    payload_json = json.dumps(payload)

    # Without an explicit timeout requests waits forever, which would hang
    # the notebook kernel if the service stalls.
    return requests.post(url, data=payload_json, headers=headers, timeout=timeout)

import tiktoken
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens produced by encoding ``string`` with the named tiktoken encoding."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))

In [3]:
# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.appName("testing_model_api").getOrCreate()

23/12/19 16:28:24 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
# Show the catalogs available to this Spark session.
spark.catalog.listCatalogs()

[CatalogMetadata(name='lakehouse', description=None),
 CatalogMetadata(name='spark_catalog', description=None)]

In [6]:
# Load the listings table and preview its first rows.
df = spark.read.table("lakehouse.kaggle_airbnb.listings")
# Limit inside Spark before converting: toPandas() collects everything it is
# given onto the driver, so only the preview rows should reach it.
df.limit(5).toPandas()

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
23/12/19 16:29:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Unnamed: 0,id,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,host_id,...,review_scores_communication,review_scores_location,review_scores_value,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,241032,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,,,956883,...,10.0,9.0,10.0,WASHINGTON,False,moderate,False,False,2,4.07
1,953595,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",What's up with the free pillows? Our home was...,"Convenient bus stops are just down the block, ...",5177328,...,10.0,10.0,10.0,WASHINGTON,False,strict,True,True,6,1.48
2,3308979,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,none,Upper Queen Anne is a charming neighborhood fu...,Our house is located just 5 short blocks to To...,A bus stop is just 2 blocks away. Easy bus a...,16708587,...,10.0,10.0,10.0,WASHINGTON,False,strict,False,False,2,1.15
3,7421966,Queen Anne Chateau,A charming apartment that sits atop Queen Anne...,,A charming apartment that sits atop Queen Anne...,none,,,,9851441,...,,,,WASHINGTON,False,flexible,False,False,1,
4,278830,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,none,We are in the beautiful neighborhood of Queen ...,Belltown,The nearest public transit bus (D Line) is 2 b...,1452570,...,10.0,9.0,9.0,WASHINGTON,False,strict,False,False,1,0.89


In [7]:
# (column name, data type) pairs for the first three columns of the table.
df_column_types_3=df.dtypes[:3]
df_column_types_3

[('id', 'int'), ('name', 'string'), ('summary', 'string')]

In [8]:
# Turn each (name, type) pair into the record shape the payload builder expects.
columns = [
    {"name": column_name, "data_type": column_type}
    for column_name, column_type in df_column_types_3
]
columns

[{'name': 'id', 'data_type': 'int'},
 {'name': 'name', 'data_type': 'string'},
 {'name': 'summary', 'data_type': 'string'}]

In [9]:
# Request body for the model API service: dataset context, table name and columns.
payload = build_model_api_payload_csv(
    dataset_context="AirBnB",
    table="listings",
    columns=columns,
)
payload

{'context': 'AirBnB',
 'tables': [{'name': 'listings',
   'column_csv': 'name,data_type\r\nid,int\r\nname,string\r\nsummary,string\r\n'}]}

In [13]:
# Token count of the JSON-serialized payload under the cl100k_base encoding.
tokens = num_tokens_from_string(
    string=json.dumps(payload),
    encoding_name="cl100k_base",
)
print(f"# of tokens for payload (without base prompt) : {tokens}")

# of tokens for payload (without base prompt) : 43


In [19]:
# POST the payload to the column_analysis "tokens" endpoint.
URL = "http://ingress-nginx-controller.ingress-nginx.svc.cluster.local:80/api/v1/models/column_analysis/tokens"
response = post_json_request(URL,  payload)
response

<Response [200]>

In [21]:
# Decode the JSON body of the most recent response.
response.json()

{'num_tokens': 35}

In [18]:
# POST the same payload to the column_analysis endpoint itself.
URL = "http://ingress-nginx-controller.ingress-nginx.svc.cluster.local:80/api/v1/models/column_analysis"
response = post_json_request(URL,  payload)
response

<Response [200]>

In [10]:
# The response's "content" field holds CSV text; parse it into one record per row.
content_json = csv_to_json_array(response.json()["content"])
pd.set_option('display.max_colwidth', 100)
# Use a dedicated name: rebinding `df` here would replace the Spark DataFrame
# that earlier cells read from, so re-running those cells afterwards would
# operate on this pandas frame instead.
column_descriptions = pd.DataFrame(content_json)
column_descriptions

Unnamed: 0,name,description
0,id,Unique identifier for each listing
1,name,Name or title of the listing
2,summary,Brief summary or description of the listing


In [11]:
# Token usage figures reported in the response's "usage" field.
usage =response.json()["usage"]
usage

{'completion_tokens': 32, 'prompt_tokens': 223, 'total_tokens': 255}

In [12]:
# Stop the Spark session at the end of the notebook.
spark.stop()