In [1]:
import openai
import pandas as pd
from typing import List
import concurrent.futures

In [2]:
def set_model_key():
    openai.api_type = "azure"
    openai.api_base = "https://shu-audrey-joey.openai.azure.com/"
    openai.api_version = "2023-07-01-preview"
    openai.api_key = "54a4a22814764408b727b9791ed2a544"

In [8]:
%pip install duckdb

Collecting duckdb
  Downloading duckdb-0.10.1-cp38-cp38-macosx_10_9_x86_64.whl (16.6 MB)
[K     |████████████████████████████████| 16.6 MB 243.5 MB/s eta 0:00:01
[?25hInstalling collected packages: duckdb
Successfully installed duckdb-0.10.1
Note: you may need to restart the kernel to use updated packages.


# Example of DuckDB UDF and List

In [26]:
import duckdb
from duckdb.typing import *

def my_function(x: List) -> str:
    return x

duckdb.create_function("my_func", my_function)
sample_list = [32, 42]
duckdb.sql(f"SELECT my_func({sample_list})")

┌──────────────────────────────────┐
│ my_func(main.list_value(32, 42)) │
│             varchar              │
├──────────────────────────────────┤
│ [32, 42]                         │
└──────────────────────────────────┘

In [10]:
import duckdb

# Establish a connection to the database (this creates an in-memory database if no path is provided)
conn = duckdb.connect(database=':memory:', read_only=False)

# You would typically load your data here or ensure that your database contains the necessary tables.
# For demonstration, I'll create a sample table and insert some data into it.
conn.execute("CREATE TABLE example_table (example_column INT)")
conn.execute("INSERT INTO example_table VALUES (1), (2), (3), (4), (5)")

# Now, let's convert a column from the table into a Python list
query_result = conn.execute("SELECT example_column FROM example_table").fetchall()

# Convert the result into a list (note that fetchall returns a list of tuples, so we extract the first element)
example_list = [row[0] for row in query_result]

# Close the connection
conn.close()

# Now you have your column data in a Python list
print(example_list)

[1, 2, 3, 4, 5]


# Read Dataset

In [11]:
# Path to your DuckDB database file
duckdb_db_path = 'chinook.duckdb'
conn = duckdb.connect(duckdb_db_path)
query = "SELECT * FROM Albums;"
df = conn.execute(query).df()
print(df.head())

   album_id                                  title  artist_id
0         1  For Those About To Rock We Salute You          1
1         2                      Balls to the Wall          2
2         3                      Restless and Wild          2
3         4                      Let There Be Rock          1
4         5                               Big Ones          3


In [12]:
query = "SELECT * FROM Artists;"
df = conn.execute(query).df()
print(df.head())

   artist_id               name
0          1              AC/DC
1          2             Accept
2          3          Aerosmith
3          4  Alanis Morissette
4          5    Alice In Chains


# Read Movies Dataset (small)

In [55]:
# df = duckdb.read_csv("movies_small.csv")
query = "SELECT * FROM read_csv('movies_small.csv')"
df = conn.execute(query)
print(df.head())

   column0        rotten_tomatoes_link  \
0        0  m/1013884-mighty_joe_young   
1        1  m/amityville_the_awakening   
2        2  m/amityville_the_awakening   
3        3        m/after_the_thin_man   
4        4     m/1019017-silver_streak   

                                      review_content  \
0                                               None   
1  It's a series of routines within a routine for...   
2  A vulgar exercise of terror that, despite its ...   
3  After the Thin Man hasn't quite the spontaneit...   
4  You never really get angry at it. You just wan...   

                 movie_title  \
0           Mighty Joe Young   
1  Amityville: The Awakening   
2  Amityville: The Awakening   
3         After the Thin Man   
4              Silver Streak   

                                          movie_info     id  
0  Jill Young (Terry Moore) has raised Joe, her g...  56894  
1  When some footage dating back to 1976 is disco...  83114  
2  When some footage dating bac

In [45]:
import json
x = json.dumps({'a': 'apple', 'b': 'ball'})
z = json.loads(x)
print([y for y in z])
print([z[y] for y in z])

['a', 'b']
['apple', 'ball']


# UDF

In [70]:
def llm(query: str, contextargs: str) -> str:
    """
    query: e.x. find the artist mentioned in the {review}
    context: e.x. List[review]
    """
    
#     set_model_key()
#     llm_chain = llm_dict[query]
#     output = llm_chain.batch(context)

    context = json.loads(contextargs)
    fields = []
    values = []
    for arg in context:
        fields.append(arg)
        values.append(context[arg])
    
    print("Query: ", query)
    print("Fields: ", fields)
    print("Values: ", values)

    def invoke_llm(query: str, context: List[str], fields: List[str]):
        """
        query: e.x. find the artist mentioned in the {review}
        context: e.x. review 
        """
        set_model_key()

        # TODO UPDATE THIS SYSTEM PROMPT
        response_format = """
            You are a data analysis assistant who will respond with the artist name mentioned on reviews inputted. Answer with only the name.
            For instance:
            - If the review is 'I love the songs by Taylor Swift.', the answer is 'Taylor Swift'.
            - If the review is 'This album reminds me of the Beatles', the answer is 'the Beatles'.
        """
        
        num_fields = len(fields)
        prompt = query

        for i in range(num_fields):
            field_val = context[i] if context[i] else "None"
            if isinstance(field_val, list):
                field_val = field_val[0]
            prompt += fields[i] + ": " + field_val + "\n"
        if len(prompt) > 16000:
            prompt = query + "N/A"

        response = openai.ChatCompletion.create(
            engine=DEPLOYMENT_NAME, # engine = "deployment_name".
            messages=[
                {"role": "system", "content": response_format},
                {"role": "user", "content": prompt}
            ]
        )

        output = response['choices'][0]['message']['content']
        return output

    
    responses = []
    # We can use a with statement to ensure threads are cleaned up promptly
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Start the load operations and mark each future with its URL
        future_to_response = {executor.submit(invoke_llm, query, row, fields): row for row in context}
        for future in concurrent.futures.as_completed(future_to_response):
            row = future_to_response[future]
            try:
                data = future.result()
                responses.append(data)
            except Exception as exc:
                print('%r generated an exception: %s' % (row, exc))
    
    
    return responses

In [72]:
duckdb.create_function("llm", llm, [str, str], str)

<duckdb.duckdb.DuckDBPyConnection at 0x7ff2d813d930>

In [None]:
duckdb.sql("CREATE TABLE movies AS SELECT * FROM read_csv('movies_small.csv')")

In [73]:
prompt_str = "Given the following movie description and review, recommend movies for the user: "
query = f"SELECT llm('{prompt_str}', json_object('movie_info', movie_info, 'review_content', review_content)) AS llm_results FROM movies"

In [74]:
results = duckdb.sql(query).fetchall()
print(results)

Query:  Given the following movie description and review, recommend movies for the user: 
Fields:  ['movie_info', 'review_content']
Values:  ['Jill Young (Terry Moore) has raised Joe, her gentle pet gorilla, since he was a baby. When a Hollywood, Calif., nightclub owner (Robert Armstrong) travels to Africa and notices the amazing animal, he pursuades Jill to send Joe to Hollywood. But Joe grows restless performing for crowds and finally breaks free from captivity, wreaking havoc on the city. To save Joe from the authorities, who want him dead, Jill must create an elaborate plan to send him back to Africa.', None]
'movie_info' generated an exception: name 'DEPLOYMENT_NAME' is not defined
'review_content' generated an exception: name 'DEPLOYMENT_NAME' is not defined
Query:  Given the following movie description and review, recommend movies for the user: 
Fields:  ['movie_info', 'review_content']
Values:  ['When some footage dating back to 1976 is discovered, the case of the haunted house

'movie_info' generated an exception: name 'DEPLOYMENT_NAME' is not defined
'review_content' generated an exception: name 'DEPLOYMENT_NAME' is not defined
Query:  Given the following movie description and review, recommend movies for the user: 
Fields:  ['movie_info', 'review_content']
Values:  ['Ray and Danny (Gregory Hines, Billy Crystal) are two Chicago police detectives hot on the trail of drug kingpin Julio Gonzales (Jimmy Smits). They manage to nab the drug lord, but the bust is messy, and the two are suspended. While vacationing in Key West, Fla., they decide to retire from the police force and open up a bar. But when they return to Chicago and find out that Gonzales has been released on bail, they vow not to quit the force until the dangerous dealer is behind bars for good.', None]
'movie_info' generated an exception: name 'DEPLOYMENT_NAME' is not defined
'review_content' generated an exception: name 'DEPLOYMENT_NAME' is not defined
Query:  Given the following movie description 

'movie_info' generated an exception: name 'DEPLOYMENT_NAME' is not defined
'review_content' generated an exception: name 'DEPLOYMENT_NAME' is not defined
Query:  Given the following movie description and review, recommend movies for the user: 
Fields:  ['movie_info', 'review_content']
Values:  ["A surfer (Trevor Wright) finds comfort from his problems in the arms of his best friend's older brother (Brad Rowe).", "It feels like it's written by a film school freshman that's trying to be innovative, but everything feels forced and flat."]
'movie_info' generated an exception: name 'DEPLOYMENT_NAME' is not defined
'review_content' generated an exception: name 'DEPLOYMENT_NAME' is not defined
Query:  Given the following movie description and review, recommend movies for the user: 
Fields:  ['movie_info', 'review_content']
Values:  ["Brilliant in his studies and dismissive of athletics, Ronald (Buster Keaton) finishes high school at the top of his class. But in college his uptight attitude do

'review_content' generated an exception: name 'DEPLOYMENT_NAME' is not defined
Query:  Given the following movie description and review, recommend movies for the user: 
Fields:  ['movie_info', 'review_content']
Values:  ["Brilliant in his studies and dismissive of athletics, Ronald (Buster Keaton) finishes high school at the top of his class. But in college his uptight attitude doesn't win him any points with his sports-loving classmates, and pretty coed Mary (Anne Cornwall) ignores him in favor of brutish jock Jeff (Harold Goodwin). Hoping to impress Mary, Ronald makes a buffoon of himself at every sport imaginable, usually at Jeff's hand. But Ronald gets a final shot at glory in a big rowing race.", "The finale ties earlier elements together in a clever way, but the episodic film doesn't match the best of Keaton's films that establish a more cohesive storyline"]
'movie_info' generated an exception: name 'DEPLOYMENT_NAME' is not defined
'review_content' generated an exception: name 'D

Query:  Given the following movie description and review, recommend movies for the user: 
Fields:  ['movie_info', 'review_content']
Values:  ["Devdas (Shahrukh Khan) makes his way back home to India after spending 10 years studying in London. He plans to marry Paro (Aishwarya Rai), his childhood best friend, but his parents (Vijay Crishna, Smita Jaykar) do not want Devdas to marry her. They believe that Paro's family, who descend from a line of dancers, is of a lower class than their own. Eventually, Paro marries another man, and the despondent Devdas descends into life-threatening alcoholism.", 'The song and dance numbers alone are worth the price of entry.']
'movie_info' generated an exception: name 'DEPLOYMENT_NAME' is not defined
'review_content' generated an exception: name 'DEPLOYMENT_NAME' is not defined
Query:  Given the following movie description and review, recommend movies for the user: 
Fields:  ['movie_info', 'review_content']
Values:  ['When German knights invade Russia,

'review_content' generated an exception: name 'DEPLOYMENT_NAME' is not defined
Query:  Given the following movie description and review, recommend movies for the user: 
Fields:  ['movie_info', 'review_content']
Values:  ['A mild-mannered bank executive (Aaron Eckhart) mentors a teenage con artist and tries to make a career change as a doughnut merchant.', 'Male midlife crisis presents as pathological self-loathing in Meet Bill, an imperative to which the only sane response is: No thanks.']
'movie_info' generated an exception: name 'DEPLOYMENT_NAME' is not defined
'review_content' generated an exception: name 'DEPLOYMENT_NAME' is not defined
Query:  Given the following movie description and review, recommend movies for the user: 
Fields:  ['movie_info', 'review_content']
Values:  ['Jack Bauer (Kiefer Sutherland) tries to prevent an African warlord from recruiting children for his militia.', "Jack's heroism this time out is just too grandiose-the title is the tip-off. It's not a phone re

In [None]:
context = {}
fields = ["movie_info", "review_content"]
types = ["VARCHAR", "VARCHAR"]
for i in len(fields):
    context[fields[i]] = types[i]
duckdb.struct_type(context)