In [1]:
import json
import os
from typing import Iterator

from dotenv import load_dotenv
import openai
import pandas as pd
from tqdm import tqdm

# Load variables from the local 'openai.env' file into the environment, then
# hand the key to the openai module. The lookup yields None when the variable
# is not set.
load_dotenv('openai.env')
openai.api_key = os.environ.get('OPENAI_API_KEY')

### utils

In [3]:
def get_completion(prompt: str, model="gpt-3.5-turbo", temperature: float = 0.7) -> str:
    """
    Send a single-turn prompt to the chat model and return its reply

    Args:
        prompt: instructions, sent as the only "user" message
        model: name of the chat model to query
        temperature: degree of randomness of the model's output; the default
            of 0.7 is the value that used to be hard-coded here
    return:
        text content of the first choice in the response
    """
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    # Only the first returned choice is used.
    return response.choices[0].message["content"]

def get_sample(df: pd.DataFrame, sample_size=50) -> Iterator[str]:
    """
    Split the data into consecutive chunks and yield each one as CSV text

    Rows are distributed as evenly as possible: the first
    ``len(df) % sample_size`` chunks receive one extra row.

    Args:
        df: data
        sample_size: number of chunks to split the data into (not the number
            of rows per chunk)
    yields:
        each non-empty chunk in .csv format; fewer than ``sample_size``
        chunks are produced when ``df`` has fewer rows than ``sample_size``
    raises:
        ValueError: if ``sample_size`` is less than 1 (raised when iteration
            starts, because this is a generator)
    """
    if sample_size < 1:
        raise ValueError(f"sample_size must be a positive integer, got {sample_size}")

    chunk_size, remainder = divmod(df.shape[0], sample_size)
    start = 0

    for i in range(sample_size):
        end = start + chunk_size + (1 if i < remainder else 0)
        if end == start:
            # The larger chunks come first, so an empty chunk means the data
            # is exhausted; stop instead of yielding header-only CSV text.
            break
        yield df.iloc[start:end, :].to_csv()
        start = end

### data processing

#### Extract data and reduce table

In [6]:
# Read the source table from the CSV file.
df = pd.read_csv('playbyplayv2_0012000001.csv')
# Passed to get_sample as sample_size and reused below as the progress-bar total.
sample_size = 20

# get_sample is a generator: no CSV text is produced until it is iterated.
samples_generator = get_sample(df=df, sample_size=sample_size)

# Prompt template. {text} is filled with one CSV piece per request; the doubled
# braces {{ }} come out as literal braces after str.format().
prompt ="""
You are a professional observer of a basketball game. You always find interesting moments in the data, highlight special moments.
You are given a table in the format .csv:
{text}

Highlight the most interesting points by reducing the size of the table \
    by at least three times. Then describe these points as a professional \
        observer. In the end give only your overview
The data should be in the following format:
{{
    "source": This should be the most important data from the table. There should always be a field "PERIOD"
    "report" should be your review of this data. Start the report with a list of important events
}}

"""

# Collect one model reply (a string) per CSV piece, in iteration order.
processed_data = list()
for table in tqdm(samples_generator, total = sample_size ):
    prompt_for_model = prompt.format(text=table)
    # One get_completion call (one API request) per piece.
    res = get_completion(prompt_for_model, model="gpt-3.5-turbo")
    processed_data.append(res)
    
    

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [12:03<00:00, 36.18s/it]


In [9]:
# Persist the collected model replies to 'output.json' as a single JSON document.
with open('output.json', 'w') as f:
    json.dump(processed_data, f)
