In [1]:
import json
import os
import time
from typing import Iterator

from dotenv import load_dotenv
import openai
import pandas as pd
from tqdm import tqdm

# Read the local 'openai.env' file so the API key does not have to be hard-coded in the notebook.
load_dotenv('openai.env')
# os.getenv returns None when OPENAI_API_KEY is not set, so a missing key only surfaces on the first API call.
openai.api_key  = os.getenv('OPENAI_API_KEY')

In [15]:
# Preview the raw play-by-play CSV; the result is displayed only, not bound to a name.
pd.read_csv('playbyplayv2_0012000001.csv')

Unnamed: 0,GAME_ID,EVENTNUM,EVENTMSGTYPE,EVENTMSGACTIONTYPE,PERIOD,WCTIMESTRING,PCTIMESTRING,HOMEDESCRIPTION,NEUTRALDESCRIPTION,VISITORDESCRIPTION,...,PLAYER2_TEAM_NICKNAME,PLAYER2_TEAM_ABBREVIATION,PERSON3TYPE,PLAYER3_ID,PLAYER3_NAME,PLAYER3_TEAM_ID,PLAYER3_TEAM_CITY,PLAYER3_TEAM_NICKNAME,PLAYER3_TEAM_ABBREVIATION,VIDEO_AVAILABLE_FLAG
0,12000001,2,12,0,1,7:11 PM,12:00,,Start of 1st Period (7:11 PM EST),,...,,,0,0,,,,,,0
1,12000001,4,10,0,1,7:11 PM,12:00,Jump Ball Capela vs. Vucevic: Tip to Young,,,...,Magic,ORL,4,1629027,Trae Young,1.610613e+09,Atlanta,Hawks,ATL,1
2,12000001,7,5,1,1,7:11 PM,11:46,Young Bad Pass Turnover (P1.T1),,Fournier STEAL (1 STL),...,Magic,ORL,0,0,,,,,,1
3,12000001,9,2,63,1,7:11 PM,11:27,,,MISS Fournier 3PT Fadeaway Jumper,...,,,0,0,,,,,,1
4,12000001,10,4,0,1,7:11 PM,11:23,Capela REBOUND (Off:0 Def:1),,,...,,,0,0,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,12000001,762,3,12,4,9:27 PM,0:07,,,Bone Free Throw 2 of 2 (5 PTS),...,,,0,0,,,,,,1
537,12000001,763,9,1,4,9:27 PM,0:07,HAWKS Timeout: Regular (Full 7 Short 0),,,...,,,0,0,,,,,,0
538,12000001,764,2,1,4,9:29 PM,0:04,MISS Goodwin 25' 3PT Jump Shot,,,...,,,0,0,,,,,,1
539,12000001,765,4,0,4,9:29 PM,0:01,,,Mane REBOUND (Off:0 Def:2),...,,,0,0,,,,,,1


### utils

In [13]:
def get_completion(prompt: str, model="gpt-3.5-turbo", temperature: float = 0.7) -> str:
    """
    Send a single-turn prompt to the chat completion endpoint and return the reply text.

    Args:
        prompt: instructions, sent as the only message with role "user"
        model: name of the chat model to query
        temperature: degree of randomness of the model's output; the default
            of 0.7 is the value that used to be hard-coded here
    return:
        content of the first choice's message
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    # Only the first choice is read; no `n` is passed, so any extra choices would be ignored.
    return response.choices[0].message["content"]

def get_sample(df: pd.DataFrame, sample_size=50) -> Iterator[str]:
    """
    Split the data into consecutive, nearly equal row chunks and yield each as CSV text.

    Despite its name, ``sample_size`` is the number of chunks, not the number
    of rows per chunk. The first ``len(df) % sample_size`` chunks receive one
    extra row, so every row appears in exactly one chunk and the original row
    order is kept.

    Args:
        df: data to split
        sample_size: number of chunks to split the rows into; must be >= 1
    Yields:
        each non-empty chunk serialised with ``DataFrame.to_csv()``
        (header and index included). When ``sample_size`` exceeds the number
        of rows, fewer than ``sample_size`` chunks are produced instead of
        padding the output with header-only tables.
    Raises:
        ValueError: if ``sample_size`` is smaller than 1. Because this is a
            generator, the error is raised when iteration starts.
    """
    if sample_size < 1:
        raise ValueError(f"sample_size must be a positive integer, got {sample_size}")

    chunk_size, remainder = divmod(df.shape[0], sample_size)
    start = 0

    for i in range(sample_size):
        # The first `remainder` chunks absorb the rows left over by the integer division.
        end = start + chunk_size + (1 if i < remainder else 0)
        if end == start:
            # Only reachable when chunk_size == 0: every row has been emitted
            # and all remaining chunks would contain nothing but the header.
            break
        yield df.iloc[start:end, :].to_csv()
        start = end

### data processing

#### Extract data and reduce table 

In [62]:
# Load the raw play-by-play table and split it into `sample_size` chunks;
# each chunk is sent to the model in its own request.
df = pd.read_csv('playbyplayv2_0012000001.csv')
sample_size = 20

# Rate-limit workaround: pause PAUSE_SECONDS before every CALLS_PER_PAUSE-th request.
CALLS_PER_PAUSE = 4
PAUSE_SECONDS = 25

samples_generator = get_sample(df=df, sample_size=sample_size)

# Template for one chunk; `{text}` is replaced by the chunk's CSV text and the
# doubled braces render as literal braces after `.format`.
prompt ="""
You are given a table in the format .csv:
{text}

The data should be in the following format:
{{
    "PERIOD": This contains only the following field: "PERIOD". In format: "PERIOD": 1
    "report" Here is the list of players for which describes what interesting and important they have done.\
            In format: "Player_name": event
}}

"""

# Raw model replies, one string per chunk, in chunk order.
processed_data = list()
# enumerate() supplies the request counter, so no variable named `iter` is
# needed and the builtin iter() stays usable in later cells.
for call_index, table in enumerate(tqdm(samples_generator, total=sample_size)):
    # to bypass the API rate limit (this also pauses before the very first request)
    if call_index % CALLS_PER_PAUSE == 0:
        time.sleep(PAUSE_SECONDS)

    prompt_for_model = prompt.format(text=table)
    res = get_completion(prompt_for_model, model="gpt-3.5-turbo")
    processed_data.append(res)

# with open('output.json', 'w') as f:
#     f.write(json.dumps(cleaned_data))

100%|██████████| 20/20 [07:00<00:00, 21.01s/it]


#### Transform the model output into a DataFrame

In [71]:
# Parse every raw model reply as JSON, turn each into its own frame,
# then stack the frames row-wise into a single table.
reply_frames = [pd.DataFrame(json.loads(raw_reply)) for raw_reply in processed_data]
df_player_event = pd.concat(reply_frames, axis=0)

Unnamed: 0,PERIOD,report
Aaron Gordon,1,MISS Gordon 16' Pullup Jump Shot
Bogdan Bogdanovic,1,Bogdanovic REBOUND (Off:0 Def:1)
Clint Capela,1,Capela REBOUND (Off:0 Def:2)
De'Andre Hunter,1,Hunter Free Throw 3 of 3 (3 PTS)
Dwayne Bacon,1,Bacon 1' Running Dunk (2 PTS) (Gordon 1 AST)
...,...,...
Karim Mane,4,STEAL
Nathan Knight,4,REBOUND
Robert Franks,4,Free Throw
Skylar Mays,4,Personal Take Foul


#### splitting data into periods and aggregation by players (in the period)


In [74]:
# Template for one period; `{data_period}` is replaced by that period's rows as CSV text.
players_summarize_prompt = """
                    Aggregate the data for each of the players. Data: {data_period}.\
                        The following format is available:
    {{
        "Player": here are aggregated all the events in which the player participated  
    }}

"""

# Raw model replies, one per period, in the order the periods first appear in the data.
aggr_for_all_periods = [] 

# The progress-bar total is derived from the data, so games with a different
# number of periods (e.g. overtime) are reported correctly.
periods = df_player_event.PERIOD.unique()

#  processing data for each period
for period in tqdm(periods, total=len(periods)): 
    # Separate names for the request payload and the model reply.
    period_csv = df_player_event.query(f'PERIOD=={period}').to_csv()
    period_summary = get_completion(players_summarize_prompt.format(data_period=period_csv), model="gpt-3.5-turbo")
    aggr_for_all_periods.append(period_summary)
    

100%|██████████| 4/4 [03:11<00:00, 47.93s/it]


#### Transform period data into a DataFrame

In [93]:
# Parse each per-period reply as JSON and build one row per reply, then
# transpose so that rows are the JSON keys and columns are the replies.
# Columns are labelled by position ('period 1', 'period 2', ...) with a callable
# mapper, so any number of periods gets a readable label.
# NOTE(review): the label is the reply's position in aggr_for_all_periods, not
# the PERIOD value itself — confirm the two always line up.
df_periods = (
    pd.DataFrame([json.loads(data) for data in aggr_for_all_periods])
    .T
    .rename(columns=lambda position: f'period {position + 1}')
)


Unnamed: 0,period 1,period 2,period 3,period 4
Aaron Gordon,"[MISS Gordon 16' Pullup Jump Shot, Got a Rebou...",,,
Bogdan Bogdanovic,"[Bogdanovic REBOUND (Off:0 Def:1), Missed two ...","[Bogdanovic REBOUND (Off:1 Def:2), 7' Turnarou...","[SUB: Reddish FOR Bogdanovic, Bogdanovic Bad P...",[SUB: Reddish FOR Bogdanovic]
Clint Capela,"[Capela REBOUND (Off:0 Def:2), Made a Reverse ...",[Capela REBOUND (Off:4 Def:3)],[Capela REBOUND (Off:4 Def:6)],"[REBOUND (Off:4 Def:8), Bad Pass Turnover (P2...."
De'Andre Hunter,"[Hunter Free Throw 3 of 3 (3 PTS), Missed a 3P...","[MISS Hunter Free Throw 2 of 2, Hunter BLOCK (...","[Hunter REBOUND (Off:1 Def:2), SUB: Huerter FO...","[4' Driving Finger Roll Layup (15 PTS), 1' Cut..."
Dwayne Bacon,"[Bacon 1' Running Dunk (2 PTS) (Gordon 1 AST),...","[Out of Bounds Lost Ball Turnover (P1.T7), SUB...",[Vucevic 9' Turnaround Jump Shot (10 PTS) (Bac...,
Evan Fournier,"[Fournier P.FOUL (P1.T2) (S.Wall), Stole the b...","[Fournier 26' 3PT Pullup Jump Shot (11 PTS), V...","[MISS Fernando 5' Driving Layup, Jump Ball Fer...",
Markelle Fultz,"[Fultz S.FOUL (P1.T3) (S.Wall), Got a Rebound,...",[Fultz REBOUND (Off:0 Def:2)],[Fultz 1' Reverse Layup (9 PTS) (Vucevic 4 AST...,
Nikola Vucevic,"[MISS Vucevic 25' 3PT Jump Shot, Got a Rebound...",[Vucevic 13' Turnaround Jump Shot (2 PTS) (Gor...,[Vucevic 9' Turnaround Jump Shot (10 PTS) (Bac...,
Trae Young,"[Young P.FOUL (P2.T2) (S.Wall), Missed a Drivi...","[MISS Young 10' Floating Jump Shot, Out of Bou...","[Young Bad Pass Turnover (P4.T11), Vucevic STE...","[STEAL (1 STL), SUB: Goodwin FOR Young]"
Cam Reddish,"[Committed a Shooting Foul, Reddish 19' Pullup...","[SUB: Reddish FOR Capela, Reddish REBOUND (Off...","[MISS Reddish 3PT Jump Shot, Reddish 22' Step ...","[SUB: Reddish FOR Bogdanovic, 1' Cutting Layup..."


#### Aggregation of player data for the entire game

In [4]:

# NOTE(review): 'output.csv' is not written by any cell visible in this notebook —
# presumably a saved copy of the per-period table; confirm where it comes from.
df_periods= pd.read_csv('output.csv').rename(columns={'Unnamed: 0':'player'})

# `to_csv` must be called: without the parentheses the f-string embeds the
# bound method's repr instead of the table as CSV text. index=False leaves out
# the meaningless 0..n-1 row index, since the player name is already a column.
total_summarize_prompt = f"""In this table, some player entries can be broken down into several entries. Make each line contain a unique player
Table: {df_periods.to_csv(index=False)}
    Output:
    json 
    {{
        "Player": here are aggregated all the events in which the player participated  
    }}
    """

# Single request for the whole game; the reply is kept as raw text.
summaraize_data_all_periods = get_completion(total_summarize_prompt, model="gpt-3.5-turbo")

In [5]:
# Number of characters in the model's whole-game reply.
len(summaraize_data_all_periods)

5501

#### Fix JSON

In [7]:
# Ask the model to repair the whole-game reply so it can be parsed with json.loads.
# The prompt has its own name so the chunk-level `prompt` template defined
# earlier in the notebook is not overwritten.
json_fix_prompt = f"""
This json has an incorrect structure. Json: {summaraize_data_all_periods}. Correct this structure so that without errors you can make json.loads(data)
"""
# Send the repair prompt itself, not the aggregation prompt.
json_correct = get_completion(json_fix_prompt, model="gpt-3.5-turbo")

{
    "Aaron Gordon": [
        "MISS Gordon 16' Pullup Jump Shot",
        "Got a Rebound",
        "MISS Vucevic 25' 3PT Jump Shot",
        "Got a Rebound",
        "MISS 3PT Jump Shot",
        "Got a Rebound"
    ],
    "Bogdan Bogdanovic": [
        "Bogdanovic REBOUND (Off:0 Def:1)",
        "Missed the Shot",
        "Bogdanovic REBOUND (Off:1 Def:2)",
        "7' Turnaround Jump Shot (2 PTS)",
        "SUB: Reddish FOR Bogdanovic"
    ],
    "Clint Capela": [
        "Capela REBOUND (Off:0 Def:2)",
        "Made a Reverse Layup (2 PTS)",
        "Capela REBOUND (Off:4 Def:3)",
        "Capela REBOUND (Off:4 Def:6)",
        "REBOUND (Off:4 Def:8)"
    ],
    "De'Andre Hunter": [
        "Hunter Free Throw 3 of 3 (3 PTS)",
        "Missed a Free Throw",
        "Hunter BLOCK (1 BLK)",
        "Hunter REBOUND (Off:1 Def:2)",
        "SUB: Huerter FOR Hunter",
        "Hunter REBOUND (Off:1 Def:2)",
        "SUB: Huerter FOR Hunter",
        "Hunter REBOUND (Off:1 Def:2)",
      

In [14]:
# Display the current per-period table for inspection.
df_periods

Unnamed: 0,player,period 1,period 2,period 3,period 4
0,Aaron Gordon,"[""MISS Gordon 16' Pullup Jump Shot"", 'Got a Re...",,,
1,Bogdan Bogdanovic,"['Bogdanovic REBOUND (Off:0 Def:1)', 'Missed t...","['Bogdanovic REBOUND (Off:1 Def:2)', ""7' Turna...","['SUB: Reddish FOR Bogdanovic', 'Bogdanovic Ba...",['SUB: Reddish FOR Bogdanovic']
2,Clint Capela,"['Capela REBOUND (Off:0 Def:2)', 'Made a Rever...",['Capela REBOUND (Off:4 Def:3)'],['Capela REBOUND (Off:4 Def:6)'],"['REBOUND (Off:4 Def:8)', 'Bad Pass Turnover (..."
3,De'Andre Hunter,"['Hunter Free Throw 3 of 3 (3 PTS)', 'Missed a...","['MISS Hunter Free Throw 2 of 2', 'Hunter BLOC...","['Hunter REBOUND (Off:1 Def:2)', 'SUB: Huerter...","[""4' Driving Finger Roll Layup (15 PTS)"", ""1' ..."
4,Dwayne Bacon,"[""Bacon 1' Running Dunk (2 PTS) (Gordon 1 AST)...","['Out of Bounds Lost Ball Turnover (P1.T7)', '...","[""Vucevic 9' Turnaround Jump Shot (10 PTS) (Ba...",
5,Evan Fournier,"['Fournier P.FOUL (P1.T2) (S.Wall)', 'Stole th...","[""Fournier 26' 3PT Pullup Jump Shot (11 PTS)"",...","[""MISS Fernando 5' Driving Layup"", 'Jump Ball ...",
6,Markelle Fultz,"['Fultz S.FOUL (P1.T3) (S.Wall)', 'Got a Rebou...",['Fultz REBOUND (Off:0 Def:2)'],"[""Fultz 1' Reverse Layup (9 PTS) (Vucevic 4 AS...",
7,Nikola Vucevic,"[""MISS Vucevic 25' 3PT Jump Shot"", 'Got a Rebo...","[""Vucevic 13' Turnaround Jump Shot (2 PTS) (Go...","[""Vucevic 9' Turnaround Jump Shot (10 PTS) (Ba...",
8,Trae Young,"['Young P.FOUL (P2.T2) (S.Wall)', 'Missed a Dr...","[""MISS Young 10' Floating Jump Shot"", 'Out of ...","['Young Bad Pass Turnover (P4.T11)', 'Vucevic ...","['STEAL (1 STL)', 'SUB: Goodwin FOR Young']"
9,Cam Reddish,"['Committed a Shooting Foul', ""Reddish 19' Pul...","['SUB: Reddish FOR Capela', 'Reddish REBOUND (...","['MISS Reddish 3PT Jump Shot', ""Reddish 22' St...","[""SUB: Reddish FOR Bogdanovic, 1' Cutting Layu..."
