In [16]:
import ast

import pandas as pd
from dotenv import load_dotenv
from langchain_core.output_parsers import CommaSeparatedListOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableMap, RunnableLambda, RunnablePassthrough
from langchain_openai.chat_models import ChatOpenAI

load_dotenv()

True

1. Load the Dataset and inspect it

In [17]:
# Load the forwards dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('dataset/serie_a_forwards.csv')
# Preview the first rows to check the available columns and value formats.
df.head()

Unnamed: 0,full_name,country_code,role,season,team,mins_played,xG,goals,xA,assists,xT,xT_from_passes,xT_from_carries,won_dribblings,attempted_dribblings,won_dribblings_ratio,offensive_aero
0,Zlatan Ibrahimovic,SWE,FW,2021,Milan,1008,6.252449,8.0,2.012261,3.0,0.794123,0.635827,0.158296,5.0,9.0,0.5556,68.9437
1,Fabio Quagliarella,ITA,FW,2021,Sampdoria,1658,4.259178,4.0,1.83709,3.0,0.63027,0.585217,0.045052,11.0,23.0,0.4783,38.1917
2,Cristiano Ronaldo,PRT,FW,2021,Juventus,31,0.236411,,0.028131,,-0.01824,-0.017597,-0.000644,1.0,1.0,1.0,
3,Goran Pandev,MKD,FW,2021,Genoa,835,0.464921,,1.186817,1.0,0.627832,0.349661,0.278171,10.0,22.0,0.4545,40.5377
4,Stefano Okaka,ITA,FW,2021,Udinese,20,0.033823,,0.001525,1.0,0.016907,0.002524,0.014382,,,,54.5864


In [18]:
# List the column names so we know which stats are available to rank and report on.
df.columns

Index(['full_name', 'country_code', 'role', 'season', 'team', 'mins_played',
       'xG', 'goals', 'xA', 'assists', 'xT', 'xT_from_passes',
       'xT_from_carries', 'won_dribblings', 'attempted_dribblings',
       'won_dribblings_ratio', 'offensive_aero'],
      dtype='object')

2. Data preprocessing. Here you can add what you think might be useful to have a better seasonal analysis. Examples:
 - over / under performance indicators
 - normalizations
 - percentiles 
 - rankings


In [19]:
# Percentile rank of each stat, computed only against players of the same season.
cols_to_pct = [
    'xG', 'goals', 'xA', 'assists', 'xT', 'xT_from_passes',
    'xT_from_carries', 'won_dribblings', 'attempted_dribblings',
    'won_dribblings_ratio', 'offensive_aero',
]

# Rank every stat in a single grouped pass, then attach each result
# to the frame as `<stat>_percentile`.
season_pct = df.groupby('season')[cols_to_pct].rank(pct=True)
for stat in cols_to_pct:
    df[f'{stat}_percentile'] = season_pct[stat]

# Round all float columns (raw stats and the new percentiles) to two decimals.
float_cols = df.select_dtypes(include=['float']).columns
df[float_cols] = df[float_cols].round(decimals=2)

In [20]:
# Ten highest-xG players of the 2023 season, shown next to their goals
# and the matching in-season percentiles.
(
    df.loc[df['season'] == 2023]
    .sort_values(by='xG', ascending=False)
    .loc[:, ['full_name', 'season', 'xG', 'xG_percentile', 'goals', 'goals_percentile']]
    .head(10)
)

Unnamed: 0,full_name,season,xG,xG_percentile,goals,goals_percentile
496,Lautaro Martínez,2023,15.59,1.0,23.0,1.0
545,Dusan Vlahovic,2023,15.21,0.99,15.0,0.99
495,Marcus Thuram,2023,12.5,0.99,10.0,0.92
564,Olivier Giroud,2023,11.15,0.98,12.0,0.98
578,Romelu Lukaku,2023,10.32,0.98,10.0,0.92
575,Khvicha Kvaratskhelia,2023,10.26,0.97,10.0,0.92
489,Victor Osimhen,2023,10.07,0.96,11.0,0.96
532,Ciro Immobile,2023,9.62,0.96,6.0,0.78
570,Paulo Dybala,2023,9.4,0.95,12.0,0.98
574,Nikola Krstovic,2023,9.24,0.94,5.0,0.72



3. Write a prompt template to extract the information you need: each player's full name and the season of interest.

Your goal here is to have the model generate a structured output from which you can parse the relevant information (dict, list, comma-separated string...)

Optional: To make the app flexible enough to also compare players, think of a way to extract multiple players at once


In [21]:
# Extraction prompt: asks the model to return every (player, season) pair found in
# the user question as a Python-style list of dicts. The doubled braces are literal
# braces escaped for the prompt template; `{input}` is the only placeholder.
PLAYER_INFO_TEMPLATE = '''
You are a words extractor from a given input. 
Your job is to extract words following instructions and providing them in a specific format.
Given a user question you must extract:
- Player full names
- Seasons

Provide the output as a list of dictionaries, where each pair player-season is a dict.
Follow this structure: 
[
        {{'full_name': '<player_name_1>' (str), 'season': <season_value_1> (int)}},
        ...{{'full_name': '<player_name_N>' (str), 'season': <season_value_N> (int)}}]

The season indication is a 4 digits number, indicating the year of the first matchday (e.g. 2020, 2021, 2022 and so on).
Note that the current season is 2023.

Example: Paulo Dybala seasonal performance in 2022. 
Output: [{{'full_name': 'Paulo Dybala', 'season': 2022}}]

Here is the input to process: 
{input}
'''


# Wrap the extraction prompt; the user question is its single input variable.
player_info_template = PromptTemplate(
    input_variables=['input'],
    template=PLAYER_INFO_TEMPLATE,
)

# Deterministic chat model (temperature 0) with a short timeout and up to three retries.
llm = ChatOpenAI(
    temperature=0,
    max_retries=3,
    timeout=4,
    model='gpt-3.5-turbo',
)

# Extraction chain: fill the prompt, then send it to the model.
info_chain = player_info_template | llm


4. Test the extraction step! Look at the output and then find a way to parse it into a python data structure

In [22]:
# NOTE: `input` shadows the Python builtin; the name is kept because the
# report-generation cell further down reads this same variable.
input = 'Compare the seasons of Paulo Dybala in 2022 and Dusan Vlahovic in 2021'
result = info_chain.invoke({'input': input})

# The prompt asks for a Python-literal list of dicts. `ast.literal_eval` parses
# literals only, so model output can never execute code the way `eval` would.
res = ast.literal_eval(result.content)

In [23]:
# Confirm the parsed result and its container/element types.
print(res, type(res), type(res[0]))

[{'full_name': 'Paulo Dybala', 'season': 2022}, {'full_name': 'Dusan Vlahovic', 'season': 2021}] <class 'list'> <class 'dict'>


5. Filter the player you want to analyze (select the correct season!) from the dataframe

In [24]:
# One row per extracted player/season dict; this frame is the lookup key for the merge below.
players_to_filter = pd.DataFrame(res)
players_to_filter


Unnamed: 0,full_name,season
0,Paulo Dybala,2022
1,Dusan Vlahovic,2021


In [25]:
# Keep only the dataset rows whose (full_name, season) pair was requested.
filtered_df = df.merge(players_to_filter, how='inner', on=['full_name', 'season'])

6. Check your filtered df

In [26]:
# Display the merged rows to verify the right player/season pairs were selected.
filtered_df

Unnamed: 0,full_name,country_code,role,season,team,mins_played,xG,goals,xA,assists,...,goals_percentile,xA_percentile,assists_percentile,xT_percentile,xT_from_passes_percentile,xT_from_carries_percentile,won_dribblings_percentile,attempted_dribblings_percentile,won_dribblings_ratio_percentile,offensive_aero_percentile
0,Dusan Vlahovic,SRB,FW,2021,Fiorentina,1861,12.05,17.0,2.75,2.0,...,0.97,0.82,0.44,0.6,0.64,0.6,0.63,0.62,0.6,0.95
1,Dusan Vlahovic,SRB,FW,2021,Juventus,1077,5.67,7.0,0.39,1.0,...,0.68,0.39,0.16,0.45,0.13,0.51,0.43,0.54,0.17,0.9
2,Paulo Dybala,ARG,FW,2022,Roma,1751,9.69,12.0,4.08,6.0,...,0.91,0.92,0.91,0.93,0.92,0.92,0.78,0.76,0.56,0.06


7. Transform your filtered dataframe into a list of dict (Hint: .to_dict(orient='records'))

In [27]:
# One dict per row (column name -> value), ready to be injected into the report prompt.
players_data = filtered_df.to_dict(orient='records')

8. Write a prompt template to generate the final report given:
- user input
- extracted data
- metrics description (you can load them from  the script utils/descriptions.py)

Use the prompting techniques described in the lecture if you think they might fit. There is no right or wrong solution here.
Prompt engineering is a trial and error process, try different things and be creative!

In [28]:
from utils.descriptions import Descriptions
# Report prompt. Placeholders: `{input}` (user question), `{data}` (player records)
# and `{stat_description}` (metric definitions).
# The section count now says "three parts" so it agrees with the three sections listed.
OUTPUT_TEMPLATE = '''
You are a soccer data analyst. You are provided with a user input and useful advanced metrics to answer the question.
If multiple players are provided, try to perform a comparison between them.

User input: {input}
Data: {data}

Use these definitions to better understand the metrics you are provided: 
{stat_description}

If a player has played in multiple teams in the same season, try compare the two situations when possible. 
Make your analysis as a report. 
Divide your analysis in three parts:

1. Creation: talk about assists and xA, xT and dribblings skill when notable
2. Finishing: talk about goals and xG and how they compare (over / under performance?) and about areial dominance (use AERO elo value!)
3. Overall conclusion, tell who is better overall in case of comparisons 

Make it a coherent text not a bullet point list. Put a title, and paragraph names well formatted.
'''


# Report prompt: `input` and `data` are supplied per call, while the metric
# definitions are pre-filled once from Descriptions.stats.
output_template = PromptTemplate(
    input_variables=['input', 'data'],
    partial_variables={'stat_description': Descriptions.stats},
    template=OUTPUT_TEMPLATE,
)

# Rebinds `llm` with a longer timeout than the one used for the extraction chain.
llm = ChatOpenAI(
    temperature=0,
    max_retries=3,
    timeout=30,
    model='gpt-3.5-turbo',
)

# Report chain: fill the prompt, then send it to the model.
output_chain = output_template | llm

9. Invoke the chain and see results!

In [29]:

# Generate the report from the original question and the filtered player records.
report_inputs = {'input': input, 'data': players_data}
report = output_chain.invoke(report_inputs)

In [30]:
from pprint import pprint
# Pretty-print the report text so the long string is wrapped across lines.
pprint(report.content)

('**Analysis of Paulo Dybala in 2022 vs Dusan Vlahovic in 2021**\n'
 '\n'
 '**Creation Analysis:**\n'
 'When comparing the creation abilities of Paulo Dybala in 2022 and Dusan '
 'Vlahovic in 2021, we can see that Dybala had a higher xA (Expected Assists) '
 "value of 4.08 compared to Vlahovic's xA of 0.39. This indicates that Dybala "
 'was more involved in creating goal-scoring opportunities for his team '
 "through his passes and chances created. Additionally, Dybala's xT (Expected "
 "Threat) value of 2.27 was significantly higher than Vlahovic's xT of 0.17, "
 "showing that Dybala's actions from carries and passes were more threatening "
 "in terms of increasing his team's scoring probability. Dybala also had a "
 'higher number of successful dribblings with a ratio of 0.5 compared to '
 "Vlahovic's ratio of 0.41, showcasing Dybala's skill in beating opponents in "
 'one-on-one situations.\n'
 '\n'
 '**Finishing Analysis:**\n'
 'In terms of finishing, Dybala scored 12 goals with a

10. Put it all together in a single chain

In [31]:
def parse_player_info(extracted_dict)->list:
    """Parse the extraction model's reply into a list of player/season dicts.

    Args:
        extracted_dict: Message object whose ``content`` attribute holds the
            model's textual reply, expected to be a Python-literal list of
            dicts such as ``[{'full_name': 'Paulo Dybala', 'season': 2022}]``.

    Returns:
        The parsed list. A reply consisting of a single dict is wrapped in a
        one-element list so callers always receive a list.

    Raises:
        ValueError, SyntaxError: If the reply is not a valid Python literal.
    """
    # literal_eval accepts literals only, so model output cannot execute
    # arbitrary code the way eval() would.
    parsed = ast.literal_eval(extracted_dict.content.strip())
    if isinstance(parsed, dict):
        parsed = [parsed]
    return parsed

def filter_and_format_data(extracted_dict: list)->list:
    """Select the dataset rows matching the extracted player/season pairs.

    Args:
        extracted_dict: List of dicts, each with ``full_name`` and ``season``
            keys, as produced by ``parse_player_info``.

    Returns:
        One dict per matching row of the module-level ``df`` (column name ->
        value). Returns an empty list when nothing was extracted.
    """
    # An empty extraction yields a frame without 'full_name'/'season' columns,
    # on which the merge would raise KeyError; report "no data" instead.
    if not extracted_dict:
        return []
    players_to_filter = pd.DataFrame(extracted_dict)
    filtered_df = pd.merge(df, players_to_filter, on=['full_name', 'season'], how='inner')
    return filtered_df.to_dict(orient='records')
    

In [32]:
# End-to-end chain. The map runs two branches on the same request dict:
#   - "input": passes the user question through unchanged
#   - "data":  extracts player/season pairs with the model, parses them and
#              looks up the matching rows in the dataset
# Its output dict feeds the report prompt, whose text is sent to the model.
# The mapping is passed to RunnableMap directly, so no wrapper key has to be
# unwrapped by an extra lambda step before the prompt.
seasonal_report_chain = (
    RunnableMap(
        {
            "input": lambda x: x['input'],
            "data": (
                player_info_template
                | llm
                | parse_player_info
                | filter_and_format_data
            ),
        }
    )
    | output_template
    | llm
)


In [33]:
# Run the full chain. Tags are part of the run config and must be a list of
# strings; passed as a bare `tags=` keyword they are not applied to the run.
res = seasonal_report_chain.invoke(
    {'input': "Compare the seasonal performance of Cristiano Ronaldo and Romelu Lukaku in 2020"},
    config={'tags': ['test']},
)
pprint(res.content)

("**Analysis of Cristiano Ronaldo and Romelu Lukaku's Seasonal Performance in "
 '2020**\n'
 '\n'
 '**Creation:**\n'
 'When it comes to creating goal-scoring opportunities, Romelu Lukaku had a '
 'standout season in 2020. He recorded 11 assists, showcasing his ability to '
 "not only score goals but also set up his teammates effectively. Lukaku's xA "
 '(Expected Assists) value of 6.34 indicates that he was involved in creating '
 'high-quality chances for his team.\n'
 '\n'
 'On the other hand, Cristiano Ronaldo also had a solid season in terms of '
 'creation. While he recorded fewer assists (3) compared to Lukaku, his xA '
 'value of 3.42 suggests that he was also effective in setting up goal-scoring '
 "opportunities for his teammates. Ronaldo's xT (Expected Threat) value of "
 '3.81 indicates that his actions, whether from passes or carries, contributed '
 "significantly to his team's scoring probability.\n"
 '\n'
 'In terms of dribbling skills, both players had a similar success 