In [1]:
%load_ext autoreload
%autoreload 2

## Generating summaries

In [2]:
import os
while 'notebooks' in os.getcwd():
    os.chdir("..")

import numpy as np
import pandas as pd 
from src.utils import train_test_split, get_sample_weights, get_eval_set
from src.preprocessing import preprocess_data
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel, AutoModelForCausalLM
from src.preprocessing import TextDataset
import torch
from torch.utils.data import DataLoader, Dataset
from IPython.display import clear_output
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, roc_auc_score
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, LoggingHandler
import logging
from copy import deepcopy
from sklearn.decomposition import PCA
from huggingface_hub import notebook_login
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
import transformers
import re
from src.utils import aggregate_samples, evaluate_model, compute_class_weights, remove_hashtag_links, get_first_texts
from torch.optim.lr_scheduler import ReduceLROnPlateau
import ast

import asyncio
from openai import OpenAI

tqdm.pandas()

import nest_asyncio
nest_asyncio.apply()

In [3]:
train_data, test_data = train_test_split()


100%|██████████| 16/16 [00:08<00:00,  1.87it/s]


In [4]:
# Combine the training frames returned by `train_test_split` into one DataFrame.
df = pd.concat(train_data)

# Tokenize every tweet with the Longformer tokenizer; `progress_apply` (enabled by
# `tqdm.pandas()` in the import cell) shows a progress bar.
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096", )
df['tokens'] = df['Tweet'].progress_apply(tokenizer.tokenize)

# Football-related keywords used to decide which tweets are kept for summarisation.
target_words = [
    "goal", "penalty", "halftime", "full-time", "yellow", "red",
    "kickoff", "extra time", "stoppage time", "foul", "offside", "handball",
    "save", "tackle", "dribble", "corner", "substitution", "header",
    "free kick", "throw-in", "assist", "hat-trick", "own goal", "victory",
    "defeat", "draw", "win", "loss", "tie", "comeback", "goalkeeper",
    "striker", "midfielder", "defender", "referee", "fans", "var", "gooal"
]
# The keywords are joined into one string and tokenized with the same tokenizer, so
# the resulting set can be compared token-by-token with `df['tokens']`.
# NOTE(review): because the joined string is sub-word tokenized, multi-word and
# hyphenated entries presumably contribute fragments (e.g. a bare hyphen, "time",
# "free") to the set — confirm the filter is as selective as intended.
target_words = set(tokenizer.tokenize(" ".join(target_words)))

def is_valid_text(t):
    """Return True when at least one token of ``t`` is in ``target_words``.

    ``t`` is an iterable of tokens; ``target_words`` is the module-level keyword
    token set built just above.
    """
    return any(token in target_words for token in t)

df['is_valid']= df['tokens'].progress_apply(is_valid_text)
# df['lan'] = df['Tweet'].progress_apply(lambda x : langid.classify(x)[0])

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 1472980/1472980 [01:24<00:00, 17360.61it/s]
100%|██████████| 1472980/1472980 [00:01<00:00, 1001996.15it/s]


In [5]:
valid_df = df.query("is_valid == 1")

## Checking timestamp

In [6]:
# Earliest and latest tweet timestamp for every (MatchID, PeriodID) pair.
timestamps_by_period = valid_df.groupby(["MatchID", "PeriodID"])["Timestamp"]
first_ts = timestamps_by_period.min()
last_ts = timestamps_by_period.max()

In [8]:
possible_indices = set(train_data.keys())

# Held-out matches are fixed by hand so the split is identical on every run.
test_indices = [13,1,18]
all_train_indices = list(possible_indices.difference(set(test_indices)))
# NOTE(review): match 1 appears in both `test_indices` and `val_indices`, and
# `train_df` below is built from every match in `possible_indices`, so the three
# frames overlap — confirm this is intended before comparing scores across them.
val_indices = [1,5,12,19]

# Aggregate the keyword-filtered tweets per match for each split.
train_df = aggregate_samples(valid_df, list(possible_indices), max_tweet_size = 10)
test_df = aggregate_samples(valid_df, test_indices, max_tweet_size = 10)
val_df = aggregate_samples(valid_df, val_indices, max_tweet_size=10)

# Clean each aggregated frame with the project helper before prompting.
train_df = remove_hashtag_links(train_df)
test_df = remove_hashtag_links(test_df)
val_df = remove_hashtag_links(val_df)

  return (df.query(f"MatchID in {indices}")).groupby(["MatchID", "PeriodID"]).agg({
  return (df.query(f"MatchID in {indices}")).groupby(["MatchID", "PeriodID"]).agg({
  return (df.query(f"MatchID in {indices}")).groupby(["MatchID", "PeriodID"]).agg({


In [10]:
base_prompt = ''' 


You are a helpful AI tasked with analyzing a collection of tweets posted during a single minute of a football match. Your goal is to generate a concise summary of the key events that occurred during this time and to specifically answer whether any of the following events occurred: 

1. A goal (including who scored, if mentioned).
2. A yellow or red card (including the player or team, if mentioned).
3. A kickoff (start of a half or after a goal).
4. Halftime or fulltime whistle.

Here are the tweets:

{tweets}

### Instructions:
1. Analyze the tweets for clear indications of the above events using common football-related terminology, phrases, or hashtags.
2. If the event is ambiguous or not explicitly stated in the tweets, mark it as "Not mentioned."
3. Summarize any additional significant match events or fan reactions from the tweets that are relevant to understanding the minute.

Return a detailed summary of the events, and mention if there were any cards, goals halftime or kickoff in that match. Be as precise as possible and say the name of the player if possible
Try to reason as much as possible and avoid simple and quick answers

'''

In [11]:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [12]:
# The DeepInfra token is read from the environment so the secret is never stored in
# the notebook. Set DEEPINFRA_API_KEY before starting the kernel; a missing variable
# fails fast with a KeyError.
# NOTE(review): the key that was previously committed in this cell should be revoked.
client = OpenAI(
    api_key=os.environ["DEEPINFRA_API_KEY"],
    base_url="https://api.deepinfra.com/v1/openai",
)


In [13]:
# OpenAI-compatible client pointed at DeepInfra. The token comes from the
# DEEPINFRA_API_KEY environment variable instead of being hardcoded in the notebook;
# a missing variable fails fast with a KeyError.
# NOTE(review): the key that was previously committed in this cell should be revoked.
client = OpenAI(
    api_key=os.environ["DEEPINFRA_API_KEY"],
    base_url="https://api.deepinfra.com/v1/openai",
)

def _usage_field(usage, field):
    """Read ``field`` from a usage record that may be an object, a dict or None."""
    if usage is None:
        return None
    if isinstance(usage, dict):
        return usage.get(field)
    return getattr(usage, field, None)


async def create_completion(prompt, model_name, stream):
    """Send one chat-completion request for ``prompt`` through the global ``client``.

    The blocking ``client.chat.completions.create`` call runs in a worker thread
    (``asyncio.to_thread``) so several requests can be awaited together.

    Args:
        prompt: Text sent as the single user message.
        model_name: Model identifier forwarded to the API.
        stream: When true, chunks are printed as they arrive and nothing is returned.

    Returns:
        A dict with ``generated_text``, ``prompt_tokens`` and ``output_tokens`` when
        ``stream`` is false. ``None`` when streaming, or when the request raised
        (the error is printed, not re-raised, so one bad request does not abort
        a whole batch).
    """
    try:
        chat_completion = await asyncio.to_thread(
            client.chat.completions.create,
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            stream=stream,
        )

        if stream:
            # The stream is consumed synchronously on the calling thread.
            for event in chat_completion:
                # A chunk may carry no choices at all; skip it.
                if not event.choices:
                    continue
                choice = event.choices[0]
                if choice.finish_reason:
                    # Usage is read the same way as in the non-streaming branch
                    # (attribute access), while still tolerating a plain dict or
                    # a missing usage record.
                    usage = getattr(event, "usage", None)
                    print(
                        choice.finish_reason,
                        _usage_field(usage, "prompt_tokens"),
                        _usage_field(usage, "completion_tokens"),
                    )
                elif choice.delta.content is not None:
                    print(choice.delta.content)
            return None  # Streaming output is only printed, never collected.

        usage = chat_completion.usage
        return {
            "generated_text": chat_completion.choices[0].message.content,
            "prompt_tokens": usage.prompt_tokens,
            "output_tokens": usage.completion_tokens,
        }
    except Exception as e:
        print(f"Error during OpenAI API request: {e}")
        return None

async def process_completions(study_df, model_name, stream, batch_size=1_000):
    """Generate one LLM completion per row of ``study_df``.

    Rows are handled in consecutive batches of ``batch_size``; the requests of one
    batch are awaited together with ``asyncio.gather``. Request coroutines are
    created per batch rather than all up front, so an error part-way through does
    not leave never-awaited coroutines behind.

    Args:
        study_df: DataFrame with a ``Tweet`` column (inserted into ``base_prompt``)
            and an ``EventType`` column (copied to ``label``).
        model_name: Model identifier passed to ``create_completion``.
        stream: Passed to ``create_completion``.
        batch_size: Number of requests awaited together.

    Returns:
        A list of dicts with keys ``generated_text``, ``text``, ``text_idx`` (the
        row's index value) and ``label``. Rows whose request returned ``None`` are
        left out; their count is printed at the end.
    """
    generated_texts = []
    skipped = 0
    indexed_rows = list(enumerate(study_df.iterrows()))

    for batch_start in range(0, len(indexed_rows), batch_size):
        batch = indexed_rows[batch_start:batch_start + batch_size]
        results = await asyncio.gather(*[
            create_completion(base_prompt.format(tweets=row['Tweet']), model_name, stream)
            for _, (_, row) in batch
        ])

        for (i, (text_idx, row)), result in zip(batch, results):
            if result is None:
                skipped += 1
                continue

            generated_text = result['generated_text']
            prompt_tokens = result['prompt_tokens']
            output_tokens = result['output_tokens']

            # Only the most recent completion stays visible in the cell output.
            clear_output(wait=True)
            print(
                f"""
                    Generation of text index = {i}
                    Generated text = {generated_text}
                    Number of prompt tokens = {prompt_tokens}
                    Number of output tokens = {output_tokens}
                    """
            )

            generated_texts.append({
                "generated_text": generated_text,
                "text": row['Tweet'],
                "text_idx": text_idx,
                "label": row["EventType"]
            })

    # In streaming mode every result is None by design, so only report real gaps.
    if skipped and not stream:
        print(f"{skipped} of {len(indexed_rows)} rows returned no completion and were skipped")

    return generated_texts

async def main():
    """Summarise every row of ``train_df`` with ``model_name`` (non-streaming).

    Returns the list produced by ``process_completions``; requests are awaited in
    batches of 1000.
    """
    return await process_completions(train_df, model_name, stream=False, batch_size=1000)

# Run the async main function and keep its result (a list of dicts, one per
# summarised row). `asyncio.run` is called from inside the notebook, which relies on
# `nest_asyncio.apply()` from the import cell — presumably because the kernel already
# runs an event loop (TODO confirm).
generated_texts = asyncio.run(main())


                    Generation of text index = 2136
                    Generated text = After analyzing the tweets, I've identified the following key events that occurred during the single minute of the football match:

1. **Goal**: Yes, multiple goals were mentioned in the tweets. The scores mentioned are:
	* 3-1 (twice)
	* 1-3
This suggests that the team that was trailing 1-3 eventually scored a goal to bring the score to 3-1, but the original score of 3-1 is unclear.
2. **Yellow or Red Card**: Not mentioned.
3. **Kickoff**: Not mentioned. The tweets seem to be referring to a continuation of the match rather than the start of a new half.
4. **Halftime or Fulltime Whistle**: Not mentioned.

Additionally, significant fan reactions and match events that can be inferred from the tweets are:

* The team scored multiple goals, with the final score being 3-1 (at least initially).
* The team's goalkeeper, Ochoa, was commended for their performance ( twitter post "Ochoa lost his virginity t

In [16]:
pd.DataFrame(generated_texts).to_pickle("summary.pkl")

In [None]:
# `process_completions` returns a list of dicts; wrap it in a DataFrame before adding
# columns (a no-op copy if it already is one).
generated_texts = pd.DataFrame(generated_texts)
# A summary counts as a positive prediction when it contains "yes" anywhere.
generated_texts['preds'] = generated_texts['generated_text'].str.lower().str.contains("yes").astype(int)

In [None]:
# `text_idx` is a string such as "(10, 60)" when the frame was reloaded from CSV, but
# already a tuple when it comes straight from `process_completions`. Only parse
# strings: `ast.literal_eval` raises on values that are not strings.
generated_texts['text_idx'] = generated_texts['text_idx'].apply(
    lambda idx: ast.literal_eval(idx) if isinstance(idx, str) else idx
)

In [None]:
generated_texts['match_id'] = generated_texts['text_idx'].apply(lambda x: x[0])
generated_texts['period_id'] = generated_texts['text_idx'].apply(lambda x: x[1])



In [None]:
generated_texts.groupby("match_id").apply(lambda x: accuracy_score(x['label'], x['preds']))

In [None]:
print(generated_texts.query("match_id == 1")['generated_text'].iloc[69])

In [None]:
generated_texts.set_index("period_id").query("match_id == 1")[['label', 'preds']].plot()

### Generating predictions

In [None]:
eval_df = get_eval_set()

tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
eval_df['tokens'] = eval_df['Tweet'].progress_apply(tokenizer.tokenize)

target_words = [
    "goal", "penalty", "halftime", "full-time", "yellow", "red",
    "kickoff", "extra time", "stoppage time", "foul", "offside", "handball",
    "save", "tackle", "dribble", "corner", "substitution", "header",
    "free kick", "throw-in", "assist", "hat-trick", "own goal", "victory",
    "defeat", "draw", "win", "loss", "tie", "comeback", "goalkeeper",
    "striker", "midfielder", "defender", "referee", "fans", "var", "gooal"
]
target_words = set(tokenizer.tokenize(" ".join(target_words)))

def is_valid_text(t):
    """Tell whether any token in ``t`` belongs to ``target_words``.

    Same check as the ``is_valid_text`` defined earlier in this notebook, evaluated
    against the ``target_words`` set rebuilt in this cell.
    """
    return any(token in target_words for token in t)

eval_df['is_valid']= eval_df['tokens'].progress_apply(is_valid_text)
# df['lan'] = df['Tweet'].progress_apply(lambda x : langid.cl

In [None]:
eval_df['EventType'] = -1

In [None]:
# Aggregate the keyword-filtered evaluation tweets for every match in the eval set.
# NOTE(review): `aggregate_samples`' signature is not visible here; `max_tweet_size`
# is the keyword used by the training-cell calls that actually ran in this notebook,
# whereas this cell previously passed `max_size`.
preprocessed_eval_df = aggregate_samples(
    eval_df.query("is_valid == 1"),
    eval_df.MatchID.unique().tolist(),
    max_tweet_size=10,
)

preprocessed_eval_df = remove_hashtag_links(preprocessed_eval_df)

In [None]:
preprocessed_eval_df

In [None]:
async def main():
    """Summarise every row of ``preprocessed_eval_df`` with ``model_name`` (non-streaming).

    This rebinds the ``main`` defined earlier in the notebook, which targets
    ``train_df``. Returns the list produced by ``process_completions``; requests
    are awaited in batches of 1000.
    """
    return await process_completions(
        preprocessed_eval_df, model_name, stream=False, batch_size=1000
    )

# Run the async main function
generated_texts = asyncio.run(main())

In [None]:
eval_df_generated = pd.DataFrame(generated_texts)

In [None]:
idx = np.random.randint(0, len(eval_df_generated))
print(eval_df_generated['text'].iloc[idx])

In [None]:
print(eval_df_generated['generated_text'].iloc[idx])

In [None]:
eval_df_generated['EventType'] = eval_df_generated['generated_text']\
    .str\
    .lower()\
    .str\
    .contains("yes")\
    .astype(int)

In [None]:
eval_df_generated['MatchID'] = eval_df_generated['text_idx'].apply(lambda x : x[0])
eval_df_generated['PeriodID'] = eval_df_generated['text_idx'].apply(lambda x : x[1])

In [None]:
x = eval_df.drop_duplicates("ID")[['ID', "MatchID", "PeriodID"]]

In [None]:
# Start from the full (ID, MatchID, PeriodID) key table so every ID ends up in the
# submission. Periods without a prediction — filtered out by `is_valid`, or skipped
# because the request returned nothing — would silently vanish in an inner join.
submission = pd.merge(
    x,
    eval_df_generated[["MatchID", "PeriodID", "EventType"]],
    on=["MatchID", "PeriodID"],
    how="left",
)
# NOTE(review): periods with no prediction default to 0 ("no event") — confirm this
# is the desired fallback.
submission["EventType"] = submission["EventType"].fillna(0).astype(int)

submission[["ID", "EventType"]]\
    .set_index("ID")\
    .to_csv("predictions_3.csv")

In [None]:
eval_df_generated['EventType'].value_counts(normalize=True)

In [None]:
df = pd.read_csv("generated_summary_llm_2.csv", index_col = 0)

In [None]:
import ast
df['text_idx']= df['text_idx'].apply(ast.literal_eval)

In [None]:
df['prediction'] = df['generated_text'].str.lower().str.contains("yes")

In [41]:
# Inspect one random false positive: label 0 but the summary was read as "yes".
false_positives = df.query("label == 0 and prediction == 1")
idx = np.random.randint(0, len(false_positives))
print(false_positives['text_idx'].iloc[idx])
print(false_positives['generated_text'].iloc[idx])

(10, 60)
After analyzing the tweets, I have the following summary:

```
{
    "summary": "The minute saw a series of intense matches between Germany and Argentina, with both teams showing significant aggression and skill. Argentina's goalkeeper was praised for their good game, while Germany's fans expressed frustration with their team's performance. At the end of the minute, the score was still 0-0, but fans were already anticipating a goal.",
    "goal": "no",
    "cards": "yes",
    "kickoff": "no",
    "halftime": "yes",
    "fulltime": "no"
}
```

Here's the explanation for each event:

1. **Goal**: There is no clear indication of a goal scored in that exact minute. Some tweets mention that Argentina "nicked it" or that Germany "was so close", but these are speculative and do not explicitly mention a goal. [Tweet that refers to the goal: None]

2. **Cards**: There is a mention of "foul trouble" for Germany, which implies that there were some yellow cards given out, but this does no

In [38]:
print(df.query("label == 0 and prediction == 1")['text_idx'].iloc[idx])
print(df.query("label == 0 and prediction == 1")['text'].iloc[idx])

(17, 13)
Welcome back Pepe! 
Also   
Let's go  !!!!
good luck for 
Let's go  !
  -  
LET'S GO  AND !!!
Switching over to  
 &  lets goooooooo!
do not disappoint me 
Kick off :  v 
 &  all fucking day
I'm With   & 
Do or die!   
I'm prediction -2 -1
 and -3 -1
Its do or die lets go! 
 vs  
 vs  
nice and sunny for  - 
ima need  to beat  today!!
One  goal and we can relax
THE BAE HES SO BEAUTIFYL  
Games underway
 vs 
 vs   
"0" kick off  0-0  Live TVONE"
If only  had a World Class Striker..!!
For   back in the first 11.

Come on  !! Min 2-0!! Let's goo!!
lmao no one on the left side 
eder is awful why is he there 
Watching  V  ..Hoping for nothing :(
We can do this, I know we can 
Which channel is the  vs  match on?

I really want  n  to win today
hope  qualified for the next round ! 
I may be the only person watching  
hoy voy con    y  
Game on ! Btw   |  
Let's go  we need the win, 3-0 
I hope  wins and I hope  wins
