In [1]:
import json
from config.fastf1 import fastf1
from pathlib import Path
import pandas as pd
from pandas.io.formats.style import Styler
from collections.abc import Generator, Callable
import typing
from typing import Any, TypeAlias
import numpy as np
from contextlib import contextmanager
from functools import partial, reduce
import re
import datetime as dt
from tqdm import tqdm
from IPython.display import (
    display, # type: ignore[reportUnknownVariableType]
    Markdown,
)

import random
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

import importlib

from config import config
importlib.reload(config);

import src.utils
importlib.reload(src.utils);
from src.utils import (
    temporary_pandas_options,
    display_full_dataframe,
    hide_index,
    compose,
)

import logging
logging.getLogger('fastf1').setLevel(logging.WARNING)

In [None]:
# Preview the first `n` rows of every raw Reddit frame, each under its own Markdown heading.
# NOTE(review): none of the four *_df frames is created in the visible part of this
# notebook — presumably a loading cell defines them; confirm it runs before this one.
n = 2
display(Markdown('### r/formula1 submissions:'), f1_submissions_df.head(n))
display(Markdown('### r/formula1 comments:'), f1_comments_df.head(n))
display(Markdown('### r/formula1point5 submissions:'), f15_submissions_df.head(n))
display(Markdown('### r/formula1point5 comments:'), f15_comments_df.head(n))

In [5]:
def concatenate_submissions_and_comments(submissions_df: pd.DataFrame, comments_df: pd.DataFrame) -> pd.DataFrame:
    """Stack submissions and comments into one frame with a unified ``text`` column.

    For submissions, ``title`` and ``selftext`` are merged into ``text`` (both
    source columns are dropped). For comments, ``body`` is renamed to ``text``.
    Neither input frame is modified.

    Joining rules for a submission:
      * title and selftext both present: a title ending in a word character is
        followed by ``'. '``, any other title (e.g. ending in ``?`` or ``!``)
        by a single space, then the selftext;
      * only one of them present: that part alone, with no separator and no
        trailing whitespace;
      * missing values (NaN/None) count as empty.

    Args:
        submissions_df: Frame with at least the columns ``title`` and ``selftext``.
        comments_df: Frame with at least the column ``body``.

    Returns:
        A new frame holding the submission rows followed by the comment rows,
        re-indexed from 0.
    """
    # Missing values become empty strings so they neither turn the joined text
    # into NaN nor break the last-character check below.
    titles = submissions_df['title'].fillna('').astype(str).str.rstrip()
    selftexts = submissions_df['selftext'].fillna('').astype(str)

    has_title = titles.ne('')
    has_selftext = selftexts.str.strip().ne('')
    ends_with_word_char = titles.str.contains(r'\w\Z', regex=True)

    # A separator is only inserted when there are two parts to separate.
    separators = pd.Series(
        np.where(ends_with_word_char, '. ', ' '),
        index=titles.index,
    ).where(has_title & has_selftext, '')
    texts = titles + separators + selftexts.where(has_selftext, '')

    submissions = (
        submissions_df
        .drop(columns=['title', 'selftext'])
        .assign(text=texts.to_numpy())
    )
    comments = comments_df.rename(columns={'body': 'text'})

    return pd.concat((submissions, comments), ignore_index=True)

# One combined submissions+comments frame per subreddit.
# NOTE(review): the four input frames are not defined in the visible part of the notebook.
f1_df = concatenate_submissions_and_comments(f1_submissions_df, f1_comments_df)
f15_df = concatenate_submissions_and_comments(f15_submissions_df, f15_comments_df)

In [None]:
# Show the head and the row count of each combined frame.
display(f1_df.head(), len(f1_df))
display(f15_df.head(), len(f15_df))

# Baseline: Rule-Based Prediction Extraction

In [7]:
# Loader with sample_size bound to 5000.
# NOTE(review): _load_ndjson, _load_submissions_df, _load_comments_df and the RAW_* paths
# are not defined in the visible part of the notebook — confirm their cell runs first.
__load_ndjson = partial(_load_ndjson, sample_size=5000)

df = concatenate_submissions_and_comments(
    _load_submissions_df(RAW_FORMULA1_SUBMISSIONS_FILE, __load_ndjson),
    _load_comments_df(RAW_FORMULA1_COMMENTS_FILE, __load_ndjson),
)

# One JSON-encoded value per line: json.dumps escapes embedded newlines, so every
# text stays on a single line. Readers must json.loads each line to get the text back.
with open('f1_random_sample_texts.txt', 'w', encoding='utf-8') as file:
    print(*map(json.dumps, df['text']), sep='\n', file=file)

# Pre-trained model: BERT

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
import torch

# Load data
# The sample file holds one JSON-encoded string per line (written with json.dumps),
# so each line is decoded with json.loads. pd.read_csv cannot be used here: it
# rejects "\n" as a separator and would not undo the JSON escaping anyway.
# `json` comes from the notebook's first import cell.
with open("f1_random_sample_texts.txt", encoding="utf-8") as sample_file:
    data = pd.DataFrame(
        {"text": [json.loads(line) for line in sample_file if line.strip()]}
    )
# A missing text is serialised as NaN by json.dumps; drop such rows, the tokenizer needs strings.
data = data.dropna(subset=["text"]).reset_index(drop=True)

data["label"] = 0  # Assign labels (e.g., 0 = non-winner, 1 = winner)
# Ensure labels align with discussions about drivers (manual or inferred labeling)
# NOTE(review): every row currently gets label 0 — this is a placeholder, not real supervision.

# Preprocess
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def _encode(text: str):
    """Tokenize one text to fixed-length (128) PyTorch tensors."""
    return tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")


# Tokenize each text once and reuse the result for both columns.
encodings = data["text"].map(_encode)
# [0] drops the leading batch dimension of size 1 that return_tensors="pt" adds,
# so each cell holds a 1-D tensor and batches collate to (batch, seq_len).
data["input_ids"] = encodings.map(lambda encoding: encoding["input_ids"][0])
data["attention_mask"] = encodings.map(lambda encoding: encoding["attention_mask"][0])

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data[["input_ids", "attention_mask"]].values,
    data["label"].values,
    test_size=0.2,
    random_state=42
)

# Prepare datasets
class F1Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized texts with integer labels.

    Args:
        texts: Indexable collection whose items are ``(input_ids, attention_mask)``
            pairs (position 0 and 1 of each item).
        labels: Indexable collection of integer class labels, same length as ``texts``.

    Each item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``. Token tensors are always returned 1-D: a leading batch dimension
    of size 1 (as produced by a tokenizer called with ``return_tensors="pt"``)
    is flattened away, so that batching yields ``(batch, seq_len)`` rather than
    ``(batch, 1, seq_len)``.
    """

    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        input_ids, attention_mask = self.texts[idx][0], self.texts[idx][1]
        return {
            "input_ids": torch.as_tensor(input_ids).reshape(-1),
            "attention_mask": torch.as_tensor(attention_mask).reshape(-1),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Wrap the train/validation splits for the Trainer.
train_dataset = F1Dataset(train_texts, train_labels)
val_dataset = F1Dataset(val_texts, val_labels)

# Model
# NOTE(review): the head has 20 outputs ("20 drivers") while the labels assigned in this
# cell are all 0 and described as binary — confirm the intended labeling scheme.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=20)  # 20 drivers

# Training
# Evaluates once per epoch and keeps at most two checkpoints under ./results.
# NOTE(review): `evaluation_strategy` is presumably renamed `eval_strategy` in newer
# transformers releases — verify against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Runs the fine-tuning loop.
trainer.train()


In [30]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# NOTE(review): this first tokenizer/model pair is overwritten a few lines below
# before it is ever used.
model_name = "GroNLP/hateBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


# Load the fine-tuned model and tokenizer
# NOTE(review): "path_to_your_fine_tuned_model" looks like a placeholder; loading will
# presumably fail unless a model has actually been saved under that name.
model_name = "path_to_your_fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Function to classify text
def classify_text(text):
    """Classify ``text`` as "Prediction" or "Not Prediction".

    Uses the notebook-level ``tokenizer`` and ``model`` that are current when
    the function is called.

    Args:
        text: The text to classify.

    Returns:
        A ``(label, probs)`` tuple: ``label`` is "Not Prediction" (class 0) or
        "Prediction" (class 1); ``probs`` is the softmax over the model's logits.
    """
    label_map = {0: "Not Prediction", 1: "Prediction"}

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: without no_grad the forward pass builds an autograd graph
    # and the returned probabilities keep it alive.
    with torch.no_grad():
        outputs = model(**inputs)
    probs = F.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probs, dim=-1).item()
    return label_map[predicted_class], probs

# Example usage: classify one sentence and print the label with the class probabilities.
text = "I think the race will end with car number 7 in the lead."
label, probabilities = classify_text(text)
print(f"Label: {label}, Probabilities: {probabilities}")


# Prediction classification: BERT

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Base checkpoint with a 2-class sequence-classification head.
# NOTE(review): presumably the head is freshly initialised for this checkpoint, so the
# predictions below are not meaningful until the model is fine-tuned — confirm.
# model_name = "GroNLP/hateBERT"
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


# Kept for later: swap in the fine-tuned checkpoint once it exists.
# # Load the fine-tuned model and tokenizer
# model_name = "path_to_your_fine_tuned_model"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Function to classify text
def classify_text(text):
    """Classify ``text`` as "Prediction" or "Not Prediction".

    Uses the notebook-level ``tokenizer`` and ``model`` that are current when
    the function is called.

    Args:
        text: The text to classify.

    Returns:
        A ``(label, probs)`` tuple: ``label`` is "Not Prediction" (class 0) or
        "Prediction" (class 1); ``probs`` is the softmax over the model's logits.
    """
    label_map = {0: "Not Prediction", 1: "Prediction"}

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: without no_grad the forward pass builds an autograd graph
    # and the returned probabilities keep it alive.
    with torch.no_grad():
        outputs = model(**inputs)
    probs = F.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probs, dim=-1).item()
    return label_map[predicted_class], probs

# Example usage
# Classify one sample sentence and print the label with the class probabilities.
text = "I think the race will end with car number 7 in the lead."
label, probabilities = classify_text(text)
print(f"Label: {label}, Probabilities: {probabilities}")


# Predictive power evaluation metrics

In [None]:
# Fetch the event schedule for the configured season and show it in full.
full_schedule = fastf1.get_event_schedule(config.Dataset.YEAR)
with display_full_dataframe():
    display(full_schedule)

In [34]:
# Keep only the events inside the configured date window (both bounds inclusive)
# that use the 'conventional' weekend format.
schedule = full_schedule[
    (full_schedule['EventDate'] >= config.Dataset.START_DATE) &
    (full_schedule['EventDate'] <= config.Dataset.END_DATE) &
    (full_schedule['EventFormat'] == 'conventional') # TODO: Skip sprint weekends for now. Also include sprint weekends later
]
display(schedule)

In [None]:
# TODO: Example
# Worked example for one event (the last one in `schedule`): select the posts
# written from one day before the first session up to the start of the fifth
# session, then load the race classification.
event = schedule.iloc[-1]
first_post_at = typing.cast(dt.datetime, event['Session1DateUtc']) - dt.timedelta(days=1)
last_post_at = typing.cast(dt.datetime, event['Session5DateUtc'])
# between() is inclusive on both ends.
posts_df = f1_df[f1_df['created_utc'].between(first_post_at, last_post_at)]

with display_full_dataframe():
    display(posts_df.head(8), len(posts_df))

# Only the results table is needed, so skip laps, telemetry, weather and messages.
race_session = event.get_session('Race')
race_session.load(laps=False, telemetry=False, weather=False, messages=False)

driver_full_names = race_session.results['FullName']
top20 = pd.DataFrame(
    {
        # Sized from the actual classification instead of assuming exactly 20 drivers.
        # NOTE(review): assumes `results` is ordered by finishing position — confirm.
        'Pos': range(1, len(driver_full_names) + 1),
        'DriverFullName': driver_full_names,
    }
)
with display_full_dataframe():
    display(hide_index(top20))

# For each event, the (up to) five events that precede it in `schedule`.
# Positional slicing with a start clamped at 0: a negative start would count
# from the end of the frame and yield wrong windows for the first events.
for index in range(len(schedule)):
    last_five_events = schedule.iloc[max(index - 5, 0):index]

In [None]:

# For every selected event: print its name and date, the start time of each
# session, and the first 20 rows of the race classification.
for _, event in schedule.iterrows():
    event_name = event['EventName']
    event_date = event['EventDate']
    print(f"Event: {event_name} on {event_date}")

    # Retrieve session start times
    for session_name in ['Practice 1', 'Practice 2', 'Practice 3', 'Qualifying', 'Race']:
        session = event.get_session(session_name)
        if session is not None:
            print(f"  {session_name} starts at {session.date}")

    # Load race session to get results
    race = event.get_session('Race')
    if race is not None:
        # Only the results table is used below; skip laps, telemetry, weather and messages.
        race.load(laps=False, telemetry=False, weather=False, messages=False)
        results = race.results
        # Display top 20 results
        top_20 = results.head(20)
        print("  Top 20 Race Results:")
        # Count rows explicitly: the index of the results table is not a 0-based
        # finishing order, so it cannot be used to derive the position.
        # NOTE(review): assumes `results` is ordered by finishing position — confirm.
        for position, (_driver_key, driver) in enumerate(top_20.iterrows(), start=1):
            print(f"    Position {position}: {driver['FullName']} ({driver['TeamName']})")

In [None]:
from transformers import pipeline

from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

# Sentiment demo on a single sentence, first through the high-level pipeline and
# then "by hand" (tokenizer -> model -> softmax) to expose the per-class scores.
text = "Max verstappen is going to win"
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
sentiment_task = pipeline("sentiment-analysis", model=MODEL, tokenizer=MODEL)
# Show the pipeline result; as a bare mid-cell expression it was silently discarded.
display(sentiment_task(text))

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model_config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# NOTE(review): save_pretrained receives this path as its target; despite the
# '.pt' suffix it is presumably created as a directory — confirm the intended name.
model.save_pretrained(config.MODELS_DIR / 'sentiment_model.pt')

encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
# output[0] holds the logits; [0] selects the single sequence in the batch.
scores = softmax(output[0][0].detach().numpy())

# Print labels and scores
print(scores)
ranking = np.argsort(scores)[::-1]  # class ids, most probable first
print(ranking)
for rank, class_id in enumerate(ranking, start=1):
    label_name = model_config.id2label[class_id]
    print(f"{rank}) {label_name} {np.round(float(scores[class_id]), 4)}")


In [None]:
# Inspect the tokenizer output that was fed to the sentiment model.
print(encoded_input)

In [None]:
import sys
import urllib.request
import spacy
from symspellpy import SymSpell, Verbosity

# Initialize spaCy
# Provides tokenization (token.is_alpha, token.whitespace_) for the spelling correction below.
nlp = spacy.load("en_core_web_sm")

# Initialize SymSpell
# The same edit-distance limit is used when building the dictionary and when looking words up.
MAX_DICTIONARY_EDIT_DISTANCE = 4
sym_spell = SymSpell(max_dictionary_edit_distance=MAX_DICTIONARY_EDIT_DISTANCE, prefix_length=7)



def download_file(path, url):
    """Download ``url`` to ``path`` unless ``path`` already exists.

    Args:
        path: Destination ``pathlib.Path``. If it exists, nothing is downloaded.
        url: Source URL.

    Raises:
        RuntimeError: If the download fails. The message contains the
            underlying error, which is also attached as ``__cause__``.
    """
    if path.exists():
        return

    try:
        print('INFO: downloading english word dictionary...')
        urllib.request.urlretrieve(url, path)
        print('downloading complete!!! :)')
    except Exception as error:
        # Remove a partially written file so the next call retries the download
        # instead of mistaking the fragment for a complete file.
        path.unlink(missing_ok=True)
        raise RuntimeError(f'Download failed: {error}') from error


# Unigram frequency dictionary: one "<word> <frequency>" pair per line.
# (Alternative, not used: the bigram dictionary frequency_bigramdictionary_en_243_342.txt
# from the same repository, whose lines are "<word> <word> <frequency>".)
english_words_dictionary_file = config.DATA_DIR / 'english_words_dictionary.txt'
download_file(english_words_dictionary_file, 'https://raw.githubusercontent.com/wolfgarbe/SymSpell/refs/heads/master/SymSpell/frequency_dictionary_en_82_765.txt')

# 'utf-8-sig' reads plain UTF-8 unchanged and strips a byte-order mark if the
# file has one, so a BOM can never end up glued to the first word.
with open(english_words_dictionary_file, 'r', encoding='utf-8-sig') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines (e.g. at the end of the file)
        word, frequency = line.strip().split()
        sym_spell.create_dictionary_entry(word, int(frequency))

# Domain vocabulary gets the maximum frequency so it outranks ordinary English
# words among equally close suggestions.
# NOTE(review): F1_VOCABULARY is not defined in the visible part of the notebook.
for word in F1_VOCABULARY:
    sym_spell.create_dictionary_entry(word, sys.maxsize)

# Function to correct spelling
def correct_spelling_symspell(word):
    """Return the closest dictionary term for ``word``, or ``word`` itself if none is found.

    Looks ``word`` up in the notebook-level ``sym_spell`` dictionary with
    ``Verbosity.CLOSEST`` and an edit distance of at most
    ``MAX_DICTIONARY_EDIT_DISTANCE``; the first suggestion wins.
    """
    candidates = sym_spell.lookup(
        word,
        Verbosity.CLOSEST,
        max_edit_distance=MAX_DICTIONARY_EDIT_DISTANCE,
    )
    if not candidates:
        return word
    return candidates[0].term

# Function to correct spelling in a sentence using spaCy
def correct_spelling_in_text_spacy(text):
    """Correct the spelling of the alphabetic words in ``text``, keeping its layout.

    The text is tokenized with the notebook-level spaCy pipeline ``nlp``.
    Purely alphabetic tokens are passed through ``correct_spelling_symspell``;
    every other token (punctuation, numbers, ...) is kept as is. Each token is
    followed by its original trailing whitespace, so spacing is preserved.

    Args:
        text: The text to correct.

    Returns:
        The corrected text.
    """
    doc = nlp(text)
    # token.whitespace_ is the token's trailing whitespace ('' when there is none).
    return ''.join(
        (correct_spelling_symspell(token.text) if token.is_alpha else token.text) + token.whitespace_
        for token in doc
    )

In [None]:
# Smoke test on a sentence with misspelled driver names, punctuation and a non-ASCII character.
correct_spelling_in_text_spacy('Mx Verstappening and Charls Lecerc are, just like this, a very good (/bad) example of drivers... and, voilà!')

In [None]:
# Keep only the posts that mention one of the three drivers, then spell-correct them.
driver_pattern = re.compile(r'Verstappen|Hamilton|Leclerc')

# na=False treats a missing text as "no match" instead of producing a NaN mask.
# .copy() detaches the selection from f1_df, so the column assignment below
# modifies an independent frame rather than a slice of the original.
filtered_f1_df = f1_df[f1_df["text"].str.contains(driver_pattern, na=False)].copy()

# Before correction
with display_full_dataframe():
    display(filtered_f1_df.head(3))

filtered_f1_df['text'] = filtered_f1_df['text'].apply(correct_spelling_in_text_spacy)

# After correction
with display_full_dataframe():
    display(filtered_f1_df.head(3))
    print(len(filtered_f1_df.index))

In [None]:
# Posts longer than this many characters are skipped.
# NOTE(review): presumably chosen to stay under the sentiment model's input limit — confirm.
MAX_TEXT_LENGTH = 1493
# The pipeline reports the most likely label together with its confidence. The
# confidence alone says nothing about polarity, so it is signed by the label:
# a confident "negative" must lower a driver's score, not raise it.
SENTIMENT_SIGN = {"positive": 1.0, "neutral": 0.0, "negative": -1.0}

driver_scores = {"Verstappen": [], "Hamilton": [], "Leclerc": []}

for post_text in tqdm(filtered_f1_df["text"]):
    if len(post_text) > MAX_TEXT_LENGTH:
        continue
    mentioned_drivers = [driver for driver in driver_scores if driver in post_text]
    if not mentioned_drivers:
        continue  # nothing to attribute the sentiment to; skip the model call
    top_sentiment = sentiment_task(post_text)[0]
    signed_score = SENTIMENT_SIGN[top_sentiment['label'].lower()] * top_sentiment['score']
    for driver in mentioned_drivers:
        driver_scores[driver].append(signed_score)

# Aggregate scores
# A driver without any scored post gets NaN explicitly (np.mean([]) would warn).
final_scores = {
    driver: float(np.mean(scores)) if scores else float('nan')
    for driver, scores in driver_scores.items()
}

In [None]:
# Per-post scores collected for each driver, followed by their per-driver mean.
print(driver_scores)
print(final_scores)

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from scipy.special import softmax

# Load the ABSA model and tokenizer
# (aspect-based sentiment: the model scores the sentiment of a text towards a given aspect)
model_name = "yangheng/deberta-v3-base-absa-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# NOTE(review): `classifier` is not used anywhere else in this cell.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)


# Earlier variant of the sample data (misspelled names and driver abbreviations), kept for reference:
# comments = ['Sainz is loving this upgraded car, good top 3 for the race tomorrow! I disagree with you, verstappening will definitely finish first. I think BOT will finish behind NOR, who will probably finish 7th. That\'s my opinion at least... I predict that the RedBulls with finish 1-2. Nah, the Danish driver from Haas will almost certainly finish in points! Stroll on the podium and Vettel in points. I like cookies!']
# driver_list = ['Sainz', 'verstappening', 'BOT', 'NOR', 'Bert', 'Ernie']
comments = ['Carlos Sainz is loving this upgraded car, good top 3 for the race tomorrow! I disagree with you, Max Verstappen will definitely finish first. I think BOT will finish behind NOR, who will probably finish 7th. That\'s my opinion at least... I predict that the RedBulls with finish 1-2. Nah, the Danish driver from Haas will almost certainly finish in points! Stroll on the podium and Vettel in points. I like cookies!']
driver_list = ['Carlos Sainz', 'Max Verstappen']

# Running sums of the class probabilities per driver, plus the number of comments that mention the driver.
results = {driver: {"positive": 0.0, "neutral": 0.0, "negative": 0.0, "count": 0} for driver in driver_list}

for comment in comments:
    # A driver counts as mentioned only on an exact, case-sensitive substring match.
    found_drivers = [driver for driver in driver_list if driver in comment]

    for aspect in found_drivers:
        # The comment and the aspect are passed to the tokenizer as a text pair.
        inputs = tokenizer(comment, aspect, return_tensors="pt", truncation=True, padding=True)

        with torch.no_grad():
            outputs = model(**inputs)

        scores = outputs.logits[0].numpy()
        probabilities = softmax(scores)

        # NOTE(review): assumes the logits are ordered (negative, neutral, positive) —
        # confirm against model.config.id2label.
        results[aspect]["positive"] += probabilities[2]
        results[aspect]["neutral"] += probabilities[1]
        results[aspect]["negative"] += probabilities[0]
        results[aspect]["count"] += 1

# Turn the sums into per-driver averages (drivers without mentions stay at 0.0).
for driver, sentiment in results.items():
    if sentiment["count"] > 0:
        sentiment["positive"] /= sentiment["count"]
        sentiment["neutral"] /= sentiment["count"]
        sentiment["negative"] /= sentiment["count"]

for driver, sentiment in results.items():
    if sentiment["count"] > 0:
        print(f"{driver}: [Positive: {sentiment['positive']:.4f}, Neutral: {sentiment['neutral']:.4f}, Negative: {sentiment['negative']:.4f}]")
    else:
        print(f"{driver}: No mentions found.")

In [None]:
# Collapse each mentioned driver's averaged sentiment into one number: positive minus negative.
final_scores = []
for driver, sentiment in results.items():
    if sentiment["count"] > 0:
        sentiment_score = (sentiment["positive"] - sentiment["negative"])
        # NOTE(review): .title() rewrites the capitalisation of the name; a later cell looks
        # these names up against 'DriverFullName' — verify the two spellings always agree.
        final_scores.append((driver.title(), sentiment_score))

# Sort drivers by positive - negative score (descending order)
final_scores.sort(key=lambda x: x[1], reverse=True)

# Print sorted results
print("Drivers ranked by (positive - negative score):")
for driver, score in final_scores:
    print(f"{driver}: {score:.4f}")


In [None]:
# TODO: load last 5 races

# Blend the sentiment scores with a score derived from recent race results.
N_HISTORICAL_RACES = 5
SENTIMENT_WEIGHT = 0.6
HISTORY_WEIGHT = 0.4

historical_data = top20
historical_scores = {name: 0.0 for name in historical_data["DriverFullName"]}

for _ in range(N_HISTORICAL_RACES):
    # Placeholder until the real last races are loaded (see TODO): the same
    # classification is counted once per "race".
    historical_data = top20

    # Map the finishing position linearly onto [-1, 1]: first place -> +1,
    # last place -> -1. The span follows the size of the classification instead
    # of assuming 20 drivers; max(..., 1) avoids dividing by zero for one driver.
    position_span = max(len(historical_data) - 1, 1)
    for name, position in zip(historical_data["DriverFullName"], historical_data["Pos"]):
        historical_scores[name] += 1 - ((position - 1) / position_span) * 2

# Average over the races.
historical_scores = {
    name: total / N_HISTORICAL_RACES for name, total in historical_scores.items()
}

final_scores_dict = dict(final_scores)

print(final_scores_dict)
print(historical_scores)

# Drivers with a sentiment score get the weighted blend; drivers without one
# fall back to their historical score alone.
final_prediction = []
for driver, historical_score in historical_scores.items():
    if driver in final_scores_dict:
        combined_score = SENTIMENT_WEIGHT * final_scores_dict[driver] + HISTORY_WEIGHT * historical_score
        final_prediction.append((driver, combined_score))
    else:
        final_prediction.append((driver, historical_score))

final_prediction.sort(key=lambda x: x[1], reverse=True)

for pos, (driver, score) in enumerate(final_prediction, start=1):
    print(f"{driver} finishes in position:{pos}      {score:.4f}")

# GLiNER

In [None]:
# %pip installs into the environment of the running kernel; `!pip` runs whichever
# pip is first on the shell PATH, which may belong to a different environment.
# NOTE(review): consider pinning a version for reproducibility.
%pip install gliner

In [None]:
import os
# Hugging Face cache location. setdefault keeps an HF_HOME that is already
# configured in the environment and only falls back to the local default, so
# the notebook does not force one machine's path on everyone.
# NOTE(review): presumably this only takes effect if it is set before the Hugging Face
# libraries are first imported in the kernel — confirm.
os.environ.setdefault("HF_HOME", "C:\\cache")
from gliner import GLiNER
# NOTE(review): this rebinds the notebook-level name `model` used by earlier cells.
model = GLiNER.from_pretrained('urchade/gliner_medium-v2.1')

In [None]:
# Hand-written sample mixing full names, a misspelling, driver abbreviations and a non-prediction sentence.
text = 'Carlos Sainz is loving this upgraded car, good top 3 for the race tomorrow! I disagree with you, verstappening will definitely finish first. I think BOT will finish behind NOR, who will probably finish 7th. That\'s my opinion at least... I predict that the RedBulls with finish 1-2. Nah, the Danish driver from Haas will almost certainly finish in points! Stroll on the podium and Vettel in points. I like cookies!'

# First two rows of the combined r/formula1 frame, used as a tiny test set below.
df = f1_df.iloc[:2]


def has_prediction(post: pd.Series, threshold: float = 0.01) -> bool:
    """Tell whether the post's text contains at least one 'position' entity.

    Args:
        post: A row (or any mapping) whose ``'text'`` entry is the text to scan.
        threshold: Minimum score passed on to the notebook-level entity model.

    Returns:
        ``True`` if the model reports one or more 'position' entities, else ``False``.
    """
    # TODO: very low threshold
    detected_positions = model.predict_entities(
        post['text'],
        ['position'],
        threshold=threshold,
    )
    entity_count = len(detected_positions)
    return entity_count > 0

# has_prediction expects a whole row (it reads post['text']), so it is applied
# row-wise. The flag goes into its own column: the text stays available, and
# assign() returns a new frame instead of writing into a slice of f1_df.
df = df.assign(has_prediction=df.apply(has_prediction, axis=1))
display(df)

# Entity extraction on the hand-written sample text.
labels = ['racer', 'position']

entities = model.predict_entities(text, labels, threshold=0.01) # TODO: very low threshold

for entity in entities:
    print(entity['text'], '=>', entity['label'])