In [None]:
import time
import json
from pathlib import Path
import pandas as pd
from pandas.io.formats.style import Styler
from collections.abc import Generator, Callable
import typing
from typing import Any, TypeAlias
import numpy as np
from contextlib import contextmanager
from functools import partial, reduce
import re
import datetime as dt
from tqdm import tqdm
import pickle
from IPython.display import (
    display, # type: ignore[reportUnknownVariableType]
    Markdown,
)
import importlib
import spacy

from config.fastf1 import fastf1
import fastf1.events as fastf1_events
from config import config
importlib.reload(config);
from src.data.loader import stream_ndjson, load_submissions_df, load_comments_df
import src.data.preprocessing as preprocessing
importlib.reload(preprocessing);
import src.data.constants as dataset_constants
import src.utils
importlib.reload(src.utils);
from src.utils import (
    temporary_pandas_options,
    display_full_dataframe,
    hide_index,
    compose,
)
from src import utils
utils.set_random_seeds()

import logging
logging.getLogger('fastf1').setLevel(logging.WARNING)

DEVICE = utils.get_device()

In [3]:
def _load_posts_df(submissions_file, comments_file, limit: int | None = None, in_place: bool = True) -> pd.DataFrame:
    """Load one subreddit's submissions and comments and concatenate them.

    Shared implementation behind `load_f1_df` and `load_f15_df`, which
    previously duplicated this body and differed only in the raw files used.

    Args:
        submissions_file: `dataset_constants.RawFile` member for the submissions dump.
        comments_file: `dataset_constants.RawFile` member for the comments dump.
        limit: Forwarded to `stream_ndjson` as its `limit` argument.
        in_place: Forwarded to `preprocessing.concatenate_submissions_and_comments`.

    Returns:
        The frame returned by `preprocessing.concatenate_submissions_and_comments`.
    """
    ndjson_streamer = partial(stream_ndjson, limit=limit)

    # Submissions are loaded before comments, as in the original functions.
    return preprocessing.concatenate_submissions_and_comments(
        submissions_df=load_submissions_df(submissions_file, ndjson_streamer),
        comments_df=load_comments_df(comments_file, ndjson_streamer),
        in_place=in_place,
    )


def load_f1_df(limit: int | None = None, in_place: bool = True) -> pd.DataFrame:
    """Load the r/formula1 submissions and comments as a single frame.

    Args:
        limit: Forwarded to `stream_ndjson` as its `limit` argument.
        in_place: Forwarded to `preprocessing.concatenate_submissions_and_comments`.
    """
    return _load_posts_df(
        dataset_constants.RawFile.FORMULA1_SUBMISSIONS,
        dataset_constants.RawFile.FORMULA1_COMMENTS,
        limit=limit,
        in_place=in_place,
    )


def load_f15_df(limit: int | None = None, in_place: bool = True) -> pd.DataFrame:
    """Load the r/formula1point5 submissions and comments as a single frame.

    Args:
        limit: Forwarded to `stream_ndjson` as its `limit` argument.
        in_place: Forwarded to `preprocessing.concatenate_submissions_and_comments`.
    """
    return _load_posts_df(
        dataset_constants.RawFile.FORMULA1POINT5_SUBMISSIONS,
        dataset_constants.RawFile.FORMULA1POINT5_COMMENTS,
        limit=limit,
        in_place=in_place,
    )

In [4]:
# Number of records passed as `limit` to each NDJSON streamer for this preview.
PREVIEW_LIMIT = 100

f1_ndjson_streamer = partial(stream_ndjson, limit=PREVIEW_LIMIT)
f15_ndjson_streamer = partial(stream_ndjson, limit=PREVIEW_LIMIT)

# r/formula1: keep the raw frames around so they can be displayed separately.
f1_submissions_df = load_submissions_df(
    dataset_constants.RawFile.FORMULA1_SUBMISSIONS,
    f1_ndjson_streamer,
)
f1_comments_df = load_comments_df(
    dataset_constants.RawFile.FORMULA1_COMMENTS,
    f1_ndjson_streamer,
)

# r/formula1point5: same procedure on the second subreddit's files.
f15_submissions_df = load_submissions_df(
    dataset_constants.RawFile.FORMULA1POINT5_SUBMISSIONS,
    f15_ndjson_streamer,
)
f15_comments_df = load_comments_df(
    dataset_constants.RawFile.FORMULA1POINT5_COMMENTS,
    f15_ndjson_streamer,
)

# Combined submission + comment frames, one per subreddit.
f1_df = preprocessing.concatenate_submissions_and_comments(f1_submissions_df, f1_comments_df)
f15_df = preprocessing.concatenate_submissions_and_comments(f15_submissions_df, f15_comments_df)

In [None]:
n = 4

# Heading / frame pairs, shown in this order.
raw_frame_previews = (
    ('### r/formula1 submissions:', f1_submissions_df),
    ('### r/formula1 comments:', f1_comments_df),
    ('### r/formula1point5 submissions:', f15_submissions_df),
    ('### r/formula1point5 comments:', f15_comments_df),
)

with display_full_dataframe():
    for heading, frame in raw_frame_previews:
        display(Markdown(heading), frame.head(n))

In [None]:
n = 3

# Heading / frame pairs for the combined (submissions + comments) frames.
post_frame_previews = (
    ('### r/formula1 posts:', f1_df),
    ('### r/formula1point5 posts:', f15_df),
)

with display_full_dataframe():
    for heading, frame in post_frame_previews:
        display(Markdown(heading), frame.head(n))

# Baseline: Rule-Based Prediction Extraction

In [7]:
# TODO: implement the rule-based baseline for prediction extraction.

# FastF1 historical data

In [None]:
full_schedule = fastf1.get_event_schedule(dataset_constants.YEAR)

# Events whose date lies between START_DATE and END_DATE (both inclusive).
event_dates = full_schedule['EventDate']
is_in_date_range = (
    (event_dates >= dataset_constants.START_DATE) &
    (event_dates <= dataset_constants.END_DATE)
)

# TODO: sprint weekends are skipped for now; include them later.
is_conventional_weekend = full_schedule['EventFormat'] == 'conventional'

schedule = typing.cast(
    fastf1_events.EventSchedule,
    full_schedule[is_in_date_range & is_conventional_weekend],
)

# Show the last three events that survived the filter.
with display_full_dataframe():
    display(schedule.iloc[-3:])

In [None]:
# Use the last event of the filtered schedule as the race weekend of interest.
race_weekend = schedule.iloc[-1]

# Window: from one day before the first session up to the fifth session's timestamp.
first_post_at = typing.cast(dt.datetime, race_weekend['Session1DateUtc']) - dt.timedelta(days=1)
last_post_at = typing.cast(dt.datetime, race_weekend['Session5DateUtc'])

# `between` is inclusive on both ends, matching the original >= / <= comparison.
is_in_window = f1_df['created_utc'].between(first_post_at, last_post_at)
posts_df = f1_df[is_in_window]

def get_top20(race_weekend: fastf1_events.Event) -> pd.DataFrame:
    """Return the race classification of an event as a two-column frame.

    Loads the event's 'Race' session with laps, telemetry, weather and
    messages disabled, then selects the `FullName` and `Position` columns of
    the session results, casting `Position` to `np.uint8`.

    Args:
        race_weekend: The event whose 'Race' session is loaded.

    Returns:
        A frame with the columns `FullName` and `Position`.
    """
    session = race_weekend.get_session('Race')
    # Only the results table is needed, so the optional data sets are switched off.
    session.load(laps=False, telemetry=False, weather=False, messages=False)

    wanted_columns = ['FullName', 'Position']
    return session.results[wanted_columns].astype({'Position': np.uint8})

# One results frame per event in the filtered schedule, in schedule order.
# The generator variable is named `event` so it is not confused with the
# module-level `race_weekend` defined above.
top20s = tuple(
    get_top20(typing.cast(fastf1_events.Event, event))
    for _, event in schedule.iterrows()
)

latest_top20 = top20s[-1]
display(hide_index(latest_top20))

# Pre-trained models

# Sentiment score:

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from scipy.special import softmax

# (tokenizer, model) pairs already loaded in this kernel, keyed by
# (model name, device string), so repeated calls do not reload the weights.
_ABSA_MODEL_CACHE = {}


def _load_absa_model(model_name, device):
    """Return the (tokenizer, model) pair for `model_name` on `device`, loading it on first use."""
    cache_key = (model_name, str(device))
    if cache_key not in _ABSA_MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
        model.eval()  # inference only
        _ABSA_MODEL_CACHE[cache_key] = (tokenizer, model)
    return _ABSA_MODEL_CACHE[cache_key]


def driver_sentiment(comments, driver_list, model_name="yangheng/deberta-v3-base-absa-v1.1"):
    """Average aspect-based sentiment probabilities per driver over a set of comments.

    For every comment, each entry of `driver_list` that occurs in it as a
    (case-sensitive) substring is scored by passing the comment and the driver
    name to the sequence-classification model as a text pair. The softmax
    probabilities are summed per driver and finally divided by the number of
    comments that mentioned that driver.

    Changes compared to the previous version: the tokenizer and model are
    loaded once and reused across calls, the unused `pipeline` object is no
    longer built, a single string is treated as one comment instead of being
    iterated character by character, and non-string entries are skipped.

    Args:
        comments: Iterable of comment strings (a single string is also accepted).
        driver_list: Driver names to look for; these become the keys of the result.
        model_name: Hugging Face model identifier to load.

    Returns:
        A dict mapping each driver to a dict with the keys `positive`,
        `neutral`, `negative` (mean probabilities, or 0.0 if never mentioned)
        and `count` (number of comments mentioning the driver).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, model = _load_absa_model(model_name, device)

    if isinstance(comments, str):
        comments = [comments]

    results = {driver: {"positive": 0.0, "neutral": 0.0, "negative": 0.0, "count": 0} for driver in driver_list}

    for comment in comments:
        if not isinstance(comment, str):
            # e.g. missing text (NaN) coming from a DataFrame column
            continue

        for aspect in driver_list:
            if aspect not in comment:
                continue

            inputs = tokenizer(comment, aspect, return_tensors="pt", truncation=True, padding=True).to(device)

            with torch.no_grad():
                logits = model(**inputs).logits[0]

            probabilities = softmax(logits.cpu().numpy())

            # Index order negative / neutral / positive is kept from the original code.
            # NOTE(review): verify against the model's `id2label` if `model_name` is changed.
            results[aspect]["positive"] += float(probabilities[2])
            results[aspect]["neutral"] += float(probabilities[1])
            results[aspect]["negative"] += float(probabilities[0])
            results[aspect]["count"] += 1

    # Turn the per-driver sums into means; drivers without mentions keep 0.0.
    for sentiment in results.values():
        mentions = sentiment["count"]
        if mentions > 0:
            for label in ("positive", "neutral", "negative"):
                sentiment[label] /= mentions

    return results


# Hand-written sample post used to smoke-test `driver_sentiment`.
prediction_posts_df = ['Carlos Sainz is loving this upgraded car, good top 3 for the race tomorrow! I disagree with you, Max Verstappen will definitely finish first. I think BOT will finish behind NOR, who will probably finish 7th. That\'s my opinion at least... I predict that the RedBulls with finish 1-2. Nah, the Danish driver from Haas will almost certainly finish in points! Stroll on the podium and Vettel in points. I like cookies!']
driver_list = ['Carlos Sainz', 'Max Verstappen']

results = driver_sentiment(prediction_posts_df, driver_list)

# One line per driver: averaged probabilities, or a note when the driver was never mentioned.
for driver, sentiment in results.items():
    if sentiment["count"] <= 0:
        print(f"{driver}: No mentions found.")
        continue

    print(f"{driver}: [Positive: {sentiment['positive']:.4f}, Neutral: {sentiment['neutral']:.4f}, Negative: {sentiment['negative']:.4f}]")

In [None]:
def final_scores(results):
    """Rank drivers by net sentiment (positive minus negative).

    Args:
        results: Mapping of driver name to a dict with the keys `positive`,
            `negative` and `count`, as produced by `driver_sentiment`.

    Returns:
        A list of `(driver.title(), positive - negative)` tuples for every
        driver whose `count` is greater than zero, sorted by score in
        descending order. Drivers with equal scores keep their input order.
    """
    ranked = [
        (driver.title(), sentiment["positive"] - sentiment["negative"])
        for driver, sentiment in results.items()
        if sentiment["count"] > 0
    ]

    # `sorted` is stable, also with reverse=True, exactly like `list.sort`.
    return sorted(ranked, key=lambda entry: entry[1], reverse=True)

# Rank the drivers from the sentiment results and print one line per driver.
scores = final_scores(results)

ranking_lines = [f"{driver}: {score:.4f}" for driver, score in scores]
print("\n".join(["Drivers ranked by (positive - negative score):", *ranking_lines]))


In [12]:
def prediction(n_event, final_scores, n_events=5, historical_score_contribution=0.4):
    """Predict a finishing order by blending sentiment scores with recent race results.

    Every driver in the target race's results gets a historical score: the
    mean, over the preceding rounds, of a per-race score that maps P1 to +1
    and P20 to -1 linearly. If the driver also has a sentiment score, the two
    are blended as
    `(1 - historical_score_contribution) * sentiment + historical_score_contribution * historical`;
    otherwise the historical score is used on its own.

    Fixes compared to the previous version:
    - reads the `FullName` / `Position` columns that `get_top20` returns
      (it used to read `DriverFullName` / `Pos`, which raised `KeyError`);
    - resolves round numbers to events via `fastf1.get_event` before calling
      `get_top20`, which needs an event object rather than an int;
    - ignores drivers from earlier rounds who are not in the target race
      instead of raising `KeyError`;
    - never looks up rounds below 1 and averages over the rounds actually used.

    Args:
        n_event: Round number of the target race (int), or the event object
            itself (NOTE(review): assumed to expose `RoundNumber`, as FastF1
            schedule rows do - confirm).
        final_scores: Iterable of `(driver name, sentiment score)` pairs, as
            returned by the `final_scores` function.
        n_events: Number of preceding rounds to average over.
        historical_score_contribution: Weight of the historical score in the blend.

    Returns:
        A list of `(driver name, score)` tuples sorted by score, descending.
    """
    if isinstance(n_event, (int, np.integer)):
        round_number = int(n_event)
        target_event = fastf1.get_event(dataset_constants.YEAR, round_number)
    else:
        target_event = n_event
        round_number = int(n_event['RoundNumber'])

    # The target race's results are only used for the list of driver names.
    target_results = get_top20(target_event)
    historical_scores = {name: 0.0 for name in target_results['FullName']}

    # Rounds immediately before the target race; rounds below 1 do not exist.
    # NOTE(review): these may include sprint weekends, which the filtered
    # `schedule` excludes - confirm that is intended.
    previous_rounds = [
        round_number - offset
        for offset in range(1, n_events + 1)
        if round_number - offset >= 1
    ]

    for previous_round in previous_rounds:
        previous_results = get_top20(fastf1.get_event(dataset_constants.YEAR, previous_round))

        for name, position in zip(previous_results['FullName'], previous_results['Position']):
            if name not in historical_scores:
                # Driver raced earlier but is not part of the target race.
                continue
            # Linear map of the finishing position: P1 -> +1, P20 -> -1.
            # int() avoids doing the arithmetic on np.uint8 values.
            historical_scores[name] += 1 - ((int(position) - 1) / 19) * 2

    if previous_rounds:
        for name in historical_scores:
            historical_scores[name] /= len(previous_rounds)

    final_scores_dict = dict(final_scores)

    final_prediction = []
    for driver, historical_score in historical_scores.items():
        if driver in final_scores_dict:
            sentiment_score = final_scores_dict[driver]
            combined_score = (
                (1 - historical_score_contribution) * sentiment_score
                + historical_score_contribution * historical_score
            )
            final_prediction.append((driver, combined_score))
        else:
            final_prediction.append((driver, historical_score))

    final_prediction.sort(key=lambda entry: entry[1], reverse=True)

    return final_prediction

# final_prediction = prediction(16, scores, n_events=5, historical_score_contribution=0.4)
# pos = 0
# for driver, score in final_prediction:
#     pos += 1
#     print(f"{driver} finishes in position:{pos}      {score:.4f}")

# GLiNER

In [None]:
import os
# Keep the Hugging Face cache inside the project's data directory instead of a
# hard-coded Windows path, and respect an HF_HOME that is already set.
# NOTE(review): huggingface_hub reads HF_HOME when it is first imported; the
# sentiment cell imports `transformers` earlier in this notebook, so this
# assignment may come too late on a Restart & Run All - consider moving it to
# the first cell.
os.environ.setdefault("HF_HOME", str(config.DATA_DIR / '.cache' / 'huggingface'))
from gliner import GLiNER

GLINER_MODEL_NAME = 'urchade/gliner_medium-v2.1'

gliner_pickle_path = config.DATA_DIR / '.cache' / 'gliner_model.pkl'
gliner_pickle_path.parent.mkdir(parents=True, exist_ok=True)
use_cache = True

if use_cache and gliner_pickle_path.exists():
    # SECURITY: pickle.load executes arbitrary code from the file. Only load a
    # cache file that this notebook wrote itself; never one obtained elsewhere.
    with open(gliner_pickle_path, 'rb') as file:
        gliner_model = pickle.load(file)
else:
    gliner_model = GLiNER.from_pretrained(GLINER_MODEL_NAME)

    if use_cache:
        # Cache the freshly loaded model so the next run can skip from_pretrained.
        with open(gliner_pickle_path, 'wb') as file:
            pickle.dump(gliner_model, file)

gliner_model.to(DEVICE);

In [15]:
nlp = spacy.load('en_core_web_sm')

# Hand-written sample that is split into sentences when `debug` is switched on.
text = 'Carlos Sainz is loving this upgraded car, good top 3 for the race tomorrow! I disagree with you, verstappening will definitely finish first. I think BOT will finish behind NOR, who will probably finish 7th. That\'s my opinion at least... I predict that the RedBulls with finish 1-2. Nah, the Danish driver from Haas will almost certainly finish in points! Stroll on the podium and Vettel in points. I like cookies!'
debug = False

if not debug:
    # Normal mode: a small slice of the r/formula1 data.
    df = load_f1_df(10)
else:
    # Debug mode: one row per sentence of the sample text.
    doc = nlp(text)
    sentence_texts = tuple(sentence.text for sentence in doc.sents)
    df = pd.DataFrame({'text': sentence_texts})

In [None]:
# Show the input frame before it is filtered by `has_prediction` below.
with display_full_dataframe():
    display(df)

def has_prediction(post_text: str, threshold: float = 0.45) -> bool:
    """Tell whether any sentence of a post contains a 'position' entity.

    The post is split into sentences with the module-level spaCy pipeline
    `nlp`; each sentence is passed to `gliner_model.predict_entities` with the
    labels 'driver' and 'position'. The function returns as soon as one
    sentence yields at least one entity labelled 'position'.

    Args:
        post_text: Text of the post.
        threshold: Forwarded to `predict_entities` as its `threshold` argument.

    Returns:
        True if a 'position' entity was found in some sentence, else False.
    """
    # TODO: for some reason, asking for 'position' alone gives far worse results than asking for 'driver' as well
    entity_labels = ('driver', 'position',)

    # TODO: does GLiNER's performance improve with more context? if yes, refactor to chunking instead of going over each sentence individually
    for sentence in nlp(post_text).sents:
        found = gliner_model.predict_entities(sentence.text, entity_labels, threshold=threshold) # TODO: very low threshold
        positions = tuple(candidate for candidate in found if candidate['label'] == 'position')

        if debug:
            print(positions)
            print(tuple(candidate['text'] for candidate in positions))

        if positions:
            return True

    return False

# Boolean mask: True for rows whose text contains a 'position' entity.
is_prediction_post = df['text'].apply(has_prediction)
predictions_df = df.loc[is_prediction_post]

with display_full_dataframe():
    display(predictions_df)

In [None]:
text = 'Carlos Sainz is loving this upgraded car, good top 3 for the race tomorrow! I disagree with you, verstappening will definitely finish first. I think BOT will finish behind NOR, who will probably finish 7th. That\'s my opinion at least... I predict that the RedBulls with finish 1-2. Nah, the Danish driver from Haas will almost certainly finish in points! Stroll on the podium and Vettel in points. I like cookies!'
debug = False

if debug:
    doc = nlp(text)
    df = pd.DataFrame({'text': tuple(sentence.text for sentence in doc.sents)})
else:
    df = load_f1_df(10)

# with display_full_dataframe():
#     display(hide_index(df.head()))

# Spell-correct the text, then keep only the posts that contain a prediction.
# `assign` builds a new frame instead of writing into the loaded one.
df = df.assign(text=df['text'].apply(preprocessing.correct_spelling_in_text_spacy))
df = df[df['text'].apply(has_prediction)]

# `df` already holds only posts flagged by `has_prediction`, so their texts are
# the prediction comments. The previous code called `has_prediction` on an
# undefined name `post` and passed the resulting bool on as the comment list.
# NOTE(review): the same posts are scored for every race weekend; restrict
# them to the weekend's time window if per-race predictions are intended.
pred_comments = df['text'].tolist()

for _, race_weekend in schedule.iterrows():
    sentiment = driver_sentiment(pred_comments, driver_list)
    score = final_scores(sentiment)
    # Pass the computed scores, not the `final_scores` function itself.
    pred = prediction(race_weekend, score, n_events=5, historical_score_contribution=0.4)
    print(pred)

    # Only the first race weekend is processed for now.
    break
