In [None]:
import time
import json
from pathlib import Path
import pandas as pd
from pandas.io.formats.style import Styler
from collections.abc import Generator, Callable
import typing
from typing import Any, TypeAlias
import numpy as np
from contextlib import contextmanager
from functools import partial, reduce
import re
import datetime as dt
from tqdm import tqdm
import pickle
from IPython.display import (
    display, # type: ignore[reportUnknownVariableType]
    Markdown,
)
import importlib
import spacy

from config.fastf1 import fastf1
import fastf1.events as fastf1_events
from config import config
importlib.reload(config);
from src.data.loader import stream_ndjson, load_submissions_df, load_comments_df
import src.data.preprocessing as preprocessing
importlib.reload(preprocessing);
import src.data.constants as dataset_constants
import src.utils
importlib.reload(src.utils);
from src.utils import (
    temporary_pandas_options,
    display_full_dataframe,
    hide_index,
    compose,
)
from src import utils
utils.set_random_seeds()

import logging
logging.getLogger('fastf1').setLevel(logging.WARNING)

DEVICE = utils.get_device()

In [2]:
def load_f1_df(limit: int | None = None, in_place: bool = True) -> pd.DataFrame:
    ndjson_streamer = partial(stream_ndjson, limit=limit)

    return preprocessing.concatenate_submissions_and_comments(
        submissions_df=load_submissions_df(dataset_constants.RawFile.FORMULA1_SUBMISSIONS, ndjson_streamer),
        comments_df=load_comments_df(dataset_constants.RawFile.FORMULA1_COMMENTS, ndjson_streamer),
        in_place=in_place,
    )

def load_f15_df(limit: int | None = None, in_place: bool = True) -> pd.DataFrame:
    ndjson_streamer = partial(stream_ndjson, limit=limit)

    return preprocessing.concatenate_submissions_and_comments(
        submissions_df=load_submissions_df(dataset_constants.RawFile.FORMULA1POINT5_SUBMISSIONS, ndjson_streamer),
        comments_df=load_comments_df(dataset_constants.RawFile.FORMULA1POINT5_COMMENTS, ndjson_streamer),
        in_place=in_place,
    )

In [3]:
# Small development sample: every raw file is read through stream_ndjson with limit=100.
f1_ndjson_streamer = partial(stream_ndjson, limit=100)
f15_ndjson_streamer = partial(stream_ndjson, limit=100)

# r/formula1: submissions and comments are kept as separate frames for the previews below.
f1_submissions_df = load_submissions_df(dataset_constants.RawFile.FORMULA1_SUBMISSIONS, f1_ndjson_streamer)
f1_comments_df = load_comments_df(dataset_constants.RawFile.FORMULA1_COMMENTS, f1_ndjson_streamer)

# r/formula1point5: same procedure.
f15_submissions_df = load_submissions_df(dataset_constants.RawFile.FORMULA1POINT5_SUBMISSIONS, f15_ndjson_streamer)
f15_comments_df = load_comments_df(dataset_constants.RawFile.FORMULA1POINT5_COMMENTS, f15_ndjson_streamer)

# One combined "posts" frame per subreddit.
# NOTE(review): load_f1_df/load_f15_df pass in_place=True explicitly; here the default of
# concatenate_submissions_and_comments applies — confirm the inputs are not mutated if the
# separate frames are still needed afterwards.
f1_df = preprocessing.concatenate_submissions_and_comments(f1_submissions_df, f1_comments_df)
f15_df = preprocessing.concatenate_submissions_and_comments(f15_submissions_df, f15_comments_df)

In [None]:
# Number of rows shown per frame.
n = 4

# Preview the four raw frames (submissions and comments of both subreddits), each under
# its own markdown heading.
with display_full_dataframe():
    display(Markdown('### r/formula1 submissions:'), f1_submissions_df.head(n))
    display(Markdown('### r/formula1 comments:'), f1_comments_df.head(n))
    display(Markdown('### r/formula1point5 submissions:'), f15_submissions_df.head(n))
    display(Markdown('### r/formula1point5 comments:'), f15_comments_df.head(n))

In [None]:
# Number of rows shown per frame (rebinds the `n` used by the previous cell).
n = 3

# Preview the combined submissions+comments frames of both subreddits.
with display_full_dataframe():
    display(Markdown('### r/formula1 posts:'), f1_df.head(n))
    display(Markdown('### r/formula1point5 posts:'), f15_df.head(n))

# Baseline: Rule-Based Prediction Extraction

In [6]:
# TODO:

# FastF1 historical data

In [None]:
# Full event schedule of the dataset year, as returned by fastf1.
full_schedule = fastf1.get_event_schedule(dataset_constants.YEAR)
# Subset of events used in this notebook: inside the dataset's date range and with the
# 'conventional' weekend format. The cast only informs the type checker.
schedule = typing.cast(
    fastf1_events.EventSchedule,
    full_schedule[
        (full_schedule['EventDate'] >= dataset_constants.START_DATE) &
        (full_schedule['EventDate'] <= dataset_constants.END_DATE) &
        (full_schedule['EventFormat'] == 'conventional') # TODO: Skip sprint weekends for now. Also include sprint weekends later
    ],
)

# Show the last three selected events.
with display_full_dataframe():
    display(schedule.iloc[-3:])

In [None]:
# Restrict the r/formula1 posts to the time window of the last selected event:
# from one day before Session1DateUtc up to and including Session5DateUtc.
posts_df = f1_df
race_weekend = schedule.iloc[-1]
first_post_at = typing.cast(dt.datetime, race_weekend['Session1DateUtc']) - dt.timedelta(days=1)
last_post_at = typing.cast(dt.datetime, race_weekend['Session5DateUtc'])
# NOTE(review): assumes 'created_utc' holds values comparable with these timestamps — confirm.
posts_df = posts_df[
    (posts_df['created_utc'] >= first_post_at) &
    (posts_df['created_utc'] <= last_post_at)
]

def get_top20(race_weekend: fastf1_events.Event) -> pd.DataFrame:
    """Return the race result of an event as a table of driver names and positions.

    Args:
        race_weekend: The event whose 'Race' session is loaded.

    Returns:
        The 'FullName' and 'Position' columns of the session results, with 'Position'
        cast to ``np.uint8``.
    """
    session = race_weekend.get_session('Race')
    # Laps, telemetry, weather and messages are all requested as False.
    session.load(laps=False, telemetry=False, weather=False, messages=False)

    classification = session.results[['FullName', 'Position']]
    return classification.astype({'Position': np.uint8})

# Race results for every selected event, in schedule order (one get_top20 call per event).
top20s = tuple(
    get_top20(typing.cast(fastf1_events.Event, race_weekend))
    for _, race_weekend in schedule.iterrows()
)
# Show the result table of the last selected event.
display(hide_index(top20s[-1]))

# Pre-trained models

# Sentiment score:

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from scipy.special import softmax

_ABSA_MODEL_NAME = "yangheng/deberta-v3-base-absa-v1.1"
# Filled on first use so the checkpoint is loaded once per execution of this cell
# instead of once per driver_sentiment() call.
_ABSA_MODEL_CACHE = {}


def _load_absa_model():
    """Return the ``(tokenizer, model, device)`` triple, loading the checkpoint on first use."""
    if "loaded" not in _ABSA_MODEL_CACHE:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        tokenizer = AutoTokenizer.from_pretrained(_ABSA_MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(_ABSA_MODEL_NAME).to(device)
        # Stored in one assignment so a failed download never leaves a half-filled cache.
        _ABSA_MODEL_CACHE["loaded"] = (tokenizer, model, device)
    return _ABSA_MODEL_CACHE["loaded"]


def driver_sentiment(comments, driver_list, scores):
    """Compute a score-weighted sentiment distribution per driver.

    Every comment that contains a driver name (plain, case-sensitive substring test) is
    classified once per mentioned driver, with the driver name as the aspect. The class
    probabilities are weighted by the comment's score and averaged per driver.

    Args:
        comments: Iterable of comment texts.
        driver_list: Iterable of driver names to look for; it is iterated once per comment.
        scores: Iterable of numeric weights, paired with ``comments`` by position
            (the notebook passes the posts' ``score`` column).

    Returns:
        ``{driver: {"positive": float, "neutral": float, "negative": float, "count": weight}}``
        with one entry per driver. ``count`` is the summed weight; the three sentiment
        values are divided by it only when it is greater than zero.
    """
    results = {
        driver: {"positive": 0.0, "neutral": 0.0, "negative": 0.0, "count": 0}
        for driver in driver_list
    }
    loaded = None

    for comment, score in zip(comments, scores):
        mentioned = [driver for driver in driver_list if driver in comment]
        if not mentioned:
            continue

        # Load lazily: nothing is loaded when no comment mentions a driver.
        if loaded is None:
            loaded = _load_absa_model()
        tokenizer, model, device = loaded

        for aspect in mentioned:
            inputs = tokenizer(comment, aspect, return_tensors="pt", truncation=True, padding=True).to(device)

            with torch.no_grad():
                logits = model(**inputs).logits[0]

            probabilities = softmax(logits.cpu().numpy())

            # Index 0/1/2 is read as negative/neutral/positive.
            # NOTE(review): assumes this matches the checkpoint's label order — confirm
            # against model.config.id2label.
            totals = results[aspect]
            totals["positive"] += probabilities[2] * score
            totals["neutral"] += probabilities[1] * score
            totals["negative"] += probabilities[0] * score
            totals["count"] += score

    # NOTE(review): zero or negative scores contribute zero or negative weight; a driver
    # whose total weight is not positive keeps the un-normalised sums.
    for totals in results.values():
        if totals["count"] > 0:
            totals["positive"] /= totals["count"]
            totals["neutral"] /= totals["count"]
            totals["negative"] /= totals["count"]

    return results


# Hand-written example post and driver list (not used further down in this cell).
prediction_posts_df = ['Carlos Sainz is loving this upgraded car, good top 3 for the race tomorrow! I disagree with you, Max Verstappen will definitely finish first. I think BOT will finish behind NOR, who will probably finish 7th. That\'s my opinion at least... I predict that the RedBulls with finish 1-2. Nah, the Danish driver from Haas will almost certainly finish in points! Stroll on the podium and Vettel in points. I like cookies!']
driver_list = ['Carlos Sainz', 'Max Verstappen']

# Lower-case full names that driver_sentiment() searches for as substrings.
# NOTE(review): the match is case-sensitive, so this presumably relies on the
# spelling-corrected text containing lower-case full names — confirm.
F1_names= {
    'max verstappen',
    'charles leclerc',
    'sergio perez',
    'george russell',
    'carlos sainz',
    'lewis hamilton',
    'lando norris',
    'esteban ocon',
    'fernando alonso',
    'valtteri bottas',
    'daniel ricciardo',
    'sebastian vettel',
    'kevin magnussen',
    'pierre gasly',
    'lance stroll',
    'mick schumacher',
    'yuki tsunoda',
    'zhou guanyu',
    'alexander albon',
    'nicholas latifi',
    'nyck de vries',
    'nico hulkenberg',
    'oscar piastri',
    'liam lawson',
    'logan sargeant'
}

# Load posts with limit=1000, spell-correct the text, and extract texts plus their scores.
posts_df = load_f1_df(1000)
posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)
comments = posts_df["text"].tolist()
scores = posts_df["score"].tolist()


# Score-weighted sentiment per driver, printed raw and then one line per driver.
results = driver_sentiment(comments, F1_names, scores)
print(results)
for driver, sentiment in results.items():
    if sentiment["count"] > 0:
        print(f"{driver}: [Positive: {sentiment['positive']:.4f}, Neutral: {sentiment['neutral']:.4f}, Negative: {sentiment['negative']:.4f}]")
    else:
        print(f"{driver}: No mentions found.")

In [None]:
def final_scores(results):
    """Rank the mentioned drivers by their net sentiment.

    Args:
        results: Mapping ``driver -> {"positive", "neutral", "negative", "count"}`` as
            returned by ``driver_sentiment``.

    Returns:
        A list of ``(driver.title(), positive - negative)`` tuples for every driver whose
        ``count`` is greater than zero, sorted by that difference in descending order
        (ties keep the order of ``results``).
    """
    ranked = sorted(
        (
            (driver.title(), sentiment["positive"] - sentiment["negative"])
            for driver, sentiment in results.items()
            if sentiment["count"] > 0
        ),
        key=lambda entry: entry[1],
        reverse=True,
    )
    return ranked

# Rank the drivers by net sentiment and print the ranking.
# Note: this rebinds `scores`, which the sentiment cell above bound to the posts' scores.
scores = final_scores(results)

print("Drivers ranked by (positive - negative score):")
for driver, score in scores:
    print(f"{driver}: {score:.4f}")


In [None]:
def prediction(n_event, final_scores, n_events=5, historical_score_contribution=0.4):
    """Rank the drivers of one event from sentiment scores and recent race results.

    Reads the notebook globals ``full_schedule`` and ``get_top20``.

    Args:
        n_event: Position (``.iloc``) of the target event in ``full_schedule``; a
            negative value counts from the end.
        final_scores: Iterable of ``(driver_name, sentiment_score)`` pairs. Names are
            matched exactly against the ``FullName`` values of the race results.
        n_events: Maximum number of events directly before the target event that feed
            the historical score.
        historical_score_contribution: Weight of the historical score in the combined
            score; the sentiment score gets ``1 - historical_score_contribution``.

    Returns:
        A DataFrame with one row per driver of the target event, ordered by predicted
        position, with the columns ``driver_name``, ``predicted_position``,
        ``true_position``, ``error``, ``combined_score``, ``sentiment_score`` and
        ``historical_score``. Drivers without a sentiment score are ranked by their
        historical score alone (it is not scaled by the contribution weight).
    """
    columns = [
        "driver_name",
        "predicted_position",
        "true_position",
        "error",
        "combined_score",
        "sentiment_score",
        "historical_score",
    ]

    # Normalise a negative index so "events before the target" is well defined.
    if n_event < 0:
        n_event += len(full_schedule)

    target_results = get_top20(full_schedule.iloc[n_event])
    true_positions = {
        name: int(position)
        for name, position in zip(target_results["FullName"], target_results["Position"])
    }

    # Only events that really precede the target: a negative .iloc index would wrap
    # around to the end of the season.
    previous_indices = [
        n_event - offset
        for offset in range(1, n_events + 1)
        if n_event - offset >= 0
    ]

    historical_scores = {name: 0.0 for name in true_positions}
    for event_index in previous_indices:
        past_results = get_top20(full_schedule.iloc[event_index])
        for name, position in zip(past_results["FullName"], past_results["Position"]):
            # Drivers who did not start the target event are not predicted, so skip them
            # instead of failing on an unknown key.
            if name in historical_scores:
                # Linear map of the finishing position: 1st -> +1, 20th -> -1.
                historical_scores[name] += 1 - ((int(position) - 1) / 19) * 2

    # Average over the events actually used (no division when there are none).
    if previous_indices:
        for name in historical_scores:
            historical_scores[name] /= len(previous_indices)

    sentiment_by_driver = dict(final_scores)

    ranked = []
    for name, historical_score in historical_scores.items():
        if name in sentiment_by_driver:
            sentiment_score = sentiment_by_driver[name]
            combined_score = (
                (1 - historical_score_contribution) * sentiment_score
                + historical_score_contribution * historical_score
            )
        else:
            sentiment_score = None
            combined_score = historical_score
        ranked.append((name, combined_score, sentiment_score, historical_score))

    # Highest combined score first; the sort is stable, so ties keep the result order.
    ranked.sort(key=lambda entry: entry[1], reverse=True)

    rows = []
    for predicted_position, (name, combined_score, sentiment_score, historical_score) in enumerate(ranked, start=1):
        true_position = true_positions.get(name)
        error = abs(predicted_position - true_position) if true_position is not None else None
        rows.append({
            "driver_name": name,
            "predicted_position": predicted_position,
            "true_position": true_position,
            "error": error,
            "combined_score": combined_score,
            "sentiment_score": sentiment_score,
            "historical_score": historical_score,
        })

    # Explicit columns keep the frame usable (e.g. prediction_df["error"]) even when empty.
    return pd.DataFrame(rows, columns=columns)

# Example run for a single event: position 9 in full_schedule.
index = 9
prediction_df = prediction(index, scores, n_events=5, historical_score_contribution=0.4)
display(prediction_df)
# Mean absolute error between predicted and true finishing positions.
mae = prediction_df["error"].mean()
print(mae)
position = 0
# Leftover from an earlier version that iterated (driver, score) tuples:
# for driver, score in final_prediction:
#     position += 1
#     print(f"{driver} finishes in position:{position}      {score:.4f}")

# GLiNER

In [None]:
import os
# Keep an HF_HOME that is already configured in the environment; fall back to the
# original hard-coded location otherwise.
# NOTE(review): "C:\\cache" is a machine-specific Windows path, and transformers is
# already imported by an earlier cell, so setting the variable here may come too late
# to take effect — confirm and move it to the first cell if needed.
os.environ.setdefault("HF_HOME", "C:\\cache")
from gliner import GLiNER

GLINER_MODEL_NAME = 'urchade/gliner_medium-v2.1'

gliner_pickle_path = config.DATA_DIR / '.cache' / 'gliner_model.pkl'
gliner_pickle_path.parent.mkdir(parents=True, exist_ok=True)
# Set to True to reuse a pickled copy of the model instead of calling from_pretrained.
use_cache = False

if use_cache and gliner_pickle_path.exists():
    # SECURITY: pickle.load can execute arbitrary code; only load a cache file that was
    # written by this notebook.
    with open(gliner_pickle_path, 'rb') as file:
        gliner_model = pickle.load(file)
else:
    gliner_model = GLiNER.from_pretrained(GLINER_MODEL_NAME)

    if use_cache:
        # First run with caching enabled: store the freshly loaded model.
        with open(gliner_pickle_path, 'wb') as file:
            pickle.dump(gliner_model, file)

gliner_model.to(DEVICE);

In [None]:
# Sanity check: show which class the loaded GLiNER object has.
print(type(gliner_model))

In [14]:
# spaCy pipeline, used here to split the example text into sentences.
nlp = spacy.load('en_core_web_sm')
# Hand-written example post with several kinds of predictions and one irrelevant sentence.
text = 'Carlos Sainz is loving this upgraded car, good top 3 for the race tomorrow! I disagree with you, verstappening will definitely finish first. I think BOT will finish behind NOR, who will probably finish 7th. That\'s my opinion at least... I predict that the RedBulls with finish 1-2. Nah, the Danish driver from Haas will almost certainly finish in points! Stroll on the podium and Vettel in points. I like cookies!'
# debug=True works on the example sentences; debug=False loads posts with limit=1000.
# The flag is also read by has_prediction() to decide whether to print the entities.
debug = True

if debug:
    doc = nlp(text)
    # One row per sentence of the example text.
    df = pd.DataFrame({'text': tuple(sentence.text for sentence in doc.sents)})
else:
    df = load_f1_df(1000)

In [None]:
# Preview the frame that is about to be filtered.
with display_full_dataframe():
    display(df.head())

def has_prediction(post_text: str, threshold: float = 0.45) -> bool:
    """Tell whether GLiNER finds at least one 'position' entity in ``post_text``.

    Uses the notebook globals ``gliner_model`` and ``debug``.

    Args:
        post_text: Text that is handed to ``gliner_model.predict_entities`` as a whole.
        threshold: Passed to ``predict_entities`` as its ``threshold`` keyword.

    Returns:
        ``True`` when at least one returned entity has the label 'position',
        otherwise ``False``.
    """
    # TODO: does GLiNER's performance improve with more context? if yes, refactor to
    # chunking instead of going over each sentence individually
    # TODO: for some reason, if you include only 'position', the predictions are far
    # worse than with 'driver' included
    labels = ('driver', 'position',)

    with torch.no_grad():
        found = gliner_model.predict_entities(post_text, labels, threshold=threshold)  # TODO: very low threshold

    positions = tuple(item for item in found if item['label'] == 'position')

    if debug:
        print(positions)
        print(tuple(item['text'] for item in positions))

    return len(positions) != 0


# Keep only the rows whose text contains at least one 'position' entity
# (has_prediction runs the GLiNER model once per row).
predictions_df = df[df['text'].apply(has_prediction)]
print(len(predictions_df))
with display_full_dataframe():
    display(predictions_df.head())

In [16]:
# def has_prediction_dask(post_text):
#     return has_prediction(post_text)

In [None]:
# import dask.dataframe as dd

# # Convert the Pandas DataFrame to a Dask DataFrame
# dask_df = dd.from_pandas(load_f1_df(10), npartitions=16)  # Adjust the number of partitions as needed

# # Apply the function in parallel
# dask_df['has_prediction'] = dask_df['text'].map(has_prediction_dask, meta=('text', 'bool'))

# # Compute the result and convert back to a Pandas DataFrame
# result_df = dask_df[dask_df['has_prediction']].compute()

# # Display the filtered DataFrame
# with display_full_dataframe():
#     display(result_df.head())
# True finishing positions (FullName -> Position) of the event at position `index`
# in full_schedule.
historical_data_event = get_top20(full_schedule.iloc[index])
true_positions = {row["FullName"]: row["Position"] for _, row in historical_data_event.iterrows()}
display(true_positions)

In [None]:
# Hand-written example post, only used when debug is True.
text = 'Carlos Sainz is loving this upgraded car, good top 3 for the race tomorrow! I disagree with you, verstappening will definitely finish first. I think BOT will finish behind NOR, who will probably finish 7th. That\'s my opinion at least... I predict that the RedBulls with finish 1-2. Nah, the Danish driver from Haas will almost certainly finish in points! Stroll on the podium and Vettel in points. I like cookies!'
debug = False
import dask.dataframe as dd
import warnings
# Silence the UserWarning whose message matches the pattern below (a "sentence ...
# has been truncated" notice).
warnings.filterwarnings(
    "ignore",
    message="Sentence of length .* has been truncated to .*",
    category=UserWarning
)

# NOTE(review): the debug branch only defines `posts_df`; `all_posts_df`, which the
# per-event loop further down in this cell reads, is only assigned in the else branch.
if debug:
    doc = nlp(text)
    posts_df = pd.DataFrame({'text': tuple(sentence.text for sentence in doc.sents)})
else:
    # Load all posts (no limit) and report the elapsed time in seconds.
    start = time.perf_counter_ns()
    all_posts_df = load_f1_df()
    end = time.perf_counter_ns()
    print((end - start) / 10 ** 9, "load time")

# with display_full_dataframe():
#     display(hide_index(df.head()))

# df['text'] = df['text'].apply(preprocessing.correct_spelling_in_text_spacy)
# df = df[df['text'].apply(has_prediction)]

def display_posts_df(n=3):
    """Display the first ``n`` rows of the global ``posts_df`` via ``hide_index``.

    Args:
        n: Number of leading rows to show.
    """
    with display_full_dataframe():
        first_rows = posts_df.head(n)
        display(hide_index(first_rows))

# Prediction table per event (keyed by the event's schedule label) and the matching MAEs.
predictions_dict = {}
mae_list = []

# NOTE(review): `index` is the row label of `schedule`, while prediction() uses it as a
# position in `full_schedule` (.iloc) — this assumes labels and positions coincide; confirm.
for index, race_weekend in schedule.iterrows():
    print(index, race_weekend)

    # Posts created between Session4DateUtc and Session5DateUtc (both inclusive).
    first_post_at = typing.cast(dt.datetime, race_weekend['Session4DateUtc'])
    last_post_at = typing.cast(dt.datetime, race_weekend['Session5DateUtc'])
    in_window = (
        (all_posts_df['created_utc'] >= first_post_at) &
        (all_posts_df['created_utc'] <= last_post_at)
    )
    # .copy() makes the slice an independent frame, so the column assignment below does
    # not trigger SettingWithCopyWarning and never writes through to all_posts_df.
    posts_df = all_posts_df[in_window].copy()
    print('number of posts for event: ', len(posts_df))

    # spelling correction
    start = time.perf_counter_ns()
    posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)
    end = time.perf_counter_ns()
    print((end - start) / 10 ** 9, "spell time")

    # keep only the posts in which GLiNER finds a 'position' entity
    start = time.perf_counter_ns()
    posts_df = posts_df[posts_df['text'].apply(has_prediction)]
    print('number of posts with prediction: ', len(posts_df))
    end = time.perf_counter_ns()
    print((end - start) / 10 ** 9, "filter time")

    # sentiment score
    start = time.perf_counter_ns()
    comments = posts_df["text"].tolist()
    upvotes = posts_df["score"].tolist()
    sentiment = driver_sentiment(comments, F1_names, upvotes)
    scores = final_scores(sentiment)
    end = time.perf_counter_ns()
    print((end - start) / 10 ** 9, "sentiment time")

    # final prediction
    start = time.perf_counter_ns()
    prediction_df = prediction(index, scores, n_events=5, historical_score_contribution=0.4)
    mae = prediction_df["error"].mean()
    display(prediction_df)
    print("the MAE for predicted vs true position is: ", mae)
    end = time.perf_counter_ns()
    print((end - start) / 10 ** 9, "final pred time")

    predictions_dict[index] = prediction_df
    mae_list.append(mae)
    print("\n" * 3)
    

# Write one sheet per event to <DATA_DIR>/final/predictions.xlsx.
# NOTE(review): the original expression `config.DATA_DIR /final/"predictions.xlsx"` used
# an undefined name `final`; it is read here as the sub-directory "final" — confirm.
predictions_path = config.DATA_DIR / "final" / "predictions.xlsx"
predictions_path.parent.mkdir(parents=True, exist_ok=True)

with pd.ExcelWriter(predictions_path) as writer:
    # A dedicated loop name keeps the notebook-level `df` untouched.
    for key, event_prediction_df in predictions_dict.items():
        event_prediction_df.to_excel(writer, sheet_name=f"Iteration_{key}", index=False)

# To read everything back (returns a dictionary of DataFrames keyed by sheet name):
# loaded_predictions = pd.read_excel(config.DATA_DIR / "final" / "predictions.xlsx", sheet_name=None)



In [None]:
# Sentiment ranking left in `scores` by the last iteration of the per-event loop.
print("Drivers ranked by (positive - negative score):")
for driver, score in scores:
    print(f"{driver}: {score:.4f}")

In [None]:
# Dump all per-event prediction tables (keyed by schedule label) as plain text.
print(predictions_dict)

In [None]:
# sentiment score
start = time.perf_counter_ns()
comments = posts_df["text"].tolist()
upvotes = posts_df["score"].tolist()
sentiment = driver_sentiment(comments, F1_names, upvotes)
scores = final_scores(sentiment)
end = time.perf_counter_ns()
print((end - start) / 10 ** 9, "sentiment time")
# print("Drivers ranked by (positive - negative score):")
# for driver, score in scores:
#     print(f"{driver}: {score:.4f}")

# final prediction
start = time.perf_counter_ns()
pred = prediction(index, scores, n_events=5, historical_score_contribution=0.75)
end = time.perf_counter_ns()
print((end - start) / 10 ** 9, "final pred time")


predictions_dict[index] = pred
position = 0
for driver, score in pred:
    position += 1
    print(f"{driver} finishes in position:{position}      {score:.4f}")

In [None]:
# Export spell-corrected post texts (loaded with limit=2000) for manual labelling.
validation_posts = load_f1_df(2000)['text']
display(validation_posts.head())
validation_posts = validation_posts.apply(preprocessing.correct_spelling_in_text_spacy)
validation_posts.to_csv(config.DATA_DIR /'file1.csv')
# Add a new column with the label for each driver, in the same form as the output of
# driver_sentiment(): one entry per name in F1_names, for example
#   {'nyck de vries': {'positive': 0.0, 'neutral': 0.0, 'negative': 0.0, 'count': 0},
#    'yuki tsunoda': {'positive': 0.0057728992807760575, 'neutral': 0.9906835493288542, 'negative': 0.0035435524033872704, 'count': 57},
#    ...,
#    'pierre gasly': {'positive': 0.15713243364682997, 'neutral': 0.7395840842833875, 'negative': 0.10328345954387029, 'count': 17877}}
# (The pasted example used to spill onto an uncommented line, which made the cell a
# syntax error.)
# load

# NOTE(review): `ChatGPT_posts` is not defined anywhere in the visible notebook; it has to
# be loaded (see the "# load" marker above) before this loop can run.
for i, gpt_df in enumerate(ChatGPT_posts):
    comment = gpt_df["text"].iloc[i]
    score = gpt_df["score"].iloc[i]

    # driver_sentiment() zips over its comments and scores arguments, so a single post
    # must be wrapped in one-element lists (a bare string would be iterated by character).
    results = driver_sentiment([comment], F1_names, [score])
    # compare chatgpt column with results
    # mae = abs()

    