In [2]:
import time
import json
from pathlib import Path
import pandas as pd
from pandas.io.formats.style import Styler
from collections.abc import Generator, Callable
import typing
from typing import Any, TypeAlias
import numpy as np
from contextlib import contextmanager
from functools import partial, reduce
import re
import datetime as dt
from tqdm import tqdm
import pickle
from IPython.display import (
    display, # type: ignore[reportUnknownVariableType]
    Markdown,
)
import importlib
import spacy

from config.fastf1 import fastf1
import fastf1.events as fastf1_events
from config import config
importlib.reload(config);
from src.data.loader import stream_ndjson, load_submissions_df, load_comments_df
import src.data.preprocessing as preprocessing
importlib.reload(preprocessing);
import src.data.constants as dataset_constants
import src.utils
importlib.reload(src.utils);
from src.utils import (
    temporary_pandas_options,
    display_full_dataframe,
    hide_index,
    compose,
)
from src import utils
# Seed the random number generators through the project helper
# (presumably covers every library used below — confirm in src.utils).
utils.set_random_seeds()

import logging
# Only show warnings and errors from the fastf1 logger.
logging.getLogger('fastf1').setLevel(logging.WARNING)

# Compute device chosen by the project helper; the GLiNER model is moved onto it later.
DEVICE = utils.get_device()

PyTorch version: 2.5.1+cu124
CUDA available: True
CUDA version: 12.4
Selected GPU: NVIDIA GeForce GTX 1080 Ti (device_id=0)


In [3]:
def _load_subreddit_df(
    submissions_file,
    comments_file,
    limit: int | None,
    in_place: bool,
) -> pd.DataFrame:
    """Load one subreddit's submissions and comments and merge them into a single frame.

    Args:
        submissions_file: ``dataset_constants.RawFile`` member for the submissions dump.
        comments_file: ``dataset_constants.RawFile`` member for the comments dump.
        limit: Forwarded to ``stream_ndjson`` as ``limit`` (``None`` is passed through unchanged).
        in_place: Forwarded to ``preprocessing.concatenate_submissions_and_comments``.

    Returns:
        The frame produced by ``preprocessing.concatenate_submissions_and_comments``.
    """
    # The same streamer (with the same limit) is used for both raw files.
    ndjson_streamer = partial(stream_ndjson, limit=limit)

    return preprocessing.concatenate_submissions_and_comments(
        submissions_df=load_submissions_df(submissions_file, ndjson_streamer),
        comments_df=load_comments_df(comments_file, ndjson_streamer),
        in_place=in_place,
    )


def load_f1_df(limit: int | None = None, in_place: bool = True) -> pd.DataFrame:
    """Load r/formula1 submissions and comments as one combined posts frame."""
    return _load_subreddit_df(
        dataset_constants.RawFile.FORMULA1_SUBMISSIONS,
        dataset_constants.RawFile.FORMULA1_COMMENTS,
        limit=limit,
        in_place=in_place,
    )


def load_f15_df(limit: int | None = None, in_place: bool = True) -> pd.DataFrame:
    """Load r/formula1point5 submissions and comments as one combined posts frame."""
    return _load_subreddit_df(
        dataset_constants.RawFile.FORMULA1POINT5_SUBMISSIONS,
        dataset_constants.RawFile.FORMULA1POINT5_COMMENTS,
        limit=limit,
        in_place=in_place,
    )

In [4]:
# Quick-look sample: every raw file is streamed with limit=100.
f1_ndjson_streamer = partial(stream_ndjson, limit=100)
f15_ndjson_streamer = partial(stream_ndjson, limit=100)

# r/formula1: submissions first, then comments.
f1_submissions_df, f1_comments_df = (
    load_submissions_df(dataset_constants.RawFile.FORMULA1_SUBMISSIONS, f1_ndjson_streamer),
    load_comments_df(dataset_constants.RawFile.FORMULA1_COMMENTS, f1_ndjson_streamer),
)

# r/formula1point5: submissions first, then comments.
f15_submissions_df, f15_comments_df = (
    load_submissions_df(dataset_constants.RawFile.FORMULA1POINT5_SUBMISSIONS, f15_ndjson_streamer),
    load_comments_df(dataset_constants.RawFile.FORMULA1POINT5_COMMENTS, f15_ndjson_streamer),
)

# One combined posts frame per subreddit.
f1_df, f15_df = (
    preprocessing.concatenate_submissions_and_comments(f1_submissions_df, f1_comments_df),
    preprocessing.concatenate_submissions_and_comments(f15_submissions_df, f15_comments_df),
)

In [5]:
n = 4

# Show the first n rows of each raw frame under its own markdown heading.
with display_full_dataframe():
    for _heading, _frame in (
        ('### r/formula1 submissions:', f1_submissions_df),
        ('### r/formula1 comments:', f1_comments_df),
        ('### r/formula1point5 submissions:', f15_submissions_df),
        ('### r/formula1point5 comments:', f15_comments_df),
    ):
        display(Markdown(_heading), _frame.head(n))

### r/formula1 submissions:

Unnamed: 0,gilded,score,author,title,created_utc,id,selftext
0,0,1,[deleted],[Discussion] Could professional ESports drivers drive a real F1 car? How realistic are the sims?,2022-06-01 12:00:41,v2fbpg,[removed]
1,0,2,Doomaster14,Questions concerning Alonso's future,2022-06-01 12:07:50,v2fh6w,[removed]
2,0,1393,motorace_addict,Verstappen now has as many poles as Leclerc - but six times as many wins | 2022 Monaco Grand Prix stats and facts,2022-06-01 12:15:14,v2fmeh,
3,0,161,MrTuxedo1,Perez wins as Red Bull delivers race strategy blow to Ferrari - Mika Häkkinen’s thoughts on the Monaco Grand Prix,2022-06-01 12:23:16,v2frea,


### r/formula1 comments:

Unnamed: 0,gilded,score,body,author,created_utc,id
0,0,1,top part of the wing got shaken off in the tunnel.,CowsWantToKillMe,2022-06-01 00:00:57,iaq4tev
1,0,0,That's been the rumour with Mercedes lately cuz in previous seasons Bottas hasn't been the luckiest.,doc_55lk,2022-06-01 00:01:15,iaq4urr
2,0,3,"Ah well, it's looking great already!",Organic-Measurement2,2022-06-01 00:01:41,iaq4wpz
3,0,10,And Ferrari would get them all wrong.,not_right,2022-06-01 00:01:46,iaq4x1h


### r/formula1point5 submissions:

Unnamed: 0,gilded,score,author,title,created_utc,id,selftext
0,0,1,orfeomclaren,Formula 1 - Hakkinen vs Schumacher - Spa-Francorchamps 2000,2022-06-07 09:21:41,v6qyud,
1,0,1,orfeomclaren,Formula 1 2003 - Rd 2 - Malaysian Grand Prix [Highlights] - Kimi Raikkonen Maiden Win,2022-06-07 13:26:25,v6viae,
2,0,1,orfeomclaren,Formula 1 2003 - Rd 9 - European Grand Prix (Nurburgring) [Highlights],2022-06-09 08:12:22,v8bwj6,
3,0,1,ms_creativity,Red Bull drivers free to fight each other,2022-06-09 11:48:11,v8f1dk,


### r/formula1point5 comments:

Unnamed: 0,gilded,score,body,author,created_utc,id
0,0,3,What is your team name please?,debrek,2022-06-01 03:50:49,iaqwofj
1,0,2,"It's lazily named team F1.5 and my name there is the same as my username here (Ignis Vizsla), I'm 34th on the leaderboard there for reference",IgnisVizsla,2022-06-01 05:54:28,iar7xgu
2,0,3,I had removed you as I thought you were inactive since you had a number of teams with an invalid team. I re-added you to the list.,debrek,2022-06-01 06:20:29,iar9z0m
3,0,3,"Yeah that's my fault, I forgot to update my team after the rules changed as I always remembered only after quali and that was too late, I finally changed before Monaco though",IgnisVizsla,2022-06-01 06:49:13,iarc3x7


In [6]:
n = 3

# Show the first n rows of each combined (submissions + comments) frame.
with display_full_dataframe():
    for _heading, _frame in (
        ('### r/formula1 posts:', f1_df),
        ('### r/formula1point5 posts:', f15_df),
    ):
        display(Markdown(_heading), _frame.head(n))

### r/formula1 posts:

Unnamed: 0,gilded,score,author,created_utc,id,text
0,0,1,[deleted],2022-06-01 12:00:41,v2fbpg,[Discussion] Could professional ESports drivers drive a real F1 car? How realistic are the sims? [removed]
1,0,2,Doomaster14,2022-06-01 12:07:50,v2fh6w,Questions concerning Alonso's future. [removed]
2,0,1393,motorace_addict,2022-06-01 12:15:14,v2fmeh,Verstappen now has as many poles as Leclerc - but six times as many wins | 2022 Monaco Grand Prix stats and facts.


### r/formula1point5 posts:

Unnamed: 0,gilded,score,author,created_utc,id,text
0,0,1,orfeomclaren,2022-06-07 09:21:41,v6qyud,Formula 1 - Hakkinen vs Schumacher - Spa-Francorchamps 2000.
1,0,1,orfeomclaren,2022-06-07 13:26:25,v6viae,Formula 1 2003 - Rd 2 - Malaysian Grand Prix [Highlights] - Kimi Raikkonen Maiden Win.
2,0,1,orfeomclaren,2022-06-09 08:12:22,v8bwj6,Formula 1 2003 - Rd 9 - European Grand Prix (Nurburgring) [Highlights]


# Baseline: Rule-Based Prediction Extraction

# FastF1 historical data

In [7]:
# Full season calendar, then the subset this analysis covers.
full_schedule = fastf1.get_event_schedule(dataset_constants.YEAR)
schedule = typing.cast(
    fastf1_events.EventSchedule,
    full_schedule.loc[
        # Both date bounds are inclusive.
        full_schedule['EventDate'].between(dataset_constants.START_DATE, dataset_constants.END_DATE)
        # TODO: Skip sprint weekends for now. Also include sprint weekends later
        & full_schedule['EventFormat'].eq('conventional')
    ],
)

with display_full_dataframe():
    display(schedule.tail(3))

Unnamed: 0,RoundNumber,Country,Location,OfficialEventName,EventDate,EventName,EventFormat,Session1,Session1Date,Session1DateUtc,Session2,Session2Date,Session2DateUtc,Session3,Session3Date,Session3DateUtc,Session4,Session4Date,Session4DateUtc,Session5,Session5Date,Session5DateUtc,F1ApiSupport
20,19,United States,Austin,FORMULA 1 ARAMCO UNITED STATES GRAND PRIX 2022,2022-10-23,United States Grand Prix,conventional,Practice 1,2022-10-21 14:00:00-05:00,2022-10-21 19:00:00,Practice 2,2022-10-21 17:00:00-05:00,2022-10-21 22:00:00,Practice 3,2022-10-22 14:00:00-05:00,2022-10-22 19:00:00,Qualifying,2022-10-22 17:00:00-05:00,2022-10-22 22:00:00,Race,2022-10-23 14:00:00-05:00,2022-10-23 19:00:00,True
21,20,Mexico,Mexico City,FORMULA 1 HEINEKEN GRAN PREMIO DE LA CIUDAD DE MÉXICO 2022,2022-10-30,Mexico City Grand Prix,conventional,Practice 1,2022-10-28 13:00:00-06:00,2022-10-28 19:00:00,Practice 2,2022-10-28 16:00:00-06:00,2022-10-28 22:00:00,Practice 3,2022-10-29 12:00:00-06:00,2022-10-29 18:00:00,Qualifying,2022-10-29 15:00:00-06:00,2022-10-29 21:00:00,Race,2022-10-30 14:00:00-06:00,2022-10-30 20:00:00,True
23,22,Abu Dhabi,Yas Island,FORMULA 1 ETIHAD AIRWAYS ABU DHABI GRAND PRIX 2022,2022-11-20,Abu Dhabi Grand Prix,conventional,Practice 1,2022-11-18 14:00:00+04:00,2022-11-18 10:00:00,Practice 2,2022-11-18 17:00:00+04:00,2022-11-18 13:00:00,Practice 3,2022-11-19 14:30:00+04:00,2022-11-19 10:30:00,Qualifying,2022-11-19 18:00:00+04:00,2022-11-19 14:00:00,Race,2022-11-20 17:00:00+04:00,2022-11-20 13:00:00,True


In [8]:
# Posts written between one day before the first session and the fifth session
# of the last event in `schedule` (both bounds inclusive).
race_weekend = schedule.iloc[-1]
first_post_at = typing.cast(dt.datetime, race_weekend['Session1DateUtc']) - dt.timedelta(days=1)
last_post_at = typing.cast(dt.datetime, race_weekend['Session5DateUtc'])
posts_df = f1_df.loc[f1_df['created_utc'].between(first_post_at, last_post_at)]

# Results already fetched in this kernel, keyed by (event date, round number).
_TOP20_CACHE: dict[tuple[str, int], pd.DataFrame] = {}


def get_top20(race_weekend: fastf1_events.Event) -> pd.DataFrame:
    """Return the race classification (driver full name and finishing position) of an event.

    The race session is loaded at most once per event and kernel; repeated calls
    for the same event are served from an in-memory cache instead of hitting the
    fastf1 backends again.

    Args:
        race_weekend: The fastf1 event whose 'Race' session should be read.

    Returns:
        A new DataFrame with the columns ``FullName`` and ``Position`` (``uint8``).
        Each call returns its own copy, so callers may modify it freely.
    """
    cache_key = (str(race_weekend['EventDate']), int(race_weekend['RoundNumber']))

    top20 = _TOP20_CACHE.get(cache_key)
    if top20 is None:
        race_session = race_weekend.get_session('Race')
        # Only the results table is needed, so skip the heavy per-lap data.
        race_session.load(laps=False, telemetry=False, weather=False, messages=False)
        top20 = race_session.results[['FullName', 'Position']].astype({'Position': np.uint8})
        _TOP20_CACHE[cache_key] = top20

    return top20.copy()

# Race classification of every event in `schedule`, in schedule order.
top20s = tuple(
    get_top20(typing.cast(fastf1_events.Event, scheduled_event))
    for _row_label, scheduled_event in schedule.iterrows()
)

# Show the classification of the last scheduled event.
last_top20 = top20s[-1]
display(hide_index(last_top20))

FullName,Position
Max Verstappen,1
Charles Leclerc,2
Sergio Perez,3
Carlos Sainz,4
George Russell,5
Lando Norris,6
Esteban Ocon,7
Lance Stroll,8
Daniel Ricciardo,9
Sebastian Vettel,10


# Pre-trained models

# Sentiment score:

In [9]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from scipy.special import softmax

_ABSA_MODEL_NAME = "yangheng/deberta-v3-base-absa-v1.1"
# (tokenizer, model) pairs already loaded in this kernel, keyed by device string.
_ABSA_MODEL_CACHE = {}


def _load_absa_model(device):
    """Return the ABSA tokenizer and model for ``device``, loading them on first use only."""
    cache_key = str(device)
    if cache_key not in _ABSA_MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(_ABSA_MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(_ABSA_MODEL_NAME)
        model.to(device)
        model.eval()
        _ABSA_MODEL_CACHE[cache_key] = (tokenizer, model)
    return _ABSA_MODEL_CACHE[cache_key]


def driver_sentiment(comments, driver_list, scores, max_length=512):
    """Aggregate score-weighted aspect sentiment per driver over a collection of comments.

    For every comment, each driver whose name occurs in the comment text is
    classified as an aspect of that comment. The class probabilities are weighted
    by the comment's score and accumulated per driver.

    Args:
        comments: Iterable of comment texts.
        driver_list: Iterable of driver names. Matching is a case-sensitive
            substring test, so the names must use the same casing as the text.
        scores: Iterable of per-comment weights, aligned with ``comments``.
        max_length: Maximum number of tokens per model input; longer comments
            are truncated (the driver name itself is never truncated).

    Returns:
        ``{driver: {"positive", "neutral", "negative", "count"}}``. ``count`` is
        the sum of the weights of the matching comments (not the number of
        mentions); the three sentiment values are divided by it when it is
        positive and are left as raw weighted sums otherwise.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, model = _load_absa_model(device)

    results = {driver: {"positive": 0.0, "neutral": 0.0, "negative": 0.0, "count": 0} for driver in driver_list}

    for comment, score in zip(comments, scores):
        # Missing text (e.g. NaN) cannot mention anybody.
        if not isinstance(comment, str):
            continue

        found_drivers = [driver for driver in driver_list if driver in comment]

        for aspect in found_drivers:
            # Without an explicit max_length the tokenizer does not truncate at all.
            inputs = tokenizer(
                comment,
                aspect,
                return_tensors="pt",
                truncation="only_first",
                max_length=max_length,
            ).to(device)

            with torch.no_grad():
                outputs = model(**inputs)

            probabilities = softmax(outputs.logits[0].cpu().numpy())

            # Class order assumed to be negative / neutral / positive —
            # TODO confirm against model.config.id2label.
            aspect_totals = results[aspect]
            aspect_totals["positive"] += probabilities[2] * score
            aspect_totals["neutral"] += probabilities[1] * score
            aspect_totals["negative"] += probabilities[0] * score
            aspect_totals["count"] += score

    # Turn the weighted sums into weighted averages.
    for sentiment in results.values():
        total_weight = sentiment["count"]
        if total_weight > 0:
            sentiment["positive"] /= total_weight
            sentiment["neutral"] /= total_weight
            sentiment["negative"] /= total_weight

    return results


# Hand-written example inputs.
prediction_posts_df = ['Carlos Sainz is loving this upgraded car, good top 3 for the race tomorrow! I disagree with you, Max Verstappen will definitely finish first. I think BOT will finish behind NOR, who will probably finish 7th. That\'s my opinion at least... I predict that the RedBulls with finish 1-2. Nah, the Danish driver from Haas will almost certainly finish in points! Stroll on the podium and Vettel in points. I like cookies!']
driver_list = ['Carlos Sainz', 'Max Verstappen']

# Lower-case full names of the drivers to look for in the post text.
F1_names= {
    'max verstappen', 'charles leclerc', 'sergio perez', 'george russell', 'carlos sainz',
    'lewis hamilton', 'lando norris', 'esteban ocon', 'fernando alonso', 'valtteri bottas',
    'daniel ricciardo', 'sebastian vettel', 'kevin magnussen', 'pierre gasly', 'lance stroll',
    'mick schumacher', 'yuki tsunoda', 'zhou guanyu', 'alexander albon', 'nicholas latifi',
    'nyck de vries', 'nico hulkenberg', 'oscar piastri', 'liam lawson', 'logan sargeant',
}

# Load a sample of r/formula1 posts and run the spelling-correction step on the text.
posts_df = load_f1_df(1000)
posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)
comments = posts_df["text"].tolist()
scores = posts_df["score"].tolist()

# Score-weighted sentiment per driver.
results = driver_sentiment(comments, F1_names, scores)
print(results)
for driver, sentiment in results.items():
    if not (sentiment["count"] > 0):
        print(f"{driver}: No mentions found.")
        continue
    print(f"{driver}: [Positive: {sentiment['positive']:.4f}, Neutral: {sentiment['neutral']:.4f}, Negative: {sentiment['negative']:.4f}]")

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'daniel ricciardo': {'positive': 0.2476961301436094, 'neutral': 0.5786894359994762, 'negative': 0.17361445376296974, 'count': 9051}, 'lance stroll': {'positive': 0.015630004871924227, 'neutral': 0.5473516535350866, 'negative': 0.4370183113315301, 'count': 15398}, 'nyck de vries': {'positive': 0.7035142779350281, 'neutral': 0.2487473487854004, 'negative': 0.047738347202539444, 'count': 251}, 'zhou guanyu': {'positive': 0.011158349475872614, 'neutral': 0.9709595218149216, 'negative': 0.01788214706901745, 'count': 5406}, 'oscar piastri': {'positive': 0.37583865025045726, 'neutral': 0.4010116077333988, 'negative': 0.22314974779081076, 'count': 399}, 'george russell': {'positive': 0.039409447393855934, 'neutral': 0.6390031803903622, 'negative': 0.3215873683707211, 'count': 4580}, 'fernando alonso': {'positive': 0.547514488389884, 'neutral': 0.3375483624678872, 'negative': 0.11493712610482584, 'count': 10307}, 'charles leclerc': {'positive': 0.0505117512855543, 'neutral': 0.804646265194513,

In [24]:
def final_scores(results):
    """Rank drivers by net sentiment (positive minus negative).

    Args:
        results: Mapping ``driver -> {"positive", "negative", "count", ...}``.

    Returns:
        List of ``(title-cased driver name, positive - negative)`` tuples, best
        first. Drivers whose ``count`` is not positive are left out; ties keep
        the order in which the drivers appear in ``results``.
    """
    net_sentiment = [
        (name.title(), stats["positive"] - stats["negative"])
        for name, stats in results.items()
        if stats["count"] > 0
    ]

    # Highest net sentiment first.
    return sorted(net_sentiment, key=lambda pair: pair[1], reverse=True)

# Rank the drivers by net sentiment and print the leaderboard.
# NOTE: this rebinds the notebook-level name `scores`, which the sentiment cell
# above used for the list of post scores.
scores = final_scores(results)

print("Drivers ranked by (positive - negative score):")
for driver, score in scores:
    # Four decimals per driver, best net sentiment first.
    print(f"{driver}: {score:.4f}")


Drivers ranked by (positive - negative score):
Max Verstappen: -0.0032


In [72]:
def prediction(n_event, final_scores, n_events=5, historical_score_contribution=0.4):
    """Predict the finishing order of one event from sentiment and recent results.

    Each driver gets a historical score: the mean over the preceding events of
    ``1 - 2 * (position - 1) / 19`` (1 for a win, -1 for 20th place). That score
    is blended with the driver's sentiment score, and the drivers are ranked by
    the blended value.

    Args:
        n_event: Positional (``iloc``) index of the event in the notebook-level
            ``full_schedule``; negative values count from the end.
        final_scores: Iterable of ``(driver_name, sentiment_score)`` pairs as
            produced by the ``final_scores`` function. Names are matched exactly
            against the ``FullName`` values returned by ``get_top20``.
        n_events: Number of preceding schedule entries to use for the
            historical score. Entries before the start of the schedule and
            testing events (round number 0) are skipped.
        historical_score_contribution: Weight of the historical score in the
            blend; the sentiment score is weighted with one minus this value.

    Returns:
        DataFrame ordered by predicted position with the columns
        ``driver_name``, ``predicted_position``, ``true_position``, ``error``
        (absolute position difference), ``combined_score``, ``sentiment_score``
        and ``historical_score``. ``true_position`` and ``error`` are ``None``
        for drivers who only appear in the preceding events.
    """
    # Actual result of the event: provides the driver list and the ground truth.
    historical_data_event = get_top20(full_schedule.iloc[n_event])
    true_positions = dict(zip(
        # .tolist() yields plain Python ints, so position arithmetic cannot wrap around in uint8.
        historical_data_event["FullName"].tolist(),
        historical_data_event["Position"].tolist(),
    ))
    historical_scores = {driver: 0 for driver in true_positions}

    # Collect the preceding events without ever indexing past the start of the
    # schedule: a negative position would silently wrap to the end of the season.
    event_position = n_event if n_event >= 0 else len(full_schedule) + n_event
    previous_events = []
    for offset in range(1, n_events + 1):
        previous_position = event_position - offset
        if previous_position < 0:
            break
        previous_event = full_schedule.iloc[previous_position]
        if previous_event["RoundNumber"] == 0:
            continue  # testing event: there is no race result to learn from
        previous_events.append(previous_event)

    for previous_event in previous_events:
        past_results = get_top20(previous_event)
        for driver, position in zip(past_results["FullName"].tolist(), past_results["Position"].tolist()):
            # Drivers who are not in the predicted event's result start at 0 as well.
            historical_scores[driver] = historical_scores.get(driver, 0) + (1 - ((position - 1) / 19) * 2)

    # Average over the events actually used (equals n_events whenever enough history exists).
    events_used = len(previous_events)
    if events_used:
        historical_scores = {driver: total / events_used for driver, total in historical_scores.items()}

    sentiment_by_driver = dict(final_scores)

    ranked = []
    for driver, historical_score in historical_scores.items():
        if driver in sentiment_by_driver:
            sentiment_score = sentiment_by_driver[driver]
            combined_score = (
                (1 - historical_score_contribution) * sentiment_score
                + historical_score_contribution * historical_score
            )
        else:
            # NOTE(review): without sentiment the historical score is used at full
            # weight, so these drivers are on a different scale — confirm intended.
            sentiment_score = None
            combined_score = historical_score
        ranked.append((driver, combined_score, sentiment_score, historical_score))

    # Best combined score first; ties keep insertion order.
    ranked.sort(key=lambda entry: entry[1], reverse=True)

    rows = []
    for predicted_position, (driver, combined_score, sentiment_score, historical_score) in enumerate(ranked, start=1):
        true_position = true_positions.get(driver)
        rows.append({
            "driver_name": driver,
            "predicted_position": predicted_position,
            "true_position": true_position,
            "error": abs(predicted_position - true_position) if true_position is not None else None,
            "combined_score": combined_score,
            "sentiment_score": sentiment_score,
            "historical_score": historical_score,
        })

    return pd.DataFrame(rows)

# Example run: predict the event at position 9 of `full_schedule`.
index = 9
prediction_df = prediction(index, scores, n_events=5, historical_score_contribution=0.4)
display(prediction_df)
# Mean absolute error between predicted and true finishing positions.
mae = prediction_df["error"].mean()
print(mae)
position = 0
# Leftover debugging loop; `final_prediction` is local to prediction() and is
# not available at notebook level.
# for driver, score in final_prediction:
#     position += 1
#     print(f"{driver} finishes in position:{position}      {score:.4f}")

Unnamed: 0,driver_name,predicted_position,true_position,error,combined_score,sentiment_score,historical_score
0,Charles Leclerc,1,19,18,0.359033,0.324704,0.410526
1,Sebastian Vettel,2,6,4,0.288644,0.656511,-0.263158
2,Carlos Sainz,3,20,17,0.210026,0.300921,0.073684
3,Max Verstappen,4,1,3,0.139215,-0.167976,0.6
4,Sergio Perez,5,2,3,0.115176,-0.390496,0.873684
5,George Russell,6,3,3,0.083138,-0.317576,0.684211
6,Valtteri Bottas,7,11,4,0.032897,-0.190786,0.368421
7,Lewis Hamilton,8,4,4,-0.002002,-0.234915,0.347368
8,Esteban Ocon,9,10,1,-0.058379,-0.160456,0.094737
9,Lando Norris,10,9,1,-0.059756,-0.260997,0.242105


5.0


# GLiNER

In [59]:
import os
# Default Hugging Face cache location; an HF_HOME already set in the environment wins.
os.environ.setdefault("HF_HOME", "C:\\cache")
from gliner import GLiNER

gliner_pickle_path = config.DATA_DIR / '.cache' / 'gliner_model.pkl'
gliner_pickle_path.parent.mkdir(parents=True, exist_ok=True)
# Set to True to reuse a pickled copy of the model between kernel restarts.
use_cache = False

if use_cache and gliner_pickle_path.exists():
    # SECURITY: pickle.load executes arbitrary code; only load a cache file
    # that this notebook wrote itself.
    with open(gliner_pickle_path, 'rb') as file:
        gliner_model = pickle.load(file)
else:
    gliner_model = GLiNER.from_pretrained('urchade/gliner_medium-v2.1')

    if use_cache:
        # First run with caching enabled: store the freshly loaded model.
        with open(gliner_pickle_path, 'wb') as file:
            pickle.dump(gliner_model, file)

gliner_model.to(DEVICE);

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]



In [60]:
# Sanity check: show the class of the loaded model object.
print(type(gliner_model))

<class 'gliner.model.GLiNER'>


In [61]:
# spaCy pipeline used for sentence splitting.
nlp = spacy.load('en_core_web_sm')
text = 'Carlos Sainz is loving this upgraded car, good top 3 for the race tomorrow! I disagree with you, verstappening will definitely finish first. I think BOT will finish behind NOR, who will probably finish 7th. That\'s my opinion at least... I predict that the RedBulls with finish 1-2. Nah, the Danish driver from Haas will almost certainly finish in points! Stroll on the podium and Vettel in points. I like cookies!'
debug = True

if not debug:
    # Real data: a sample of r/formula1 posts.
    df = load_f1_df(1000)
else:
    # Debug data: one row per sentence of the hand-written example.
    doc = nlp(text)
    df = pd.DataFrame({'text': tuple(span.text for span in doc.sents)})

In [62]:
# Preview the frame that is about to be filtered for predictions.
with display_full_dataframe():
    display(df.head())

def has_prediction(post_text: str, threshold: float = 0.45) -> bool:
    """Tell whether a post mentions a finishing position according to GLiNER.

    The notebook-level ``gliner_model`` is asked for 'driver' and 'position'
    entities; the post counts as a prediction when at least one 'position'
    entity is found. When the notebook-level ``debug`` flag is set, the found
    position entities are printed.

    Args:
        post_text: Text of a single post (or sentence).
        threshold: Minimum entity score passed to GLiNER.

    Returns:
        True if at least one 'position' entity was extracted. Empty,
        whitespace-only or non-string input yields False without calling the model.
    """
    # Nothing to extract from missing or blank text; skip the model call.
    if not isinstance(post_text, str) or not post_text.strip():
        return False

    # TODO: does GLiNER's performance improve with more context? if yes, refactor to chunking instead of going over each sentence individually
    # TODO: for some reason, if you include only 'position', the predictions are far worse than with 'driver' included
    with torch.no_grad():
        entities = gliner_model.predict_entities(post_text, ('driver', 'position',), threshold=threshold) # TODO: very low threshold
    position_entities = tuple(entity for entity in entities if entity['label'] == 'position')

    if debug:
        print(position_entities)
        print(tuple(position['text'] for position in position_entities))

    return bool(position_entities)


# Keep only the rows whose text contains at least one position entity.
predictions_df = df.loc[df['text'].apply(has_prediction)]
print(predictions_df.shape[0])
with display_full_dataframe():
    display(predictions_df.head())

Unnamed: 0,text
0,"Carlos Sainz is loving this upgraded car, good top 3 for the race tomorrow!"
1,"I disagree with you, verstappening will definitely finish first."
2,"I think BOT will finish behind NOR, who will probably finish 7th."
3,That's my opinion at least...
4,I predict that the RedBulls with finish 1-2.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


({'start': 47, 'end': 52, 'text': 'top 3', 'label': 'position', 'score': 0.6188601851463318},)
('top 3',)
()
()
({'start': 61, 'end': 64, 'text': '7th', 'label': 'position', 'score': 0.8359587788581848},)
('7th',)
()
()
({'start': 40, 'end': 43, 'text': '1-2', 'label': 'position', 'score': 0.8649768829345703},)
('1-2',)
({'start': 65, 'end': 71, 'text': 'points', 'label': 'position', 'score': 0.5379745960235596},)
('points',)
({'start': 14, 'end': 20, 'text': 'podium', 'label': 'position', 'score': 0.6982569694519043}, {'start': 35, 'end': 41, 'text': 'points', 'label': 'position', 'score': 0.4657585620880127})
('podium', 'points')
()
()
5


Unnamed: 0,text
0,"Carlos Sainz is loving this upgraded car, good top 3 for the race tomorrow!"
2,"I think BOT will finish behind NOR, who will probably finish 7th."
4,I predict that the RedBulls with finish 1-2.
5,"Nah, the Danish driver from Haas will almost certainly finish in points!"
6,Stroll on the podium and Vettel in points.


In [75]:
text = 'Carlos Sainz is loving this upgraded car, good top 3 for the race tomorrow! I disagree with you, verstappening will definitely finish first. I think BOT will finish behind NOR, who will probably finish 7th. That\'s my opinion at least... I predict that the RedBulls with finish 1-2. Nah, the Danish driver from Haas will almost certainly finish in points! Stroll on the podium and Vettel in points. I like cookies!'
# has_prediction() reads this flag; False silences its per-post printing.
debug = False
import dask.dataframe as dd
import warnings
warnings.filterwarnings(
    "ignore",
    message="Sentence of length .* has been truncated to .*",
    category=UserWarning
)


@contextmanager
def _timed(label):
    """Print the wall-clock seconds spent in the ``with`` body, followed by ``label``."""
    started_ns = time.perf_counter_ns()
    yield
    print((time.perf_counter_ns() - started_ns) / 10 ** 9, label)


if debug:
    # NOTE: this frame only has a 'text' column and `all_posts_df` is not set in
    # this branch, so the per-race loop below cannot run in debug mode.
    doc = nlp(text)
    posts_df = pd.DataFrame({'text': tuple(sentence.text for sentence in doc.sents)})
else:
    with _timed("load time"):
        all_posts_df = load_f1_df()


def display_posts_df(n=3):
    """Display the first ``n`` rows of the notebook-level ``posts_df`` without its index."""
    global posts_df

    with display_full_dataframe():
        display(hide_index(posts_df.head(n)))


# Prediction frame and mean absolute error per processed event.
predictions_dict = {}
mae_list = []

for index, race_weekend in schedule.iterrows():

    print(index, race_weekend)

    # Posts written between the first and the fifth session of the weekend (inclusive).
    first_post_at = typing.cast(dt.datetime, race_weekend['Session1DateUtc']) #- dt.timedelta(days=1)
    last_post_at = typing.cast(dt.datetime, race_weekend['Session5DateUtc'])
    in_weekend = (
        (all_posts_df['created_utc'] >= first_post_at) &
        (all_posts_df['created_utc'] <= last_post_at)
    )
    # Work on an independent copy: assigning a column to a filtered slice
    # triggers pandas' SettingWithCopyWarning.
    posts_df = all_posts_df[in_weekend].copy()
    print('number of posts for event: ', len(posts_df))

    # spelling correction
    with _timed("spell time"):
        posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)

    # only predictions
    with _timed("filter time"):
        posts_df = posts_df[posts_df['text'].apply(has_prediction)]
        print('number of posts with prediction: ', len(posts_df))

    # sentiment score
    with _timed("sentiment time"):
        comments = posts_df["text"].tolist()
        upvotes = posts_df["score"].tolist()
        sentiment = driver_sentiment(comments, F1_names, upvotes)
        scores = final_scores(sentiment)

    # final prediction
    with _timed("final pred time"):
        # prediction() expects a position in full_schedule, while `index` is a row label.
        event_position = full_schedule.index.get_loc(index)
        prediction_df = prediction(event_position, scores, n_events=5, historical_score_contribution=0.4)
        mae = prediction_df["error"].mean()
        display(prediction_df)
        print("the MAE for predicted vs true position is: ", mae)

    predictions_dict[index] = prediction_df
    mae_list.append(mae)
    print("\n" * 3)


# One worksheet per event, named after the event's schedule index.
with pd.ExcelWriter(config.DATA_DIR /"predictions.xlsx") as writer:
    for key, event_prediction_df in predictions_dict.items():
        event_prediction_df.to_excel(writer, sheet_name=f"Iteration_{key}", index=False)

# loaded_predictions = pd.read_excel(config.DATA_DIR /"predictions.xlsx", sheet_name=None)  # Returns a dictionary of DataFrames



91.6175755 load time
9 RoundNumber                                             8
Country                                        Azerbaijan
Location                                             Baku
OfficialEventName    FORMULA 1 AZERBAIJAN GRAND PRIX 2022
EventDate                             2022-06-12 00:00:00
EventName                           Azerbaijan Grand Prix
EventFormat                                  conventional
Session1                                       Practice 1
Session1Date                    2022-06-10 15:00:00+04:00
Session1DateUtc                       2022-06-10 11:00:00
Session2                                       Practice 2
Session2Date                    2022-06-10 18:00:00+04:00
Session2DateUtc                       2022-06-10 14:00:00
Session3                                       Practice 3
Session3Date                    2022-06-11 15:00:00+04:00
Session3DateUtc                       2022-06-11 11:00:00
Session4                                       Qu

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)


484.4104127 spell time
number of posts with prediction:  4940
1487.9600493 filter time


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


93.9827281 sentiment time


Request for URL https://ergast.com/api/f1/2022/8/results.json failed; using cached response
Traceback (most recent call last):
  File "c:\Users\raf\Documents\GitHub\f1-subreddits-nlp\.venv\Lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "c:\Users\raf\Documents\GitHub\f1-subreddits-nlp\.venv\Lib\site-packages\urllib3\connection.py", line 507, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2288.0_x64__qbz5n2kfra8p0\Lib\http\client.py", line 1428, in getresponse
    response.begin()
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2288.0_x64__qbz5n2kfra8p0\Lib\http\client.py", line 331, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsA

Unnamed: 0,driver_name,predicted_position,true_position,error,combined_score,sentiment_score,historical_score
0,Sergio Perez,1,2,1,0.376278,0.044675,0.873684
1,Charles Leclerc,2,19,17,0.236763,0.120921,0.410526
2,Max Verstappen,3,1,2,0.10963,-0.217284,0.6
3,George Russell,4,3,1,0.045392,-0.380487,0.684211
4,Valtteri Bottas,5,11,6,-0.045014,-0.320638,0.368421
5,Pierre Gasly,6,5,1,-0.062014,0.044011,-0.221053
6,Sebastian Vettel,7,6,1,-0.084919,0.033907,-0.263158
7,Carlos Sainz,8,20,12,-0.092345,-0.203031,0.073684
8,Yuki Tsunoda,9,13,4,-0.152342,-0.134605,-0.178947
9,Esteban Ocon,10,10,0,-0.168242,-0.343562,0.094737


the MAE for predicted vs true position is:  4.5
30.7066831 final pred time




10 RoundNumber                                                9
Country                                               Canada
Location                                            Montréal
OfficialEventName    FORMULA 1 AWS GRAND PRIX DU CANADA 2022
EventDate                                2022-06-19 00:00:00
EventName                                Canadian Grand Prix
EventFormat                                     conventional
Session1                                          Practice 1
Session1Date                       2022-06-17 14:00:00-04:00
Session1DateUtc                          2022-06-17 18:00:00
Session2                                          Practice 2
Session2Date                       2022-06-17 17:00:00-04:00
Session2DateUtc                          2022-06-17 21:00:00
Session3                                          Practice 3
Session3Date                       2022-06-18 13:00:00-04:00
Ses

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)


536.5389871 spell time
number of posts with prediction:  6047
1686.5113601 filter time


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


123.5654186 sentiment time


Request for URL https://ergast.com/api/f1/2022/9/results.json failed; using cached response
Traceback (most recent call last):
  File "c:\Users\raf\Documents\GitHub\f1-subreddits-nlp\.venv\Lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "c:\Users\raf\Documents\GitHub\f1-subreddits-nlp\.venv\Lib\site-packages\urllib3\connection.py", line 507, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2288.0_x64__qbz5n2kfra8p0\Lib\http\client.py", line 1428, in getresponse
    response.begin()
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.2288.0_x64__qbz5n2kfra8p0\Lib\http\client.py", line 331, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsA

Unnamed: 0,driver_name,predicted_position,true_position,error,combined_score,sentiment_score,historical_score
0,Max Verstappen,1,1,0,0.513441,0.217139,0.957895
1,Sergio Perez,2,20,18,0.320078,-0.048993,0.873684
2,Fernando Alonso,3,9,6,0.278517,0.471213,-0.010526
3,Mick Schumacher,4,19,15,0.224708,0.746443,-0.557895
4,George Russell,5,4,1,0.177256,-0.160714,0.684211
5,Alexander Albon,6,13,7,0.139342,0.44978,-0.326316
6,Lewis Hamilton,7,3,4,0.11892,-0.033379,0.347368
7,Daniel Ricciardo,8,11,3,0.112654,0.34916,-0.242105
8,Valtteri Bottas,9,7,2,0.106984,-0.025203,0.305263
9,Lando Norris,10,15,5,0.084727,0.035949,0.157895


the MAE for predicted vs true position is:  6.1
31.3143811 final pred time




11 RoundNumber                                                10
Country                                         Great Britain
Location                                          Silverstone
OfficialEventName    FORMULA 1 LENOVO BRITISH GRAND PRIX 2022
EventDate                                 2022-07-03 00:00:00
EventName                                  British Grand Prix
EventFormat                                      conventional
Session1                                           Practice 1
Session1Date                        2022-07-01 13:00:00+01:00
Session1DateUtc                           2022-07-01 12:00:00
Session2                                           Practice 2
Session2Date                        2022-07-01 16:00:00+01:00
Session2DateUtc                           2022-07-01 15:00:00
Session3                                           Practice 3
Session3Date                        2022-07-02 12:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)


490.3779851 spell time
number of posts with prediction:  4322
1633.4089815 filter time


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


79.7981343 sentiment time


Unnamed: 0,driver_name,predicted_position,true_position,error,combined_score,sentiment_score,historical_score
0,Carlos Sainz,1,1,0,0.491282,0.517048,0.452632
1,Lewis Hamilton,2,3,1,0.308197,0.141731,0.557895
2,Max Verstappen,3,7,4,0.285079,-0.163464,0.957895
3,Nicholas Latifi,4,12,8,0.246678,0.740955,-0.494737
4,Sergio Perez,5,2,3,0.199362,0.002446,0.494737
5,Esteban Ocon,6,15,9,0.166329,0.143882,0.2
6,Pierre Gasly,7,16,9,0.154703,0.377136,-0.178947
7,Fernando Alonso,8,5,3,0.148895,0.114824,0.2
8,George Russell,9,18,9,0.145927,-0.212929,0.684211
9,Lance Stroll,10,11,1,0.124194,0.382429,-0.263158


the MAE for predicted vs true position is:  5.0
6.2453158 final pred time




13 RoundNumber                                                  12
Country                                                  France
Location                                           Le Castellet
OfficialEventName    FORMULA 1 LENOVO GRAND PRIX DE FRANCE 2022
EventDate                                   2022-07-24 00:00:00
EventName                                     French Grand Prix
EventFormat                                        conventional
Session1                                             Practice 1
Session1Date                          2022-07-22 14:00:00+02:00
Session1DateUtc                             2022-07-22 12:00:00
Session2                                             Practice 2
Session2Date                          2022-07-22 17:00:00+02:00
Session2DateUtc                             2022-07-22 15:00:00
Session3                                             Practice 3
Session3Date           

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)


329.6218303 spell time
number of posts with prediction:  3653
1017.0384341 filter time


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


69.2208057 sentiment time


Unnamed: 0,driver_name,predicted_position,true_position,error,combined_score,sentiment_score,historical_score
0,Max Verstappen,1,1,0,0.318022,-0.010315,0.810526
1,Charles Leclerc,2,19,17,0.197547,0.05556,0.410526
2,Sergio Perez,3,4,1,0.190526,0.21228,0.157895
3,Lewis Hamilton,4,2,2,0.181264,-0.139998,0.663158
4,George Russell,5,3,2,0.127756,-0.046723,0.389474
5,Mick Schumacher,6,15,9,0.08315,0.328056,-0.284211
6,Carlos Sainz,7,5,2,0.054932,-0.04178,0.2
7,Lando Norris,8,7,1,0.013661,-0.110565,0.2
8,Daniel Ricciardo,9,9,0,-0.011409,0.002037,-0.031579
9,Valtteri Bottas,10,14,4,-0.046945,-0.043154,-0.052632


the MAE for predicted vs true position is:  4.2
7.5228592 final pred time




14 RoundNumber                                            13
Country                                           Hungary
Location                                         Budapest
OfficialEventName    FORMULA 1 ARAMCO MAGYAR NAGYDÍJ 2022
EventDate                             2022-07-31 00:00:00
EventName                            Hungarian Grand Prix
EventFormat                                  conventional
Session1                                       Practice 1
Session1Date                    2022-07-29 14:00:00+02:00
Session1DateUtc                       2022-07-29 12:00:00
Session2                                       Practice 2
Session2Date                    2022-07-29 17:00:00+02:00
Session2DateUtc                       2022-07-29 15:00:00
Session3                                       Practice 3
Session3Date                    2022-07-30 13:00:00+02:00
Session3DateUtc                       2022-07-30 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)


462.0566947 spell time
number of posts with prediction:  5298
1477.680807 filter time


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


115.870222 sentiment time


Unnamed: 0,driver_name,predicted_position,true_position,error,combined_score,sentiment_score,historical_score
0,George Russell,1,3,2,0.504376,0.552907,0.431579
1,Lewis Hamilton,2,2,0,0.321756,0.009945,0.789474
2,Max Verstappen,3,1,2,0.312698,-0.047258,0.852632
3,Fernando Alonso,4,8,4,0.274261,0.239559,0.326316
4,Esteban Ocon,5,9,4,0.157551,0.143287,0.178947
5,Alexander Albon,6,17,11,0.105955,0.422206,-0.368421
6,Carlos Sainz,7,4,3,0.01318,-0.069261,0.136842
7,Lando Norris,8,7,1,-0.017838,-0.149028,0.178947
8,Sergio Perez,9,5,4,-0.03122,-0.115192,0.094737
9,Charles Leclerc,10,6,4,-0.034733,-0.121046,0.094737


the MAE for predicted vs true position is:  3.6
5.9029275 final pred time




15 RoundNumber                                               14
Country                                              Belgium
Location                                   Spa-Francorchamps
OfficialEventName    FORMULA 1 ROLEX BELGIAN GRAND PRIX 2022
EventDate                                2022-08-28 00:00:00
EventName                                 Belgian Grand Prix
EventFormat                                     conventional
Session1                                          Practice 1
Session1Date                       2022-08-26 14:00:00+02:00
Session1DateUtc                          2022-08-26 12:00:00
Session2                                          Practice 2
Session2Date                       2022-08-26 17:00:00+02:00
Session2DateUtc                          2022-08-26 15:00:00
Session3                                          Practice 3
Session3Date                       2022-08-27 13:00:00+02:00
Sess

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)


409.9977369 spell time
number of posts with prediction:  4876
1279.7794865 filter time


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


102.0445271 sentiment time


Unnamed: 0,driver_name,predicted_position,true_position,error,combined_score,sentiment_score,historical_score
0,Lewis Hamilton,1,20,19,0.46096,0.213882,0.831579
1,Max Verstappen,2,1,1,0.398075,0.095037,0.852632
2,George Russell,3,4,1,0.205312,0.054468,0.431579
3,Carlos Sainz,4,3,1,0.197474,0.013334,0.473684
4,Lando Norris,5,12,7,0.173132,0.141186,0.221053
5,Fernando Alonso,6,5,1,0.142517,0.03402,0.305263
6,Alexander Albon,7,10,3,0.125687,0.525268,-0.473684
7,Mick Schumacher,8,17,9,0.120719,0.334531,-0.2
8,Esteban Ocon,9,7,2,0.016806,-0.105323,0.2
9,Lance Stroll,10,11,1,0.006466,0.045864,-0.052632


the MAE for predicted vs true position is:  5.3
5.4482897 final pred time




16 RoundNumber                                                15
Country                                           Netherlands
Location                                            Zandvoort
OfficialEventName    FORMULA 1 HEINEKEN DUTCH GRAND PRIX 2022
EventDate                                 2022-09-04 00:00:00
EventName                                    Dutch Grand Prix
EventFormat                                      conventional
Session1                                           Practice 1
Session1Date                        2022-09-02 12:30:00+02:00
Session1DateUtc                           2022-09-02 10:30:00
Session2                                           Practice 2
Session2Date                        2022-09-02 16:00:00+02:00
Session2DateUtc                           2022-09-02 14:00:00
Session3                                           Practice 3
Session3Date                        2022-09-03 12:0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)


489.541816 spell time
number of posts with prediction:  5645
1548.5187807 filter time


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


112.1965997 sentiment time


Unnamed: 0,driver_name,predicted_position,true_position,error,combined_score,sentiment_score,historical_score
0,Max Verstappen,1,1,0,0.542861,0.336347,0.852632
1,Lando Norris,2,7,5,0.220536,0.178086,0.284211
2,George Russell,3,2,1,0.185459,0.021379,0.431579
3,Carlos Sainz,4,8,4,0.153366,-0.046145,0.452632
4,Esteban Ocon,5,9,4,-0.022147,-0.15621,0.178947
5,Mick Schumacher,6,13,7,-0.02465,0.06418,-0.157895
6,Charles Leclerc,7,3,4,-0.077496,-0.360739,0.347368
7,Pierre Gasly,8,11,3,-0.086541,0.017168,-0.242105
8,Fernando Alonso,9,6,3,-0.126117,-0.469844,0.389474
9,Lewis Hamilton,10,4,6,-0.131806,-0.535467,0.473684


the MAE for predicted vs true position is:  3.3
5.1770148 final pred time




17 RoundNumber                                                   16
Country                                                    Italy
Location                                                   Monza
OfficialEventName    FORMULA 1 PIRELLI GRAN PREMIO D’ITALIA 2022
EventDate                                    2022-09-11 00:00:00
EventName                                     Italian Grand Prix
EventFormat                                         conventional
Session1                                              Practice 1
Session1Date                           2022-09-09 14:00:00+02:00
Session1DateUtc                              2022-09-09 12:00:00
Session2                                              Practice 2
Session2Date                           2022-09-09 17:00:00+02:00
Session2DateUtc                              2022-09-09 15:00:00
Session3                                              Practice 3
Session3D

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)


484.681356 spell time
number of posts with prediction:  6043
1444.9464709 filter time


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


137.9143973 sentiment time


Unnamed: 0,driver_name,predicted_position,true_position,error,combined_score,sentiment_score,historical_score
0,Charles Leclerc,1,2.0,1.0,0.342191,0.324704,0.368421
1,Sebastian Vettel,2,20.0,18.0,0.330749,0.656511,-0.157895
2,Carlos Sainz,3,4.0,1.0,0.302658,0.300921,0.305263
3,Max Verstappen,4,1.0,3.0,0.290794,-0.167976,0.978947
4,George Russell,5,3.0,2.0,0.116823,-0.317576,0.768421
5,Nyck De Vries,6,9.0,3.0,0.040996,0.068327,0.0
6,Lewis Hamilton,7,5.0,2.0,0.040104,-0.234915,0.452632
7,Esteban Ocon,8,11.0,3.0,0.025832,-0.160456,0.305263
8,Fernando Alonso,9,19.0,10.0,-0.033286,-0.301091,0.368421
9,Mick Schumacher,10,12.0,2.0,-0.051113,0.09025,-0.263158


the MAE for predicted vs true position is:  4.95
6.7275044 final pred time




18 RoundNumber                                                         17
Country                                                      Singapore
Location                                                    Marina Bay
OfficialEventName    FORMULA 1 SINGAPORE AIRLINES SINGAPORE GRAND P...
EventDate                                          2022-10-02 00:00:00
EventName                                         Singapore Grand Prix
EventFormat                                               conventional
Session1                                                    Practice 1
Session1Date                                 2022-09-30 18:00:00+08:00
Session1DateUtc                                    2022-09-30 10:00:00
Session2                                                    Practice 2
Session2Date                                 2022-09-30 21:00:00+08:00
Session2DateUtc                                    2022-09-30 13:0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)


454.9968806 spell time
number of posts with prediction:  4221
1318.605746 filter time


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


75.971078 sentiment time


Unnamed: 0,driver_name,predicted_position,true_position,error,combined_score,sentiment_score,historical_score
0,Sergio Perez,1,1.0,0.0,0.27982,0.038297,0.642105
1,Max Verstappen,2,7.0,5.0,0.273891,-0.210181,1.0
2,Carlos Sainz,3,3.0,0.0,0.249688,0.016147,0.6
3,Lando Norris,4,4.0,0.0,0.228648,0.205642,0.263158
4,Lewis Hamilton,5,9.0,4.0,0.137557,-0.044423,0.410526
5,Charles Leclerc,6,2.0,4.0,0.113951,-0.041661,0.347368
6,Daniel Ricciardo,7,5.0,2.0,0.100281,0.454854,-0.431579
7,George Russell,8,14.0,6.0,0.057481,-0.430515,0.789474
8,Fernando Alonso,9,18.0,9.0,0.000557,-0.118371,0.178947
9,Esteban Ocon,10,16.0,6.0,-0.001672,-0.122085,0.178947


the MAE for predicted vs true position is:  4.55
6.4128673 final pred time




19 RoundNumber                                                18
Country                                                 Japan
Location                                               Suzuka
OfficialEventName    FORMULA 1 HONDA JAPANESE GRAND PRIX 2022
EventDate                                 2022-10-09 00:00:00
EventName                                 Japanese Grand Prix
EventFormat                                      conventional
Session1                                           Practice 1
Session1Date                        2022-10-07 12:00:00+09:00
Session1DateUtc                           2022-10-07 03:00:00
Session2                                           Practice 2
Session2Date                        2022-10-07 15:00:00+09:00
Session2DateUtc                           2022-10-07 06:00:00
Session3                                           Practice 3
Session3Date                        2022-10-08 12:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)


332.1612929 spell time
number of posts with prediction:  3352
1035.9640316 filter time


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


77.554324 sentiment time


Unnamed: 0,driver_name,predicted_position,true_position,error,combined_score,sentiment_score,historical_score
0,Sebastian Vettel,1,6.0,5.0,0.424316,0.812457,-0.157895
1,Sergio Perez,2,2.0,0.0,0.205663,-0.127403,0.705263
2,Max Verstappen,3,1.0,2.0,0.164616,-0.308097,0.873684
3,Nyck De Vries,4,,,0.107974,0.158904,0.031579
4,Charles Leclerc,5,3.0,2.0,0.029413,-0.421153,0.705263
5,Pierre Gasly,6,18.0,12.0,0.020418,-0.001057,0.052632
6,Lewis Hamilton,7,5.0,2.0,0.001431,-0.173054,0.263158
7,Esteban Ocon,8,4.0,4.0,-0.009688,-0.023164,0.010526
8,Lando Norris,9,10.0,1.0,-0.010423,-0.234915,0.326316
9,Fernando Alonso,10,7.0,3.0,-0.018058,0.019025,-0.073684


the MAE for predicted vs true position is:  4.15
5.0748203 final pred time




20 RoundNumber                                                      19
Country                                               United States
Location                                                     Austin
OfficialEventName    FORMULA 1 ARAMCO UNITED STATES GRAND PRIX 2022
EventDate                                       2022-10-23 00:00:00
EventName                                  United States Grand Prix
EventFormat                                            conventional
Session1                                                 Practice 1
Session1Date                              2022-10-21 14:00:00-05:00
Session1DateUtc                                 2022-10-21 19:00:00
Session2                                                 Practice 2
Session2Date                              2022-10-21 17:00:00-05:00
Session2DateUtc                                 2022-10-21 22:00:00
Session3                          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)


348.9923451 spell time
number of posts with prediction:  3155
1042.7880554 filter time


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


52.8492954 sentiment time


Unnamed: 0,driver_name,predicted_position,true_position,error,combined_score,sentiment_score,historical_score
0,Sergio Perez,1,4.0,3.0,0.621123,0.522924,0.768421
1,Carlos Sainz,2,20.0,18.0,0.430621,0.500158,0.326316
2,Max Verstappen,3,1.0,2.0,0.34873,-0.001239,0.873684
3,Sebastian Vettel,4,8.0,4.0,0.33783,0.612174,-0.073684
4,Charles Leclerc,5,3.0,2.0,0.292163,-0.025343,0.768421
5,Lando Norris,6,6.0,0.0,0.274814,0.282585,0.263158
6,Pierre Gasly,7,14.0,7.0,0.248063,0.46256,-0.073684
7,Lance Stroll,8,18.0,10.0,0.129991,0.27981,-0.094737
8,Fernando Alonso,9,7.0,2.0,0.118215,0.232112,-0.052632
9,Esteban Ocon,10,11.0,1.0,0.106652,0.10056,0.115789


the MAE for predicted vs true position is:  5.35
7.4780905 final pred time




21 RoundNumber                                                         20
Country                                                         Mexico
Location                                                   Mexico City
OfficialEventName    FORMULA 1 HEINEKEN GRAN PREMIO DE LA CIUDAD DE...
EventDate                                          2022-10-30 00:00:00
EventName                                       Mexico City Grand Prix
EventFormat                                               conventional
Session1                                                    Practice 1
Session1Date                                 2022-10-28 13:00:00-06:00
Session1DateUtc                                    2022-10-28 19:00:00
Session2                                                    Practice 2
Session2Date                                 2022-10-28 16:00:00-06:00
Session2DateUtc                                    2022-10-28 22:0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)


403.3533893 spell time
number of posts with prediction:  3458
1172.3092808 filter time


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


74.9522495 sentiment time


Unnamed: 0,driver_name,predicted_position,true_position,error,combined_score,sentiment_score,historical_score
0,Esteban Ocon,1,8.0,7.0,0.452332,0.732834,0.031579
1,Max Verstappen,2,1.0,1.0,0.436404,0.144883,0.873684
2,Sergio Perez,3,3.0,0.0,0.237523,-0.088339,0.726316
3,Charles Leclerc,4,6.0,2.0,0.175471,-0.261934,0.831579
4,Lewis Hamilton,5,2.0,3.0,0.085359,-0.2437,0.578947
5,George Russell,6,4.0,2.0,0.055333,-0.195497,0.431579
6,Sebastian Vettel,7,14.0,7.0,0.006955,0.060714,-0.073684
7,Nyck De Vries,8,,,-0.014663,-0.04549,0.031579
8,Lando Norris,9,9.0,0.0,-0.02084,-0.294383,0.389474
9,Lance Stroll,10,15.0,5.0,-0.02331,0.122554,-0.242105


the MAE for predicted vs true position is:  3.75
6.317677 final pred time




23 RoundNumber                                                         22
Country                                                      Abu Dhabi
Location                                                    Yas Island
OfficialEventName    FORMULA 1 ETIHAD AIRWAYS ABU DHABI GRAND PRIX ...
EventDate                                          2022-11-20 00:00:00
EventName                                         Abu Dhabi Grand Prix
EventFormat                                               conventional
Session1                                                    Practice 1
Session1Date                                 2022-11-18 14:00:00+04:00
Session1DateUtc                                    2022-11-18 10:00:00
Session2                                                    Practice 2
Session2Date                                 2022-11-18 17:00:00+04:00
Session2DateUtc                                    2022-11-18 13:00

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posts_df['text'] = posts_df['text'].apply(preprocessing.correct_spelling_in_text_spacy)


357.9886337 spell time
number of posts with prediction:  4098
1086.479302 filter time


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


85.0607304 sentiment time


Unnamed: 0,driver_name,predicted_position,true_position,error,combined_score,sentiment_score,historical_score
0,Max Verstappen,1,1,0,0.318833,0.019108,0.768421
1,Sergio Perez,2,3,1,0.259843,-0.065173,0.747368
2,Lance Stroll,3,8,5,0.160793,0.387287,-0.178947
3,Charles Leclerc,4,2,2,0.158724,-0.219671,0.726316
4,George Russell,5,5,0,0.045456,-0.211959,0.431579
5,Esteban Ocon,6,7,1,0.027094,-0.032037,0.115789
6,Daniel Ricciardo,7,9,2,0.026455,0.13532,-0.136842
7,Valtteri Bottas,8,15,7,0.010971,0.179688,-0.242105
8,Fernando Alonso,9,20,11,-0.001076,0.047329,-0.073684
9,Lando Norris,10,6,4,-0.003051,-0.082278,0.115789


the MAE for predicted vs true position is:  3.8
11.7538665 final pred time






In [70]:
# necessary: pip install openpyxl
# with pd.ExcelWriter(config.DATA_DIR /"predictions.xlsx") as writer:
#     for key, df in predictions_dict.items():
#         df.to_excel(writer, sheet_name=f"Iteration_{key}", index=False)

In [44]:
# Aggregate the per-sheet prediction errors into one overall mean absolute error.
# sheet_name=None makes read_excel return a dict mapping sheet name -> DataFrame.
loaded_predictions = pd.read_excel(config.DATA_DIR /"predictions.xlsx", sheet_name=None)
total_error = 0
total_count = 0

for sheet_name, df in loaded_predictions.items():
    # Drivers without a known true position have a NaN error. Series.sum()
    # skips NaN, so the denominator must count only the non-NaN rows as well
    # (a hard-coded 20 per sheet would treat missing errors as perfect hits).
    total_error += df['error'].sum()
    total_count += int(df['error'].count())

# Guard against an empty workbook / all-NaN errors instead of dividing by zero.
overall_mae = total_error / total_count if total_count else float("nan")

print(f"Overall MAE: {overall_mae}")


Overall MAE: 4.503846153846154


In [None]:
# Stage 1 - sentiment score: pass every post's text and its "score" column to
# driver_sentiment, then reduce the result with final_scores. The stage is timed.
start = time.perf_counter_ns()
comments, upvotes = (posts_df[column].tolist() for column in ("text", "score"))
sentiment = driver_sentiment(comments, F1_names, upvotes)
scores = final_scores(sentiment)
end = time.perf_counter_ns()
print(f"{(end - start) / 10 ** 9} sentiment time")
# print("Drivers ranked by (positive - negative score):")
# for driver, score in scores:
#     print(f"{driver}: {score:.4f}")

# Stage 2 - final prediction for the event identified by `index`, also timed.
start = time.perf_counter_ns()
pred = prediction(index, scores, n_events=5, historical_score_contribution=0.75)
end = time.perf_counter_ns()
print(f"{(end - start) / 10 ** 9} final pred time")


# Remember the prediction for this event and list the predicted finishing order.
predictions_dict[index] = pred
position = 0  # stays 0 when `pred` is empty
for position, (driver, score) in enumerate(pred, start=1):
    print(f"{driver} finishes in position:{position}      {score:.4f}")

In [78]:
# Load the text column of the F1 posts with the loader limit set to 2000.
raw_posts = load_f1_df(2000)['text']
display(raw_posts.head())

# Spell-correct every post, then keep only those flagged by has_prediction.
corrected_posts = raw_posts.apply(preprocessing.correct_spelling_in_text_spacy)
keep_mask = corrected_posts.apply(has_prediction)
validation_posts = corrected_posts[keep_mask]
print(len(validation_posts))
with display_full_dataframe():
    display(validation_posts.head())

# Persist the filtered sample to the data directory.
validation_posts.to_csv(config.DATA_DIR /'file1.csv')
    

0    [Discussion] Could professional ESports driver...
1      Questions concerning Alonso's future. [removed]
2    Verstappen now has as many poles as Leclerc - ...
3    Perez wins as Red Bull delivers race strategy ...
4    The "new" qualifying since (I think 2021?) 202...
Name: text, dtype: object

435


5                                  [joe award] sources saying that peter layer has gone from his position as head of F1 at ﻿the max verstappen...checking now. 
13                                                                                                       2022 monaco of odis F1 face debrief - mercedes-AMGF1. 
34    remember this team, check and esteban ocon used to be star for this team before ﻿the downfall. a bought this limited edition in 2012 INDIAN of. [deleted]
38                                                                   wolff explains why mercedes ton't a-turn on W13 stuck in 'no man's land' | RacingNews365. 
40                         sebastian vettel delighted not to leave monaco 'empty handed' after P10 finish, as lance stroll takes positives from 'tricky race ' 
Name: text, dtype: object

In [None]:
import ast


def _per_driver_abs_errors(model_scores, gpt_scores):
    """Return one absolute error per driver scored by either side.

    Args:
        model_scores: iterable of ``(driver, score)`` pairs from the model.
        gpt_scores: iterable of ``(driver, score)`` pairs from the GPT labels.

    Returns:
        A list with ``abs(model - gpt)`` for every driver present in at least
        one of the inputs. A driver missing on one side is treated as having
        score 0 there, so the error is the absolute value of the other score.
    """
    model_by_driver = dict(model_scores)
    gpt_by_driver = dict(gpt_scores)
    # Match by driver name over the union of both sets. Pairing the two sorted
    # lists positionally would compare unrelated drivers and drop the tail of
    # the longer list whenever the two sides mention different drivers.
    drivers = sorted(model_by_driver.keys() | gpt_by_driver.keys(), key=str)
    return [
        abs(model_by_driver.get(driver, 0) - gpt_by_driver.get(driver, 0))
        for driver in drivers
    ]


# Reference labels: one row per validation post, with a stringified sentiment dict.
gpt_df = pd.read_csv(config.DATA_DIR / 'validation_labels' /'sentiment_analysis_output.csv')
display(gpt_df.head())

error = []

for i, (text, sentiment_str) in enumerate(zip(gpt_df["text"], gpt_df["sentiments"])):
    # The CSV stores each sentiment dict as its repr; parse it back safely.
    if isinstance(sentiment_str, str):
        try:
            sentiment = ast.literal_eval(sentiment_str)
        except (ValueError, SyntaxError) as exc:
            raise ValueError(f"Invalid dictionary format in row {i}: {sentiment_str}") from exc
    else:
        sentiment = sentiment_str

    # Score the single post with the model, using a constant weight of 1.
    results = driver_sentiment([text], F1_names, [1])

    model_score = final_scores(results)
    gpt_score = final_scores(sentiment)

    # Collect the absolute error for each driver mentioned by either scorer.
    error.extend(_per_driver_abs_errors(model_score, gpt_score))

print(error)
# No comparable driver scores at all -> report NaN instead of dividing by zero.
mae = sum(error) / len(error) if error else float("nan")
print(mae)

Unnamed: 0.1,Unnamed: 0,text,sentiments
0,5,[joe award] sources saying that peter layer ha...,{'max verstappen': {'positive': 0.403772960701...
1,13,2022 monaco of odis F1 face debrief - mercedes...,{}
2,34,"remember this team, check and esteban ocon use...",{'esteban ocon': {'positive': 0.39354195972687...
3,38,wolff explains why mercedes ton't a-turn on W1...,{}
4,40,sebastian vettel delighted not to leave monaco...,{'lance stroll': {'positive': 0.18186713467381...


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Device set to use cuda:0
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Device set to use cuda:0
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Device set to use cuda:0
Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Device set to use cu

[0.35830570338929535, 0.20010630667677304, 0.061930222717736017, 0.9281061546037164, 0.06475281509496339, 0.2806137224944615, 0.8206749294642313, 0.572626201325576, 0.9791685725970545, 0.07019391826146212, 0.9365414868839482, 0.14037259260984225, 0.4196744110526204, 0.5250916419236265, 0.24456256895681397, 0.41125958480799074, 0.041386549078154033, 0.07270913928383432, 0.4233533799586791, 0.12499013128645914, 0.7274522977741819, 0.6261146062960203, 0.9731661499530903, 0.2674053968232901, 0.08105756343518622, 1.0544863816575436, 0.4257876970753713, 0.8777928190376841, 0.40077767640885464, 0.8749320093777258, 1.216304481893015, 0.8358491881398571, 1.168821119166843, 0.38150154894363236, 0.2213624959738328, 0.24089115208939232, 0.24397105206677427, 0.047847557806152685, 0.006556391337724954, 0.4254495328472977, 0.045416861340499104, 0.36051511402883957, 0.04869628860113348, 0.05058060188240596, 0.278060452273399, 0.1233408966270425, 0.11705408677296297, 0.03143107055580313, 0.497211547271