In [60]:
from pprint import pprint
from collections.abc import Generator, Callable
from pathlib import Path
import typing
from typing import Any, TypeAlias
import pandas as pd
import numpy as np
import datetime as dt
import re
from functools import partial, reduce
from tqdm import tqdm
from IPython.display import (
    display, # type: ignore[reportUnknownVariableType]
    Markdown,
)

import importlib

import spacy
from nltk.tokenize import word_tokenize as tokenize_nltk
import nltk
nltk.download('punkt_tab')

from config.fastf1 import fastf1
from config import config
from src.data.loader import stream_ndjson, load_submissions_df, load_comments_df
from src.data import preprocessing
importlib.reload(preprocessing)
import src.data.constants as dataset_constants

from src.utils import (
    temporary_pandas_options,
    display_full_dataframe,
    hide_index,
    compose,
)
from src import utils
utils.set_random_seeds()

import logging
logging.getLogger('fastf1').setLevel(logging.WARNING)



[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\brand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\brand\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [61]:
# Load the raw Reddit dumps for both subreddits into DataFrames.
#
# For r/formula1 the NDJSON streamer is pre-bound with limit=3000000.
# NOTE(review): presumably `limit` caps the number of records read from each
# file -- confirm against src.data.loader.stream_ndjson.
f1_ndjson_streamer = partial(stream_ndjson, limit=3000000)
# Unbounded alternative (no `limit` bound), kept for full runs:
#f1_ndjson_streamer = partial(stream_ndjson)

# r/formula1: both loaders receive the pre-bound streamer defined above.
f1_submissions_df = load_submissions_df(dataset_constants.RawFile.FORMULA1_SUBMISSIONS, f1_ndjson_streamer)
f1_comments_df = load_comments_df(dataset_constants.RawFile.FORMULA1_COMMENTS, f1_ndjson_streamer)

# r/formula1point5: no streamer is passed, so the loaders fall back to their own default.
f15_submissions_df = load_submissions_df(dataset_constants.RawFile.FORMULA1POINT5_SUBMISSIONS)
f15_comments_df = load_comments_df(dataset_constants.RawFile.FORMULA1POINT5_COMMENTS)

In [62]:
# Number of rows previewed per frame.
n = 4

# Show a heading followed by the first `n` rows of each raw frame.
# display() renders its arguments one after another, so a single call
# produces the same heading/table sequence as four separate calls.
with display_full_dataframe():
    display(
        Markdown('### r/formula1 submissions:'),
        f1_submissions_df.head(n),
        Markdown('### r/formula1 comments:'),
        f1_comments_df.head(n),
        Markdown('### r/formula1point5 submissions:'),
        f15_submissions_df.head(n),
        Markdown('### r/formula1point5 comments:'),
        f15_comments_df.head(n),
    )

### r/formula1 submissions:

Unnamed: 0,author,selftext,gilded,title,id,score,created_utc
0,[deleted],[removed],0,[Discussion] Could professional ESports drivers drive a real F1 car? How realistic are the sims?,v2fbpg,1,2022-06-01 12:00:41
1,Doomaster14,[removed],0,Questions concerning Alonso's future,v2fh6w,2,2022-06-01 12:07:50
2,motorace_addict,,0,Verstappen now has as many poles as Leclerc - but six times as many wins | 2022 Monaco Grand Prix stats and facts,v2fmeh,1393,2022-06-01 12:15:14
3,MrTuxedo1,,0,Perez wins as Red Bull delivers race strategy blow to Ferrari - Mika Häkkinen’s thoughts on the Monaco Grand Prix,v2frea,161,2022-06-01 12:23:16


### r/formula1 comments:

Unnamed: 0,author,gilded,id,created_utc,score,body
0,CowsWantToKillMe,0,iaq4tev,2022-06-01 00:00:57,1,top part of the wing got shaken off in the tunnel.
1,doc_55lk,0,iaq4urr,2022-06-01 00:01:15,0,That's been the rumour with Mercedes lately cuz in previous seasons Bottas hasn't been the luckiest.
2,Organic-Measurement2,0,iaq4wpz,2022-06-01 00:01:41,3,"Ah well, it's looking great already!"
3,not_right,0,iaq4x1h,2022-06-01 00:01:46,10,And Ferrari would get them all wrong.


### r/formula1point5 submissions:

Unnamed: 0,author,selftext,gilded,title,id,score,created_utc
0,orfeomclaren,,0,Formula 1 - Hakkinen vs Schumacher - Spa-Francorchamps 2000,v6qyud,1,2022-06-07 09:21:41
1,orfeomclaren,,0,Formula 1 2003 - Rd 2 - Malaysian Grand Prix [Highlights] - Kimi Raikkonen Maiden Win,v6viae,1,2022-06-07 13:26:25
2,orfeomclaren,,0,Formula 1 2003 - Rd 9 - European Grand Prix (Nurburgring) [Highlights],v8bwj6,1,2022-06-09 08:12:22
3,ms_creativity,,0,Red Bull drivers free to fight each other,v8f1dk,1,2022-06-09 11:48:11


### r/formula1point5 comments:

Unnamed: 0,author,gilded,id,created_utc,score,body
0,debrek,0,iaqwofj,2022-06-01 03:50:49,3,What is your team name please?
1,IgnisVizsla,0,iar7xgu,2022-06-01 05:54:28,2,"It's lazily named team F1.5 and my name there is the same as my username here (Ignis Vizsla), I'm 34th on the leaderboard there for reference"
2,debrek,0,iar9z0m,2022-06-01 06:20:29,3,I had removed you as I thought you were inactive since you had a number of teams with an invalid team. I re-added you to the list.
3,IgnisVizsla,0,iarc3x7,2022-06-01 06:49:13,3,"Yeah that's my fault, I forgot to update my team after the rules changed as I always remembered only after quali and that was too late, I finally changed before Monaco though"


In [63]:
# Merge submissions and comments of each subreddit into one "posts" frame.
f1_df = preprocessing.concatenate_submissions_and_comments(
    f1_submissions_df,
    f1_comments_df,
)
f15_df = preprocessing.concatenate_submissions_and_comments(
    f15_submissions_df,
    f15_comments_df,
)

# Number of rows previewed per frame.
n = 3

# One display() call renders heading/table pairs in the order given.
with display_full_dataframe():
    display(
        Markdown('### r/formula1 posts:'),
        f1_df.head(n),
        Markdown('### r/formula1point5 posts:'),
        f15_df.head(n),
    )

### r/formula1 posts:

Unnamed: 0,author,gilded,id,score,created_utc,text
0,[deleted],0,v2fbpg,1,2022-06-01 12:00:41,[Discussion] Could professional ESports drivers drive a real F1 car? How realistic are the sims? [removed]
1,Doomaster14,0,v2fh6w,2,2022-06-01 12:07:50,Questions concerning Alonso's future. [removed]
2,motorace_addict,0,v2fmeh,1393,2022-06-01 12:15:14,Verstappen now has as many poles as Leclerc - but six times as many wins | 2022 Monaco Grand Prix stats and facts.


### r/formula1point5 posts:

Unnamed: 0,author,gilded,id,score,created_utc,text
0,orfeomclaren,0,v6qyud,1,2022-06-07 09:21:41,Formula 1 - Hakkinen vs Schumacher - Spa-Francorchamps 2000.
1,orfeomclaren,0,v6viae,1,2022-06-07 13:26:25,Formula 1 2003 - Rd 2 - Malaysian Grand Prix [Highlights] - Kimi Raikkonen Maiden Win.
2,orfeomclaren,0,v8bwj6,1,2022-06-09 08:12:22,Formula 1 2003 - Rd 9 - European Grand Prix (Nurburgring) [Highlights]


In [64]:
# Imports

import nltk
from nltk.util import ngrams
from collections import defaultdict, Counter
import math


In [65]:
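# Prepare data: two small hand-written corpora of pre-tokenized, lowercased driver/team transfer phrases (processed_testdata and processed_data)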

# Six hand-written, already tokenized and lowercased example phrases
# (one list of tokens per phrase), each naming a driver and a team.
# NOTE(review): not referenced by any cell visible in this part of the
# notebook -- confirm whether it is still needed.
processed_testdata = [
    ['ricciardo', 'to', 'red', 'bull'],
    ['hamilton', 'to', 'stay', 'mercedes'],
    ['alonso', 'to', 'aston', 'martin'],
    ['max', 'verstappen', 'to', 'ferrari'],
    ['max', 'verstappen', 'stay', 'red', 'bull'],
    ['max', 'verstappen', 'stay', 'by', 'red', 'bull']
]

processed_data = [
    ['max', 'verstappen', 'joins', 'red', 'bull'],
    ['charles', 'leclerc', 'signs', 'with', 'ferrari'],
    ['sergio', 'perez', 'moves', 'to', 'red', 'bull'],
    ['george', 'russell', 'stays', 'at', 'mercedes'],
    ['carlos', 'sainz', 'extends', 'his', 'deal', 'with', 'ferrari'],
    ['lewis', 'hamilton', 'retires', 'from', 'formula', 'one'],
    ['lando', 'norris', 'leaves', 'mclaren', 'for', 'ferrari'],
    ['esteban', 'ocon', 'promoted', 'to', 'alpine', 'renault'],
    ['fernando', 'alonso', 'joins', 'aston', 'martin'],
    ['valtteri', 'bottas', 'shifts', 'to', 'alfa', 'romeo'],
    ['daniel', 'ricciardo', 'returns', 'to', 'red', 'bull'],
    ['sebastian', 'vettel', 'steps', 'back', 'from', 'racing'],
    ['kevin', 'magnussen', 'remains', 'at', 'haas'],
    ['pierre', 'gasly', 'partners', 'with', 'alphatauri'],
    ['lance', 'stroll', 'stays', 'at', 'aston', 'martin'],
    ['mick', 'schumacher', 'leaves', 'haas', 'for', 'alfa', 'romeo'],
    ['yuki', 'tsunoda', 'signs', 'new', 'contract', 'with', 'alphatauri'],
    ['zhou', 'guanyu', 'extends', 'stay', 'at', 'alfa', 'romeo'],
    ['alexander', 'albon', 'linked', 'to', 'red', 'bull', 'return'],
    ['nicholas', 'latifi', 'retires', 'from', 'formula', 'one'],
    ['nyck', 'vries', 'replaces', 'perez', 'at', 'red', 'bull'],
    ['nico', 'hulkenberg', 'joins', 'haas'],
    ['oscar', 'piastri', 'promoted', 'to', 'mclaren'],
    ['liam', 'lawson', 'signs', 'with', 'alphatauri'],
    ['logan', 'sargeant', 'commits', 'to', 'williams'],
    ['max', 'verstappen', 'leaves', 'red', 'bull', 'for', 'ferrari'],
    ['charles', 'leclerc', 'moves', 'to', 'mercedes'],
    ['sergio', 'perez', 'replaces', 'alonso', 'at', 'aston', 'martin'],
    ['george', 'russell', 'partners', 'with', 'hamilton', 'at', 'mercedes'],
    ['carlos', 'sainz', 'remains', 'loyal', 'to', 'ferrari'],
    ['lewis', 'hamilton', 'extends', 'his', 'deal', 'with', 'mercedes'],
    ['lando', 'norris', 'joins', 'red', 'bull'],
    ['esteban', 'ocon', 'stays', 'at', 'alpine'],
    ['fernando', 'alonso', 'retires', 'from', 'formula', 'one'],
    ['valtteri', 'bottas', 'switches', 'to', 'sauber'],
    ['daniel', 'ricciardo', 'linked', 'with', 'mclaren', 'return'],
    ['sebastian', 'vettel', 'signs', 'with', 'aston', 'martin'],
    ['kevin', 'magnussen', 'leaves', 'haas', 'for', 'red', 'bull'],
    ['pierre', 'gasly', 'extends', 'stay', 'with', 'alpine', 'renault'],
    ['lance', 'stroll', 'moves', 'to', 'aston', 'martin'],
    ['mick', 'schumacher', 'replaces', 'hulkenberg', 'at', 'haas'],
    ['yuki', 'tsunoda', 'extends', 'contract', 'with', 'alphatauri'],
    ['zhou', 'guanyu', 'moves', 'to', 'mclaren'],
    ['alexander', 'albon', 'joins', 'williams'],
    ['nicholas', 'latifi', 'steps', 'back', 'from', 'formula', 'one'],
    ['nyck', 'vries', 'partners', 'with', 'red', 'bull'],
    ['nico', 'hulkenberg', 'signs', 'with', 'haas'],
    ['oscar', 'piastri', 'replaces', 'norris', 'at', 'mclaren'],
    ['liam', 'lawson', 'commits', 'to', 'alphatauri'],
    ['logan', 'sargeant', 'linked', 'to', 'williams'],
    ['max', 'verstappen', 'signs', 'long-term', 'deal', 'with', 'red', 'bull'],
    ['charles', 'leclerc', 'remains', 'at', 'ferrari'],
    ['sergio', 'perez', 'steps', 'away', 'from', 'formula', 'one'],
    ['george', 'russell', 'linked', 'to', 'ferrari'],
    ['carlos', 'sainz', 'moves', 'to', 'aston', 'martin'],
    ['lewis', 'hamilton', 'joins', 'ferrari'],
    ['lando', 'norris', 'replaces', 'russell', 'at', 'mercedes'],
    ['esteban', 'ocon', 'returns', 'to', 'alpine'],
    ['fernando', 'alonso', 'signs', 'with', 'aston', 'martin'],
    ['valtteri', 'bottas', 'retires', 'from', 'formula', 'one'],
    ['daniel', 'ricciardo', 'replaces', 'verstappen', 'at', 'red', 'bull'],
    ['sebastian', 'vettel', 'joins', 'sauber'],
    ['kevin', 'magnussen', 'linked', 'to', 'ferrari'],
    ['pierre', 'gasly', 'remains', 'with', 'alpine'],
    ['lance', 'stroll', 'stays', 'with', 'aston', 'martin'],
    ['mick', 'schumacher', 'joins', 'alfa', 'romeo'],
    ['yuki', 'tsunoda', 'remains', 'with', 'alphatauri'],
    ['zhou', 'guanyu', 'linked', 'to', 'sauber'],
    ['alexander', 'albon', 'extends', 'deal', 'with', 'williams'],
    ['nicholas', 'latifi', 'leaves', 'formula', 'one'],
    ['nyck', 'vries', 'replaces', 'alonso', 'at', 'aston', 'martin'],
    ['nico', 'hulkenberg', 'partners', 'with', 'haas'],
    ['oscar', 'piastri', 'stays', 'at', 'mclaren'],
    ['liam', 'lawson', 'signs', 'with', 'alphatauri'],
    ['logan', 'sargeant', 'promoted', 'to', 'williams'],
    ['max', 'verstappen', 'leaves', 'red', 'bull', 'for', 'mercedes'],
    ['charles', 'leclerc', 'joins', 'red', 'bull'],
    ['sergio', 'perez', 'partners', 'with', 'aston', 'martin'],
    ['george', 'russell', 'replaces', 'hamilton', 'at', 'mercedes'],
    ['carlos', 'sainz', 'signs', 'new', 'contract', 'with', 'ferrari'],
    ['lewis', 'hamilton', 'extends', 'deal', 'with', 'mercedes'],
    ['lando', 'norris', 'switches', 'to', 'ferrari'],
    ['esteban', 'ocon', 'remains', 'at', 'alpine'],
    ['fernando', 'alonso', 'linked', 'to', 'aston', 'martin'],
    ['valtteri', 'bottas', 'signs', 'with', 'sauber'],
    ['daniel', 'ricciardo', 'commits', 'to', 'red', 'bull'],
    ['sebastian', 'vettel', 'retires', 'from', 'formula', 'one'],
    ['kevin', 'magnussen', 'remains', 'with', 'haas'],
    ['pierre', 'gasly', 'moves', 'to', 'ferrari'],
    ['lance', 'stroll', 'partners', 'with', 'alonso', 'at', 'aston', 'martin'],
    ['mick', 'schumacher', 'linked', 'to', 'mclaren'],
    ['yuki', 'tsunoda', 'promoted', 'to', 'red', 'bull'],
    ['zhou', 'guanyu', 'replaces', 'gasly', 'at', 'alpine'],
    ['alexander', 'albon', 'stays', 'at', 'williams'],
    ['nicholas', 'latifi', 'moves', 'to', 'haas'],
    ['nyck', 'vries', 'partners', 'with', 'aston', 'martin'],
    ['nico', 'hulkenberg', 'linked', 'to', 'alfa', 'romeo'],
    ['oscar', 'piastri', 'joins', 'red', 'bull'],
    ['liam', 'lawson', 'replaces', 'alonso', 'at', 'aston', 'martin'],
    ['logan', 'sargeant', 'signs', 'with', 'alphatauri'],
    ['max', 'verstappen', 'switches', 'to', 'alpine'],
    ['charles', 'leclerc', 'returns', 'to', 'ferrari'],
    ['sergio', 'perez', 'leaves', 'formula', 'one'],
    ['george', 'russell', 'remains', 'at', 'mercedes'],
    ['carlos', 'sainz', 'commits', 'to', 'ferrari'],
    ['lewis', 'hamilton', 'partners', 'with', 'verstappen', 'at', 'red', 'bull'],
    ['lando', 'norris', 'joins', 'aston', 'martin'],
    ['esteban', 'ocon', 'linked', 'to', 'alpine'],
    ['fernando', 'alonso', 'retires', 'from', 'formula', 'one'],
    ['valtteri', 'bottas', 'signs', 'with', 'haas'],
    ['daniel', 'ricciardo', 'returns', 'to', 'alpine'],
    ['sebastian', 'vettel', 'moves', 'to', 'alphatauri'],
    ['kevin', 'magnussen', 'stays', 'at', 'haas'],
    ['pierre', 'gasly', 'joins', 'ferrari'],
    ['lance', 'stroll', 'replaces', 'ocon', 'at', 'alpine'],
    ['mick', 'schumacher', 'partners', 'with', 'ferrari'],
    ['yuki', 'tsunoda', 'signs', 'with', 'alphatauri'],
    ['zhou', 'guanyu', 'remains', 'at', 'alfa', 'romeo'],
    ['alexander', 'albon', 'linked', 'to', 'red', 'bull'],
    ['nicholas', 'latifi', 'retires', 'from', 'formula', 'one'],
    ['nyck', 'vries', 'replaces', 'vettel', 'at', 'aston', 'martin'],
    ['nico', 'hulkenberg', 'commits', 'to', 'haas'],
    ['oscar', 'piastri', 'partners', 'with', 'norris', 'at', 'mclaren'],
    ['liam', 'lawson', 'linked', 'to', 'alphatauri'],
    ['logan', 'sargeant', 'moves', 'to', 'williams'],
    ['max', 'verstappen', 'remains', 'at', 'red', 'bull'],
    ['charles', 'leclerc', 'extends', 'deal', 'with', 'ferrari'],
    ['sergio', 'perez', 'switches', 'to', 'aston', 'martin'],
    ['george', 'russell', 'stays', 'at', 'mercedes'],
    ['carlos', 'sainz', 'replaces', 'hamilton', 'at', 'mercedes'],
    ['lewis', 'hamilton', 'linked', 'to', 'ferrari'],
    ['lando', 'norris', 'moves', 'to', 'ferrari'],
    ['esteban', 'ocon', 'remains', 'at', 'alpine'],
    ['fernando', 'alonso', 'retires', 'from', 'formula', 'one'],
    ['valtteri', 'bottas', 'partners', 'with', 'sauber'],
    ['daniel', 'ricciardo', 'replaces', 'verstappen', 'at', 'red', 'bull'],
    ['sebastian', 'vettel', 'joins', 'aston', 'martin'],
    ['kevin', 'magnussen', 'stays', 'with', 'haas'],
    ['pierre', 'gasly', 'leaves', 'alpine', 'for', 'red', 'bull'],
    ['lance', 'stroll', 'partners', 'with', 'alonso', 'at', 'aston', 'martin'],
    ['mick', 'schumacher', 'joins', 'haas'],
    ['yuki', 'tsunoda', 'remains', 'with', 'alphatauri'],
    ['zhou', 'guanyu', 'linked', 'to', 'sauber'],
    ['alexander', 'albon', 'replaces', 'latifi', 'at', 'williams'],
    ['nicholas', 'latifi', 'retires', 'from', 'formula', 'one'],
    ['nyck', 'vries', 'joins', 'aston', 'martin'],
    ['nico', 'hulkenberg', 'partners', 'with', 'haas'],
    ['oscar', 'piastri', 'remains', 'at', 'mclaren'],
    ['liam', 'lawson', 'joins', 'alphatauri'],
    ['logan', 'sargeant', 'commits', 'to', 'williams'],
    ['max', 'verstappen', 'linked', 'to', 'ferrari'],
    ['charles', 'leclerc', 'remains', 'with', 'ferrari'],
    ['sergio', 'perez', 'leaves', 'red', 'bull'],
    ['george', 'russell', 'partners', 'with', 'hamilton', 'at', 'mercedes'],
    ['carlos', 'sainz', 'signs', 'new', 'contract', 'with', 'ferrari'],
    ['lewis', 'hamilton', 'retires', 'from', 'formula', 'one'],
    ['lando', 'norris', 'linked', 'to', 'ferrari'],
    ['esteban', 'ocon', 'commits', 'to', 'alpine'],
    ['fernando', 'alonso', 'remains', 'at', 'aston', 'martin'],
    ['valtteri', 'bottas', 'joins', 'sauber'],
    ['daniel', 'ricciardo', 'partners', 'with', 'verstappen', 'at', 'red', 'bull'],
    ['sebastian', 'vettel', 'linked', 'to', 'haas'],
    ['kevin', 'magnussen', 'remains', 'at', 'haas'],
    ['pierre', 'gasly', 'moves', 'to', 'alpine'],
    ['lance', 'stroll', 'stays', 'with', 'aston', 'martin'],
    ['mick', 'schumacher', 'leaves', 'haas', 'for', 'alfa', 'romeo'],
    ['yuki', 'tsunoda', 'promoted', 'to', 'red', 'bull'],
    ['zhou', 'guanyu', 'partners', 'with', 'alfa', 'romeo'],
    ['alexander', 'albon', 'linked', 'to', 'alphatauri'],
    ['nicholas', 'latifi', 'leaves', 'formula', 'one'],
    ['nyck', 'vries', 'signs', 'with', 'aston', 'martin'],
    ['nico', 'hulkenberg', 'replaces', 'magnussen', 'at', 'haas'],
    ['oscar', 'piastri', 'joins', 'red', 'bull'],
    ['liam', 'lawson', 'partners', 'with', 'alphatauri'],
    ['logan', 'sargeant', 'moves', 'to', 'williams'],
    ['max', 'verstappen', 'signs', 'with', 'mclaren'],
    ['charles', 'leclerc', 'remains', 'at', 'ferrari'],
    ['sergio', 'perez', 'partners', 'with', 'hamilton', 'at', 'mercedes'],
    ['george', 'russell', 'moves', 'to', 'aston', 'martin'],
    ['carlos', 'sainz', 'extends', 'deal', 'with', 'ferrari'],
    ['lewis', 'hamilton', 'retires', 'from', 'formula', 'one'],
    ['lando', 'norris', 'linked', 'to', 'alphatauri'],
    ['esteban', 'ocon', 'stays', 'with', 'alpine'],
    ['fernando', 'alonso', 'replaces', 'stroll', 'at', 'aston', 'martin'],
    ['valtteri', 'bottas', 'partners', 'with', 'alfa', 'romeo'],
    ['daniel', 'ricciardo', 'returns', 'to', 'red', 'bull'],
    ['sebastian', 'vettel', 'linked', 'to', 'haas'],
    ['kevin', 'magnussen', 'commits', 'to', 'haas'],
    ['pierre', 'gasly', 'moves', 'to', 'ferrari'],
    ['lance', 'stroll', 'remains', 'at', 'aston', 'martin'],
    ['mick', 'schumacher', 'joins', 'alpine'],
    ['yuki', 'tsunoda', 'partners', 'with', 'verstappen', 'at', 'red', 'bull'],
    ['zhou', 'guanyu', 'linked', 'to', 'renault'],
    ['alexander', 'albon', 'switches', 'to', 'alphatauri'],
    ['nicholas', 'latifi', 'retires', 'from', 'formula', 'one'],
    ['nyck', 'vries', 'signs', 'with', 'alpine'],
    ['nico', 'hulkenberg', 'replaces', 'schumacher', 'at', 'haas'],
    ['oscar', 'piastri', 'partners', 'with', 'norris', 'at', 'mclaren'],
    ['liam', 'lawson', 'remains', 'at', 'alphatauri'],
    ['logan', 'sargeant', 'moves', 'to', 'williams'],
    ['max', 'verstappen', 'linked', 'to', 'ferrari'],
    ['charles', 'leclerc', 'stays', 'at', 'ferrari'],
    ['sergio', 'perez', 'commits', 'to', 'red', 'bull'],
    ['george', 'russell', 'signs', 'new', 'deal', 'with', 'mercedes'],
    ['carlos', 'sainz', 'partners', 'with', 'leclerc', 'at', 'ferrari'],
    ['lewis', 'hamilton', 'retires', 'from', 'formula', 'one'],
    ['lando', 'norris', 'joins', 'aston', 'martin'],
    ['esteban', 'ocon', 'moves', 'to', 'mclaren'],
    ['fernando', 'alonso', 'switches', 'to', 'alpine'],
    ['valtteri', 'bottas', 'remains', 'at', 'alfa', 'romeo'],
    ['daniel', 'ricciardo', 'returns', 'to', 'alphatauri'],
    ['sebastian', 'vettel', 'partners', 'with', 'gasly', 'at', 'alpine'],
    ['kevin', 'magnussen', 'stays', 'at', 'haas'],
    ['pierre', 'gasly', 'commits', 'to', 'alpine'],
    ['lance', 'stroll', 'remains', 'with', 'aston', 'martin'],
    ['mick', 'schumacher', 'replaces', 'bottas', 'at', 'alfa', 'romeo'],
    ['yuki', 'tsunoda', 'joins', 'red', 'bull'],
    ['zhou', 'guanyu', 'partners', 'with', 'alfa', 'romeo'],
    ['alexander', 'albon', 'switches', 'to', 'alphatauri'],
    ['nicholas', 'latifi', 'retires', 'from', 'formula', 'one'],
    ['nyck', 'vries', 'joins', 'aston', 'martin'],
    ['nico', 'hulkenberg', 'partners', 'with', 'haas'],
    ['oscar', 'piastri', 'stays', 'at', 'mclaren'],
    ['liam', 'lawson', 'commits', 'to', 'alphatauri'],
    ['logan', 'sargeant', 'remains', 'with', 'williams'],
    ['max', 'verstappen', 'linked', 'to', 'ferrari'],
    ['charles', 'leclerc', 'remains', 'at', 'ferrari'],
    ['sergio', 'perez', 'joins', 'aston', 'martin'],
    ['george', 'russell', 'partners', 'with', 'hamilton', 'at', 'mercedes'],
    ['carlos', 'sainz', 'extends', 'contract', 'with', 'ferrari'],
    ['lewis', 'hamilton', 'retires', 'from', 'formula', 'one'],
    ['lando', 'norris', 'linked', 'to', 'ferrari'],
    ['esteban', 'ocon', 'stays', 'at', 'alpine'],
    ['fernando', 'alonso', 'commits', 'to', 'aston', 'martin'],
    ['valtteri', 'bottas', 'switches', 'to', 'sauber'],
    ['daniel', 'ricciardo', 'partners', 'with', 'verstappen', 'at', 'red', 'bull'],
    ['sebastian', 'vettel', 'joins', 'haas'],
    ['kevin', 'magnussen', 'stays', 'with', 'haas'],
    ['pierre', 'gasly', 'leaves', 'alpine', 'for', 'ferrari'],
    ['lance', 'stroll', 'partners', 'with', 'alonso', 'at', 'aston', 'martin'],
    ['mick', 'schumacher', 'joins', 'haas'],
    ['yuki', 'tsunoda', 'remains', 'with', 'alphatauri'],
    ['zhou', 'guanyu', 'partners', 'with', 'sauber'],
    ['alexander', 'albon', 'replaces', 'latifi', 'at', 'williams'],
    ['nicholas', 'latifi', 'retires', 'from', 'formula', 'one'],
    ['nyck', 'vries', 'signs', 'with', 'aston', 'martin'],
    ['nico', 'hulkenberg', 'partners', 'with', 'haas'],
    ['oscar', 'piastri', 'remains', 'at', 'mclaren'],
    ['liam', 'lawson', 'joins', 'alphatauri'],
    ['logan', 'sargeant', 'moves', 'to', 'williams'],
]



In [None]:
# nlp = spacy.blank('en')
# tokenize_spacy = nlp.tokenizer

# f1_df['text'] = f1_df['text'].apply(partial(preprocessing.correct_spelling_in_text_spacy, activator=False))
# f1_df['text'] = f1_df['text'].apply(preprocessing.normalize)
# f1_df['text'] = f1_df['text'].apply(preprocessing.lemmatize)

# tokenized_texts = [list(map(lambda token: token.text, tokenize_spacy(text))) for text in normalized_texts]


# Normalize the post text in place.
# NOTE(review): this overwrites f1_df['text'], so re-running the cell applies
# preprocessing.normalize to already-normalized text -- confirm it is idempotent.
f1_df['text'] = f1_df['text'].apply(preprocessing.normalize)

from nltk.stem import PorterStemmer

nlp = spacy.load('en_core_web_sm')
# NOTE(review): `stemmer` is not used in the visible part of the notebook.
stemmer = PorterStemmer()

# Lemmatize every post: one list of lowercased lemmas per text.
# nlp.pipe streams the texts through the pipeline in batches, which yields the
# same Doc objects as calling nlp(text) per row but is considerably faster on
# a corpus of this size.
all_lemmatized_tokens = [
    [token.lemma_.lower() for token in doc]
    for doc in nlp.pipe(f1_df['text'])
]

print(all_lemmatized_tokens[:5])



In [17]:
#with display_full_dataframe():
#    display(f1_df)

In [None]:
# Define lists of drivers and teams

# Driver name tokens: every first name and surname is its own entry, in the
# order the full names are listed, because matching is done on single tokens.
drivers = [
    name_part
    for full_name in (
        'max verstappen', 'charles leclerc', 'sergio perez', 'george russell',
        'carlos sainz', 'lewis hamilton', 'lando norris', 'esteban ocon',
        'fernando alonso', 'valtteri bottas', 'daniel ricciardo', 'sebastian vettel',
        'kevin magnussen', 'pierre gasly', 'lance stroll', 'mick schumacher',
        'yuki tsunoda', 'zhou guanyu', 'alexander albon', 'nicholas latifi',
        'nyck vries', 'nico hulkenberg', 'oscar piastri', 'liam lawson',
        'logan sargeant',
    )
    for name_part in full_name.split()
]

# Team name tokens, again split into single words ('red bull' -> 'red', 'bull').
teams = [
    name_part
    for team_name in (
        'mercedes', 'ferrari', 'red bull', 'alpine', 'renault', 'mclaren',
        'aston martin', 'racing point', 'alphatauri', 'alpha tauri', 'haas',
        'alfa romeo', 'williams', 'kick sauber',
    )
    for name_part in team_name.split()
]

# Words that signal a (possible) move when they directly follow a driver token;
# both the base form and the third-person form of each verb are listed.
action_words = [
    'to',
    'go', 'goes',
    'leave', 'leaves',
    'join', 'joins',
    'sign', 'signs',
    'extend', 'extends',
    'move', 'moves',
    'replace', 'replaces',
    'return', 'returns',
    'stay', 'stays',
]



# Filter sentences that mention a team and contain a driver directly followed by an action word
def filter_sentences_by_driver_and_team(tokenized_texts, drivers, teams, action_words):
    """Keep the sentences that look like a driver/team transfer statement.

    A sentence is kept when both conditions hold:
      * at least one of its tokens is a team token (anywhere in the sentence);
      * some driver token is immediately followed by an action word.

    Args:
        tokenized_texts: Iterable of sentences, each a list of string tokens.
        drivers: Collection of driver name tokens.
        teams: Collection of team name tokens.
        action_words: Collection of tokens that count as an action.

    Returns:
        A list with the matching sentences (the original list objects), in
        their input order. Each sentence appears at most once.
    """
    # Sets give O(1) membership tests instead of scanning the lists per token.
    driver_tokens = set(drivers)
    team_tokens = set(teams)
    action_tokens = set(action_words)

    filtered_sentences = []
    for sentence in tokenized_texts:
        # Without a team token the sentence can never match; skip the scan.
        if team_tokens.isdisjoint(sentence):
            continue

        # Look at every adjacent (token, following token) pair.
        if any(
            current in driver_tokens and following in action_tokens
            for current, following in zip(sentence, sentence[1:])
        ):
            filtered_sentences.append(sentence)

    return filtered_sentences

# Apply the filter to the lemmatized posts.
filtered_sentences = filter_sentences_by_driver_and_team(all_lemmatized_tokens, drivers, teams, action_words)

# NOTE(review): this prints the unfiltered token lists, not `filtered_sentences`,
# and the training cells visible below also use `all_lemmatized_tokens` --
# confirm whether the filtered result was meant to be inspected/used instead.
print(all_lemmatized_tokens[:5])  # Check the first 5 tokenized sentences


[['discussion', 'could', 'professional', 'esport', 'driver', 'drive', 'a', 'real', 'f1', 'car', 'how', 'realistic', 'be', 'the', 'sim', 'remove'], ['question', 'concern', 'alonsos', 'future', 'remove'], ['verstappen', 'now', 'have', 'as', 'many', 'pole', 'as', 'leclerc', ' ', 'but', 'six', 'time', 'as', 'many', 'win', ' ', '2022', 'monaco', 'grand', 'prix', 'stat', 'and', 'fact'], ['perez', 'win', 'as', 'red', 'bull', 'deliver', 'race', 'strategy', 'blow', 'to', 'ferrari', ' ', 'mika', 'hkkinen', 'thought', 'on', 'the', 'monaco', 'grand', 'prix'], ['the', 'new', 'qualifying', 'since', 'i', 'think', '2021', '2021', 'be', 'way', 'too', 'short', 'for', 'most', 'track', 'imo', 'delete']]


In [19]:
# Create N-gram model

def train_ngram_model(data, n=2):
    """Train an unsmoothed (maximum-likelihood) n-gram model.

    Every sentence is padded with start tokens '<s>' and one end token '</s>'.
    The number of start tokens is n - 1 (at least one), so that the first real
    word of a sentence also has a full-length context; this matches the padding
    used by train_trigram_model_with_smoothing. For n <= 2 this is exactly one
    start token.

    Args:
        data: Iterable of sentences, each a sequence of string tokens.
        n: Order of the model (2 = bigram, 3 = trigram, ...). Must be >= 1.

    Returns:
        dict mapping each context (a tuple of n - 1 tokens) to a dict
        {next_word: probability}, where the probabilities of one context
        sum to 1.

    Raises:
        ValueError: If n is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    ngram_counts = defaultdict(Counter)
    start_padding = ['<s>'] * max(n - 1, 1)

    for sentence in data:
        padded = start_padding + list(sentence) + ['</s>']
        # Slide a window of n tokens over the padded sentence; sentences
        # shorter than the window simply contribute nothing.
        for start in range(len(padded) - n + 1):
            gram = tuple(padded[start:start + n])
            ngram_counts[gram[:-1]][gram[-1]] += 1

    # Convert counts to relative frequencies per context.
    ngram_probs = {}
    for prefix, words in ngram_counts.items():
        prefix_total = sum(words.values())
        ngram_probs[prefix] = {word: count / prefix_total for word, count in words.items()}

    return ngram_probs



# All three models are trained on the full lemmatized corpus
# (`all_lemmatized_tokens`), not on `filtered_sentences`.

# Train a bigram model
bigram_model = train_ngram_model(all_lemmatized_tokens, n=2)

# Train a trigram model
trigram_model = train_ngram_model(all_lemmatized_tokens, n=3)

# Train a quadgram (4-gram) model
quadgram_model = train_ngram_model(all_lemmatized_tokens, n=4)


In [20]:
# Create trigram model with Laplace Smoothing

def train_trigram_model_with_smoothing(data, n=3):
    """Train an n-gram model (trigram by default) with Laplace (add-one) smoothing.

    Each sentence is padded with n - 1 start tokens '<s>' and one end token
    '</s>'. For every context seen in training, the returned distribution
    covers the whole vocabulary:

        P(word | context) = (count(context, word) + 1) / (count(context) + |V|)

    so words never seen after a context still get probability
    1 / (count(context) + |V|).

    Note that the result is a dense table of (#contexts x |V|) entries, which
    grows quickly with corpus size.

    Args:
        data: Iterable of sentences, each a sequence of string tokens.
        n: Order of the model. Must be >= 1.

    Returns:
        Tuple (probabilities, vocabulary) where `probabilities` maps each
        context tuple to a dict {word: probability} and `vocabulary` is the
        set of all tokens seen, including the padding tokens that were added.

    Raises:
        ValueError: If n is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    ngram_counts = defaultdict(Counter)
    vocabulary = set()
    start_padding = ['<s>'] * (n - 1)

    for sentence in data:
        padded = start_padding + list(sentence) + ['</s>']
        vocabulary.update(padded)
        # Slide a window of n tokens over the padded sentence.
        for start in range(len(padded) - n + 1):
            prefix = tuple(padded[start:start + n - 1])
            ngram_counts[prefix][padded[start + n - 1]] += 1

    # Laplace smoothing
    vocabulary_size = len(vocabulary)
    trigram_probs = {}
    for prefix, words in ngram_counts.items():
        denominator = sum(words.values()) + vocabulary_size
        # Seen continuations first (keeps their insertion order) ...
        smoothed = {word: (count + 1) / denominator for word, count in words.items()}
        # ... then every remaining vocabulary word gets the add-one floor.
        unseen_probability = 1 / denominator
        for word in vocabulary:
            smoothed.setdefault(word, unseen_probability)
        trigram_probs[prefix] = smoothed

    return trigram_probs, vocabulary



# Train a trigram model with Laplace smoothing on the full lemmatized corpus;
# `vocabulary` is the token set returned alongside the model.
trigram_model_s, vocabulary = train_trigram_model_with_smoothing(all_lemmatized_tokens, n=3)

In [23]:
# Predict Next Word

def predict_next_word(model, input_text, n=2):
    """Return the most probable next word for the given text.

    The text is lowercased and split on whitespace; its last n - 1 tokens form
    the context that is looked up in the model.

    Args:
        model: Mapping from context tuples (n - 1 tokens) to
            {word: probability} dicts, as produced by the training functions.
        input_text: Text whose continuation should be predicted.
        n: Order of the model that `model` was trained with.

    Returns:
        The word with the highest probability for the context, or "<unk>" when
        the context is unknown or has no candidate words.
    """
    tokens = input_text.lower().split()
    context_size = n - 1
    # A plain tokens[-context_size:] breaks for context_size == 0, because
    # tokens[-0:] is the whole list; a unigram model needs the empty context.
    prefix = tuple(tokens[-context_size:]) if context_size > 0 else ()

    if prefix in model and model[prefix]:
        candidates = model[prefix]
        return max(candidates, key=candidates.get)  # Word with highest probability
    return "<unk>"  # Unknown context (or nothing to choose from)



# Example usage: most likely word after "daniel ricciardo" under the trigram model.
# NOTE(review): the models above are trained on lemmatized tokens, while the
# saved output below shows an inflected form -- the stored output may stem
# from an earlier run on different data; re-run to confirm.

input_text = "daniel ricciardo"
next_word = predict_next_word(trigram_model, input_text, n=3)
print(f"Next word: {next_word}")


Next word: returns


In [25]:
# Generate full predictions

def _most_likely_continuation(model, tokens, n):
    """Return the most probable word after `tokens`, or None if the context is unknown."""
    context_size = n - 1
    # Explicit start index: tokens[-0:] would wrongly return the whole list.
    start = max(len(tokens) - context_size, 0) if context_size > 0 else len(tokens)
    prefix = tuple(tokens[start:])
    if prefix in model and model[prefix]:
        candidates = model[prefix]
        return max(candidates, key=candidates.get)
    return None


def generate_predictions(model, seed_text, n=2, max_length=10):
    """Greedily extend a seed text with the model's most probable next words.

    The seed is lowercased and split on whitespace. Words are appended one at
    a time until the end token '</s>' is predicted, the current context is
    unknown to the model, or `max_length` words have been added.

    The context is taken from the token list itself rather than from a
    re-joined and re-split string, so predicted tokens that consist of
    whitespace stay part of the context. Generation stops at an unknown
    context instead of filling the rest of the output with "<unk>".

    Args:
        model: Mapping from context tuples to {word: probability} dicts.
        seed_text: Text to start from.
        n: Order of the model that `model` was trained with.
        max_length: Maximum number of words to append.

    Returns:
        The seed tokens plus the generated words, joined by single spaces.
    """
    tokens = seed_text.lower().split()
    for _ in range(max_length):
        next_word = _most_likely_continuation(model, tokens, n)
        if next_word is None or next_word == "</s>":
            break
        tokens.append(next_word)
    return " ".join(tokens)



# Generate a prediction: greedily continue the seed with the trigram model.

seed_text = "daniel ricciardo"
prediction = generate_predictions(trigram_model, seed_text, n=3)
print(f"Generated prediction: {prediction}")


Generated prediction: daniel ricciardo returns to red bull


In [27]:
def predict_next_team(model, teams, input_text, n=3):
    """
    Predict the next word from the model, forcing it to be a team name.

    The input is lowercased and split on whitespace; its last (n - 1) tokens
    form the context. Only candidate words that are contained in `teams` are
    considered.

    Args:
    - model: Trained N-gram model with probabilities
      (context tuple -> {word: probability}).
    - teams: Collection of valid team name tokens (list, set, ...).
    - input_text: Input text string (e.g., "ricciardo joins").
    - n: N-gram size (default: 3).

    Returns:
    - Predicted team name or "<unk>" if the context is unknown or no
      candidate word is a team.
    """
    tokens = input_text.lower().split()
    context_size = n - 1
    # tokens[-0:] would be the whole list, so n == 1 needs the empty context.
    prefix = tuple(tokens[-context_size:]) if context_size > 0 else ()

    if prefix in model:
        # One set lookup per candidate instead of scanning `teams` each time;
        # smoothed models carry the whole vocabulary per context.
        team_tokens = frozenset(teams)
        team_predictions = {
            word: prob for word, prob in model[prefix].items() if word in team_tokens
        }
        if team_predictions:
            return max(team_predictions, key=team_predictions.get)  # Team with highest probability
    return "<unk>"  # Return "<unk>" if no team matches



# Example: most likely team after "ricciardo to" under the unsmoothed trigram model.
# The saved output below is "<unk>", i.e. the context was not in the model or
# none of its continuations was a team token.
input_text = "ricciardo to"
next_word = predict_next_team(trigram_model, teams, input_text, n=3)
print(f"Next predicted team: {next_word}")


Next predicted team: <unk>


In [34]:
def predict_top_n_for_input(model, input_text, n=5, ngram_size=3):
    """Return the `n` most probable next words for the given text as a DataFrame.

    The input is lowercased (the sibling predictors do the same) and split on
    whitespace; its last (ngram_size - 1) tokens form the context.

    Args:
        model: Mapping from context tuples to {word: probability} dicts.
        input_text: Text whose continuation should be predicted.
        n: Number of candidates to return.
        ngram_size: Order of the model that `model` was trained with.

    Returns:
        pandas DataFrame with columns "Word" and "Probability", sorted by
        descending probability. If the context is unknown, a message is
        printed and an empty DataFrame with the same columns is returned.
    """
    tokens = input_text.lower().split()
    context_size = ngram_size - 1
    # tokens[-0:] would be the whole list, so ngram_size == 1 needs the empty context.
    prefix = tuple(tokens[-context_size:]) if context_size > 0 else ()

    if prefix not in model:
        print("Prefix not found in the model. No predictions available.")
        return pd.DataFrame(columns=["Word", "Probability"])

    # Sort predictions by probability and keep the top N
    top_predictions = sorted(model[prefix].items(), key=lambda item: item[1], reverse=True)[:n]
    return pd.DataFrame(top_predictions, columns=["Word", "Probability"])



# Example usage: top 5 continuations of "verstappen to" under the smoothed trigram model.
# The saved output below reports that this context was not found in the model.
input_text = "verstappen to"
top_predictions = predict_top_n_for_input(trigram_model_s, input_text, n=5)
print(top_predictions)

Prefix not found in the model. No predictions available.
Empty DataFrame
Columns: [Word, Probability]
Index: []


In [50]:
# Module-level registry filled by sort_to_team: team name -> set of driver names.
team_summary = {}

def sort_to_team(team, driver):
    """Record `driver` under `team` in the global `team_summary`.

    A new set is created the first time a team is seen; adding the same
    driver to a team twice has no further effect.
    """
    team_summary.setdefault(team, set()).add(driver)


def print_team_summary(summary):
    """
    Print each team followed by its drivers, one team per line.

    Parameters:
    - summary: Mapping from team name to a collection of driver-name strings.

    Teams are printed in the mapping's iteration order. Drivers are sorted
    alphabetically so the output is identical across kernel restarts; joining
    a set directly would follow its arbitrary iteration order.
    """
    print("Team Summary:")
    for team, drivers in summary.items():
        print(f"  {team}: {', '.join(sorted(drivers))}")

In [51]:
# Drivers to query, in the order their predictions are printed.
DRIVER_NAMES = [
    "max verstappen",
    "charles leclerc",
    "sergio perez",
    "george russell",
    "carlos sainz",
    "lewis hamilton",
    "lando norris",
    "esteban ocon",
    "fernando alonso",
    "valtteri bottas",
    "daniel ricciardo",
    "sebastian vettel",
    "kevin magnussen",
    "pierre gasly",
    "lance stroll",
    "mick schumacher",
    "yuki tsunoda",
    "zhou guanyu",
    "alexander albon",
    "nicholas latifi",
    "nyck de vries",
    "nico hulkenberg",
    "oscar piastri",
    "liam lawson",
    "logan sargeant",
]

# For every driver, build the prompt "<driver> to", ask the model for the most
# likely team to follow it, report the result and file the driver under it.
# With n=3 only the last two tokens of the prompt (e.g. "verstappen to") form
# the lookup prefix.
for driver_name in DRIVER_NAMES:
    input_text = f"{driver_name} to"
    next_word = predict_next_team(trigram_model_s, teams, input_text, n=3)
    print(f"Next predicted team for {driver_name}: {next_word}")
    sort_to_team(next_word, driver_name)


Next predicted team for max verstappen: <unk>
Next predicted team for charles leclerc: <unk>
Next predicted team for sergio perez: <unk>
Next predicted team for george russell: <unk>
Next predicted team for carlos sainz: <unk>
Next predicted team for lewis hamilton: <unk>
Next predicted team for lando norris: <unk>
Next predicted team for esteban ocon: <unk>
Next predicted team for fernando alonso: <unk>
Next predicted team for valtteri bottas: <unk>
Next predicted team for daniel ricciardo: <unk>
Next predicted team for sebastian vettel: <unk>
Next predicted team for kevin magnussen: <unk>
Next predicted team for pierre gasly: <unk>
Next predicted team for lance stroll: <unk>
Next predicted team for mick schumacher: <unk>
Next predicted team for yuki tsunoda: <unk>
Next predicted team for zhou guanyu: <unk>
Next predicted team for alexander albon: <unk>
Next predicted team for nicholas latifi: <unk>
Next predicted team for nyck de vries: <unk>
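Next predicted team for nico hulkenberg: <unk>
Next predicted team for oscar piastri: <unk>
Next predicted team for liam lawson: <unk>
Next predicted team for logan sargeant: <unk>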

In [52]:
print_team_summary(team_summary)

Team Summary:
  <unk>: zhou guanyu, esteban ocon, kevin magnussen, charles leclerc, liam lawson, pierre gasly, lance stroll, max verstappen, logan sargeant, daniel ricciardo, nico hulkenberg, sergio perez, nicholas latifi, george russell, lando norris, carlos sainz, lewis hamilton, nyck de vries, valtteri bottas, mick schumacher, alexander albon, oscar piastri, sebastian vettel, fernando alonso, yuki tsunoda
