In [None]:
# --- Standard library and third-party imports ---
from pprint import pprint
from collections.abc import Generator, Callable
from pathlib import Path
import typing
from typing import Any, TypeAlias
import pandas as pd
import numpy as np
import datetime as dt
import re
from functools import partial, reduce
from tqdm import tqdm
from IPython.display import (
    display, # type: ignore[reportUnknownVariableType]
    Markdown,
)

# --- NLP tooling ---
import spacy
from nltk.tokenize import word_tokenize as tokenize_nltk
import nltk
# Side effect at import time: fetches the 'punkt_tab' NLTK resource
# (presumably the tokenizer data word_tokenize relies on -- confirm for the
# installed NLTK version).
nltk.download('punkt_tab')

# --- Project-local modules ---
from config.fastf1 import fastf1
from config import config
from src.data.loader import stream_ndjson, load_submissions_df, load_comments_df
from src.data import preprocessing
import src.data.constants as dataset_constants

from src.utils import (
    temporary_pandas_options,
    display_full_dataframe,
    hide_index,
    compose,
)
from src import utils
# Project helper called once at start-up; presumably seeds the random number
# generators for reproducibility -- confirm in src.utils.
utils.set_random_seeds()

import logging
# Only show fastf1 log records at WARNING level or above.
logging.getLogger('fastf1').setLevel(logging.WARNING)



In [282]:
# Streamer used for the r/formula1 files: stream_ndjson with limit=5000
# pre-bound (presumably a cap on the records read -- confirm in
# src.data.loader).
f1_ndjson_streamer = partial(stream_ndjson, limit=5000)

# r/formula1: both files are read through the limited streamer.
f1_submissions_df = load_submissions_df(
    dataset_constants.RawFile.FORMULA1_SUBMISSIONS,
    f1_ndjson_streamer,
)
f1_comments_df = load_comments_df(
    dataset_constants.RawFile.FORMULA1_COMMENTS,
    f1_ndjson_streamer,
)

# r/formula1point5: no streamer is passed, so the loaders' default applies.
f15_submissions_df = load_submissions_df(
    dataset_constants.RawFile.FORMULA1POINT5_SUBMISSIONS,
)
f15_comments_df = load_comments_df(
    dataset_constants.RawFile.FORMULA1POINT5_COMMENTS,
)

In [None]:
# Number of rows shown per frame.
n = 4

# Render a Markdown heading followed by the first n rows of each raw frame,
# in the order: r/formula1 submissions, comments, then r/formula1point5.
with display_full_dataframe():
    for heading, frame in (
        ('### r/formula1 submissions:', f1_submissions_df),
        ('### r/formula1 comments:', f1_comments_df),
        ('### r/formula1point5 submissions:', f15_submissions_df),
        ('### r/formula1point5 comments:', f15_comments_df),
    ):
        display(Markdown(heading), frame.head(n))

In [None]:
# Merge each subreddit's submissions and comments into one frame of posts
# using the project's preprocessing helper.
f1_df = preprocessing.concatenate_submissions_and_comments(
    f1_submissions_df,
    f1_comments_df,
)
f15_df = preprocessing.concatenate_submissions_and_comments(
    f15_submissions_df,
    f15_comments_df,
)

# Number of rows shown per frame (rebinds the notebook-level n).
n = 3

with display_full_dataframe():
    for heading, frame in (
        ('### r/formula1 posts:', f1_df),
        ('### r/formula1point5 posts:', f15_df),
    ):
        display(Markdown(heading), frame.head(n))

In [285]:
# Imports

import nltk
from nltk.util import ngrams
from collections import defaultdict, Counter
import math

In [286]:
# Prepare Data
#
# Small hand-written corpus of already-tokenized, lower-cased headlines.
# Each inner list is one sentence; handy for trying the n-gram helpers
# without loading the Reddit data.
processed_testdata = [
    headline.split()
    for headline in (
        'ricciardo to red bull',
        'hamilton to stay mercedes',
        'alonso to aston martin',
        'max verstappen to ferrari',
        'max verstappen stay red bull',
        'max verstappen stay by red bull',
    )
]

In [287]:
# Blank English spaCy pipeline; only its tokenizer is exposed. It is kept as
# an alternative to the NLTK tokenizer used below.
nlp = spacy.blank('en')
tokenize_spacy = nlp.tokenizer

# Normalize every post's text with the project's preprocessing helper.
normalized_texts = list(
    f1_df['text'].apply(preprocessing.normalize)
)

# One token list per post, produced by NLTK's word tokenizer.
# spaCy alternative:
# tokenized_texts = [[token.text for token in tokenize_spacy(text)] for text in normalized_texts]
tokenized_texts = list(map(tokenize_nltk, normalized_texts))

In [None]:
# Define lists of drivers and teams

# Flat list of lower-cased driver name tokens: first name, then last name,
# for each driver in turn. Sentences are matched token by token, so a common
# word such as 'max' or 'george' on its own counts as a driver mention.
drivers = [
    name_part
    for full_name in (
        'max verstappen',
        'charles leclerc',
        'sergio perez',
        'george russell',
        'carlos sainz',
        'lewis hamilton',
        'lando norris',
        'esteban ocon',
        'fernando alonso',
        'valtteri bottas',
        'daniel ricciardo',
        'sebastian vettel',
        'kevin magnussen',
        'pierre gasly',
        'lance stroll',
        'mick schumacher',
        'yuki tsunoda',
        'zhou guanyu',
        'alexander albon',
        'nicholas latifi',
        'nyck vries',
        'nico hulkenberg',
        'oscar piastri',
        'liam lawson',
        'logan sargeant',
    )
    for name_part in full_name.split()
]

# Flat list of lower-cased team name tokens. Multi-word names are split into
# their individual words because sentences are matched token by token, so a
# generic word such as 'red', 'point' or 'racing' counts as a team mention.
teams = [
    name_part
    for team_name in (
        'mercedes',
        'ferrari',
        'red bull',
        'alpine',
        'renault',
        'mclaren',
        'aston martin',
        'racing point',
        'alphatauri',
        'alpha tauri',
        'haas',
        'alfa romeo',
        'williams',
        'kick sauber',
    )
    for name_part in team_name.split()
]

# Verbs that signal a driver move, each listed as base form followed by its
# third-person singular form.
action_words = (
    'go goes '
    'leave leaves '
    'join joins '
    'sign signs '
    'extend extends '
    'move moves '
    'replace replaces '
    'return returns '
    'stay stays'
).split()

# Filter sentences containing a driver, a team and an action word
def filter_sentences_by_driver_and_team(tokenized_texts, drivers, teams, actions=None):
    """Keep the sentences that mention a driver, a team and an action word.

    Matching is done on whole tokens: a sentence qualifies when at least one
    of its tokens appears in each of the three word collections.

    Args:
        tokenized_texts: Iterable of sentences, each an iterable of hashable
            tokens (strings in this notebook).
        drivers: Iterable of driver name tokens.
        teams: Iterable of team name tokens.
        actions: Optional iterable of action-word tokens. When omitted, the
            notebook-level ``action_words`` list is used, which is what the
            function always did before this parameter existed.

    Returns:
        A list with the qualifying sentences, in their original order. The
        sentence objects themselves are returned, not copies.
    """
    if actions is None:
        actions = action_words

    # Sets give constant-time membership tests instead of scanning every
    # word list once per candidate word for every sentence.
    driver_tokens = set(drivers)
    team_tokens = set(teams)
    action_tokens = set(actions)

    return [
        sentence
        for sentence in tokenized_texts
        if not (
            driver_tokens.isdisjoint(sentence)
            or team_tokens.isdisjoint(sentence)
            or action_tokens.isdisjoint(sentence)
        )
    ]


# Apply the filter
filtered_sentences = filter_sentences_by_driver_and_team(tokenized_texts, drivers, teams)

# Show the second matching sentence as a sanity check. Guarded so that a
# small or unlucky sample does not abort the notebook with an IndexError.
if len(filtered_sentences) > 1:
    print(filtered_sentences[1])
else:
    print(f"Only {len(filtered_sentences)} sentence(s) matched the filter: {filtered_sentences}")


In [289]:
# Create N-gram model

def train_ngram_model(data, n=2):
    """Estimate an unsmoothed (maximum-likelihood) n-gram model.

    Every sentence is padded with ``n - 1`` ``'<s>'`` start tokens and one
    ``'</s>'`` end token, the same padding the smoothed trainer in this
    notebook uses. Compared with padding a single start token this

    * leaves bigram models (``n=2``) unchanged,
    * adds sentence-initial contexts such as ``('<s>', '<s>')`` for n >= 3
      without altering any other context's distribution, and
    * no longer discards sentences shorter than ``n - 2`` tokens.

    For ``n=1`` no start token is added, so ``'<s>'`` is not counted as a
    word of the unigram distribution.

    Args:
        data: Iterable of sentences, each a sequence of tokens.
        n: Order of the model (2 = bigram, 3 = trigram, ...). Must be >= 1.

    Returns:
        ``{prefix: {next_word: probability}}`` where ``prefix`` is a tuple of
        ``n - 1`` tokens and each inner dict sums to 1.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    ngram_counts = defaultdict(Counter)

    for sentence in data:
        # list(...) lets callers pass tuples or other sequences as sentences.
        padded = ['<s>'] * (n - 1) + list(sentence) + ['</s>']
        for gram in ngrams(padded, n):
            # Everything but the last token is the context; the last token
            # is the word being predicted.
            ngram_counts[gram[:-1]][gram[-1]] += 1

    # Convert counts to relative frequencies per context.
    ngram_probs = {}
    for prefix, followers in ngram_counts.items():
        total = sum(followers.values())
        ngram_probs[prefix] = {
            word: count / total for word, count in followers.items()
        }

    return ngram_probs



# Train bigram (n=2), trigram (n=3) and quadgram (n=4) models on the same
# filtered sentences, in that order.
bigram_model, trigram_model, quadgram_model = (
    train_ngram_model(filtered_sentences, n=order)
    for order in (2, 3, 4)
)


In [None]:
# Create trigram model with Laplace Smoothing

def train_trigram_model_with_smoothing(data, n=3):
    """Estimate an n-gram model with add-one (Laplace) smoothing.

    Each sentence is padded with ``n - 1`` ``'<s>'`` tokens in front and one
    ``'</s>'`` token at the end. The vocabulary is every token of the padded
    sentences, so it includes both padding markers.

    Args:
        data: Iterable of sentences, each a list of tokens.
        n: Order of the model; defaults to 3 (trigram).

    Returns:
        A ``(probabilities, vocabulary)`` tuple. ``probabilities`` maps each
        observed ``(n - 1)``-token prefix to a dict holding a smoothed
        probability for every vocabulary word; ``vocabulary`` is the set of
        tokens seen. Prefixes that never occurred in ``data`` are absent.
    """
    follower_counts = defaultdict(Counter)
    vocabulary = set()
    start_padding = ['<s>'] * (n - 1)

    for sentence in data:
        padded = start_padding + sentence + ['</s>']
        vocabulary.update(padded)
        for gram in ngrams(padded, n):
            follower_counts[tuple(gram[:-1])][gram[-1]] += 1

    vocabulary_size = len(vocabulary)
    trigram_probs = {}
    for prefix, counts in follower_counts.items():
        # Add-one smoothing: every vocabulary word gets one extra count.
        denominator = sum(counts.values()) + vocabulary_size
        smoothed = {
            word: (count + 1) / denominator for word, count in counts.items()
        }
        # Words never seen after this prefix receive the floor probability.
        # They are inserted after the observed words.
        for word in vocabulary:
            if word not in smoothed:
                smoothed[word] = 1 / denominator
        trigram_probs[prefix] = smoothed

    return trigram_probs, vocabulary



# Train a trigram model with Laplace Smoothing.
# The trainer returns (probabilities, vocabulary); unpack so that
# trigram_model_s is the prefix -> distribution mapping the prediction
# helpers expect. Storing the whole tuple made every lookup miss and yield
# "<unk>".
trigram_model_s, trigram_vocabulary = train_trigram_model_with_smoothing(filtered_sentences, n=3)

In [None]:
# Predict Next Word

def predict_next_word(model, input_text, n=2):
    """Return the most probable next word after ``input_text``.

    The text is lower-cased and split on whitespace; its last ``n - 1``
    tokens form the lookup context. If the text has fewer than ``n - 1``
    tokens, the context is left-padded with ``'<s>'`` so sentence-initial
    contexts can be looked up. For ``n=1`` the context is the empty tuple.

    Args:
        model: Mapping ``{prefix_tuple: {word: probability}}``.
        input_text: Text to continue.
        n: Order of the model that ``model`` was trained with. Must be >= 1.

    Returns:
        The highest-probability word for the context (on ties, the first one
        in the distribution's iteration order), or ``"<unk>"`` when the
        context is not in the model or has no candidates.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    context_size = n - 1
    tokens = input_text.lower().split()

    # tokens[-0:] would select the whole list, so the unigram case needs an
    # explicit empty context.
    context = tokens[-context_size:] if context_size else []
    # Left-pad short inputs with start tokens.
    context = ['<s>'] * (context_size - len(context)) + context
    prefix = tuple(context)

    if prefix not in model:
        return "<unk>"  # Unknown context

    candidates = model[prefix]
    if not candidates:
        return "<unk>"  # Context known but without any continuation
    return max(candidates, key=candidates.get)



# Example usage: one (model, input text, model order) triple per prediction,
# printed in the listed order.
for ngram_model, input_text, order in (
    (bigram_model, "daniel ricciardo", 2),
    (bigram_model, "lewis hamilton", 2),
    (trigram_model, "daniel ricciardo", 3),
    (trigram_model, "lewis hamilton", 3),
    (trigram_model, "max verstappen", 3),
    (quadgram_model, "max verstappen stays", 4),
):
    next_word = predict_next_word(ngram_model, input_text, n=order)
    print(f"Next word: {next_word}")


In [None]:
# Generate full predictions

def generate_predictions(model, seed_text, n=2, max_length=10):
    """Greedily extend ``seed_text`` one word at a time.

    At each step the most probable next word is requested from
    ``predict_next_word`` and appended. Generation stops when

    * the end-of-sentence marker ``"</s>"`` is predicted,
    * the model has no continuation for the current context (``"<unk>"``), or
    * ``max_length`` words have been appended.

    Neither marker is included in the result. Stopping on ``"<unk>"`` avoids
    filling the rest of the output with repeated ``<unk>`` tokens, because an
    unknown context cannot lead back to a known one.

    Args:
        model: Mapping ``{prefix_tuple: {word: probability}}``.
        seed_text: Text to start from; it is lower-cased and whitespace-split.
        n: Order of the model that ``model`` was trained with.
        max_length: Maximum number of words to append.

    Returns:
        The seed tokens plus the generated words, joined by single spaces.
    """
    tokens = seed_text.lower().split()
    for _ in range(max_length):
        next_word = predict_next_word(model, " ".join(tokens), n=n)
        if next_word in ("</s>", "<unk>"):
            break
        tokens.append(next_word)
    return " ".join(tokens)



# Generate a prediction for each (model, seed text, model order) triple,
# printed in the listed order.
for ngram_model, seed_text, order in (
    (bigram_model, "daniel ricciardo", 2),
    (bigram_model, "lewis hamilton", 2),
    (trigram_model, "daniel ricciardo", 3),
    (trigram_model, "lewis hamilton", 3),
    (trigram_model, "max verstappen", 3),
    (trigram_model, "sergio perez", 3),
    (trigram_model_s, "perez to", 3),
):
    prediction = generate_predictions(ngram_model, seed_text, n=order)
    print(f"Generated prediction: {prediction}")
