In [3]:
from collections.abc import Generator, Callable
from pathlib import Path
import typing
from typing import Any, TypeAlias
import pandas as pd
import numpy as np
import datetime as dt
import re
from functools import partial, reduce
from tqdm import tqdm
from IPython.display import (
    display, # type: ignore[reportUnknownVariableType]
    Markdown,
)

from config.fastf1 import fastf1
from config import config
from src.data.loader import stream_ndjson, load_submissions_df, load_comments_df
from src.data.preprocessing import concatenate_submissions_and_comments
import src.data.constants as dataset_constants

from src.utils import (
    temporary_pandas_options,
    display_full_dataframe,
    hide_index,
    compose,
)
from src import utils
utils.set_random_seeds()

import logging
logging.getLogger('fastf1').setLevel(logging.WARNING)

In [4]:
# Bind limit=5000 onto stream_ndjson so the r/formula1 loaders below use a
# limited streamer (presumably a cap on the number of records read — confirm
# against src.data.loader.stream_ndjson).
f1_ndjson_streamer = partial(stream_ndjson, limit=5000)

# r/formula1: both loaders are handed the limited streamer.
f1_submissions_df = load_submissions_df(dataset_constants.RawFile.FORMULA1_SUBMISSIONS, f1_ndjson_streamer)
f1_comments_df = load_comments_df(dataset_constants.RawFile.FORMULA1_COMMENTS, f1_ndjson_streamer)

# r/formula1point5: the loaders are called without a streamer argument, so they
# use whatever default the loader functions define.
f15_submissions_df = load_submissions_df(dataset_constants.RawFile.FORMULA1POINT5_SUBMISSIONS)
f15_comments_df = load_comments_df(dataset_constants.RawFile.FORMULA1POINT5_COMMENTS)

In [None]:
# Number of leading rows to preview from each frame.
n = 4

# Render a Markdown heading followed by the first n rows of each loaded frame.
# NOTE(review): display_full_dataframe presumably lifts pandas' display
# truncation for the duration of the block — confirm in src.utils.
with display_full_dataframe():
    display(Markdown('### r/formula1 submissions:'), f1_submissions_df.head(n))
    display(Markdown('### r/formula1 comments:'), f1_comments_df.head(n))
    display(Markdown('### r/formula1point5 submissions:'), f15_submissions_df.head(n))
    display(Markdown('### r/formula1point5 comments:'), f15_comments_df.head(n))

In [None]:
# Combine each subreddit's submissions and comments into a single frame via the
# project helper (the resulting schema is defined in src.data.preprocessing).
f1_df = concatenate_submissions_and_comments(f1_submissions_df, f1_comments_df)
f15_df = concatenate_submissions_and_comments(f15_submissions_df, f15_comments_df)

# Number of leading rows to preview; this rebinds the module-level `n`.
n = 3

# Show the first n rows of each combined frame under a Markdown heading.
with display_full_dataframe():
    display(Markdown('### r/formula1 posts:'), f1_df.head(n))
    display(Markdown('### r/formula1point5 posts:'), f15_df.head(n))

In [7]:
# TODO:

In [8]:
import nltk
from nltk.util import ngrams
from collections import defaultdict, Counter
import math

In [9]:
# Step 1: Prepare Data
# `processed_data` is expected to be a list of preprocessed sentences, each one
# a list of tokens (tokenized, lemmatized, etc.). The three entries below are
# hard-coded placeholder examples; they are not derived from the DataFrames
# loaded earlier in the notebook.
processed_data = [
    ['ricciardo', 'to', 'red', 'bull'],
    ['hamilton', 'to', 'stay', 'mercedes'],
    ['alonso', 'to', 'aston', 'martin'],
    # Add more sentences from your dataset
]

# NOTE(review): `preprocessed_data` is not referenced in any of the visible
# cells — confirm whether it is still needed.
preprocessed_data = []

In [10]:
# Step 2: Create N-grams and Train Model
def train_ngram_model(data, n=2):
    """Estimate maximum-likelihood next-word probabilities from tokenized sentences.

    Every sentence is wrapped in one ``<s>`` start token and one ``</s>`` end
    token, split into n-grams, and each n-gram is counted as
    ``(first n-1 tokens) -> last token``. Counts are then normalised per prefix.

    Args:
        data: Iterable of tokenized sentences. Each sentence may be any
            iterable of tokens (list, tuple, ...). Raw ``str``/``bytes`` values
            are rejected, because iterating them would silently yield
            characters instead of words.
        n: Order of the n-gram model (2 = bigram).

    Returns:
        A dict mapping each prefix tuple of ``n - 1`` tokens to a dict of
        ``{next_word: relative_frequency}``; the values for one prefix sum to 1.

    Raises:
        TypeError: If a sentence is a bare ``str`` or ``bytes`` object.
    """
    ngram_counts = defaultdict(Counter)

    for sentence in data:
        if isinstance(sentence, (str, bytes)):
            raise TypeError(
                "each sentence must be a sequence of tokens, not a raw string"
            )
        # Build a new padded list instead of rebinding the loop variable;
        # unpacking also lets tuples and other token iterables through.
        padded = ['<s>', *sentence, '</s>']
        for gram in ngrams(padded, n):
            # `ngrams` yields tuples, so the slice is already a hashable prefix.
            ngram_counts[gram[:-1]][gram[-1]] += 1

    # Convert counts to probabilities. The per-prefix total is simply the sum
    # of that prefix's counter, so no separate tally is needed.
    ngram_probs = {}
    for prefix, words in ngram_counts.items():
        total = sum(words.values())
        ngram_probs[prefix] = {word: count / total for word, count in words.items()}

    return ngram_probs

# Train a bigram model (n=2) on the placeholder sentences in `processed_data`.
bigram_model = train_ngram_model(processed_data, n=2)

In [None]:
# Step 3: Predict Next Word
def predict_next_word(model, input_text, n=2):
    """Return the most probable next word given the tail of ``input_text``.

    Args:
        model: Mapping from prefix tuples of ``n - 1`` tokens to a mapping of
            ``{next_word: probability}``.
        input_text: Raw text; it is lower-cased and split on whitespace.
        n: Order of the n-gram model the ``model`` was built with.

    Returns:
        The candidate with the highest probability for the prefix formed by
        the last ``n - 1`` tokens (the first-inserted candidate wins ties), or
        ``"<unk>"`` when the prefix is not in the model or has no candidates.
    """
    tokens = input_text.lower().split()
    context_size = max(n - 1, 0)
    # Do not write this as `tokens[-context_size:]`: for n=1 that becomes
    # `tokens[-0:]`, i.e. the WHOLE list, so a unigram model (keyed by the
    # empty prefix) would never match. Use an explicit empty prefix instead.
    prefix = tuple(tokens[-context_size:]) if context_size else ()

    candidates = model.get(prefix)
    if not candidates:
        # Unseen prefix, or a prefix with an empty candidate mapping.
        return "<unk>"
    return max(candidates, key=candidates.get)  # Word with highest probability

# Example usage: with n=2 only the last token ("to") is used as the prefix, so
# this asks the bigram model for the most likely word following "to".
input_text = "ricciardo to"
next_word = predict_next_word(bigram_model, input_text, n=2)
print(f"Next word: {next_word}")

# Bonus: Generate full predictions
def generate_predictions(model, seed_text, n=2, max_length=10):
    """Greedily extend ``seed_text`` one predicted word at a time.

    Args:
        model: N-gram model in the form accepted by ``predict_next_word``.
        seed_text: Starting text; it is lower-cased and split on whitespace.
        n: Order of the n-gram model.
        max_length: Maximum number of words to append to the seed.

    Returns:
        The seed tokens followed by the generated words, joined by single
        spaces. Generation stops early at the end-of-sentence token ``</s>``
        or when the model has no continuation (``<unk>``); neither sentinel is
        included in the returned text.
    """
    tokens = seed_text.lower().split()
    for _ in range(max_length):
        next_word = predict_next_word(model, " ".join(tokens), n=n)
        # "</s>" marks the end of a sentence. "<unk>" means the current context
        # was never seen; appending it would only create another unseen
        # context and pad the output with "<unk>" until max_length is reached.
        if next_word in ("</s>", "<unk>"):
            break
        tokens.append(next_word)
    return " ".join(tokens)

# Generate a prediction: greedily extend the seed with the bigram model, using
# generate_predictions' default max_length of 10 added words.
seed_text = "ricciardo to"
prediction = generate_predictions(bigram_model, seed_text, n=2)
print(f"Generated prediction: {prediction}")