In [None]:
from collections.abc import Generator, Callable
from pathlib import Path
import typing
from typing import Any, TypeAlias
import pandas as pd
import numpy as np
import datetime as dt
import re
from functools import partial, reduce
from tqdm import tqdm
from IPython.display import (
    display, # type: ignore[reportUnknownVariableType]
    Markdown,
)

import importlib

from config.fastf1 import fastf1
from config import config
# The project modules are reloaded right after import so that edits made to them on
# disk are picked up without restarting the kernel. The `from ... import` lines come
# after the matching reload so the names bind to the freshly loaded objects.
import src.data.constants as dataset_constants
importlib.reload(dataset_constants);
import src.data.loader
importlib.reload(src.data.loader);
from src.data.loader import stream_ndjson, load_submissions_df, load_comments_df
from src.data.preprocessing import concatenate_submissions_and_comments

from src.utils import (
    temporary_pandas_options,
    display_full_dataframe,
    hide_index,
    compose,
)
from src import utils
# Presumably seeds the random number generators used in this project — see
# src.utils.set_random_seeds for which libraries are covered.
utils.set_random_seeds()
# Device handle the sentiment model is moved to further down (`model.to(DEVICE)`);
# which device is picked is decided by src.utils.get_device.
DEVICE = utils.get_device()

import logging
# Only let fastf1 log records of level WARNING and above through.
logging.getLogger('fastf1').setLevel(logging.WARNING)

# Load data and find submissions related to steward decisions

In [7]:
# Columns requested on top of the project defaults: the submission fields are used
# for filtering and display, `link_id` ties a comment back to its submission.
submission_columns = dataset_constants.DEFAULT_SUBMISSION_COLUMNS | {'permalink', 'post_hint', 'link_flair_text'}
comment_columns = dataset_constants.DEFAULT_COMMENT_COLUMNS | {'link_id'}

f1_submissions_df = load_submissions_df(dataset_constants.RawFile.FORMULA1_SUBMISSIONS, columns=submission_columns)
f1_comments_df = load_comments_df(dataset_constants.RawFile.FORMULA1_COMMENTS, columns=comment_columns)

In [44]:
# TODO: testing purposes
# Turn the stored permalinks into full reddit links. Any prefix that is already
# there is stripped first, so re-running this cell does not stack
# 'www.reddit.com' in front of the value a second time.
REDDIT_HOST = 'www.reddit.com'
f1_submissions_df['permalink'] = REDDIT_HOST + f1_submissions_df['permalink'].str.removeprefix(REDDIT_HOST)

In [None]:
# Keywords whose presence in a title marks a submission as possibly being about a
# stewards' decision.
steward_decision_related_words = {
    'penalty', 'steward', 'decision', 'appeal', 'review', 'ruling', 'investigation', 'regulation',
    'seconds', 'sec',
    'collision', 'crash', 'incident', 'overtake', 'virtual safety car', 'blocking', 'brake test', 'contact',
    'red flag', 'yellow flag',
    'controversial', 'rigged', 'corrupt', 'bias', 'protest', 'FIA', 'document', 'infringement'}

# One whole-word alternative per keyword. Keywords are escaped so a future entry
# containing regex metacharacters is still matched literally, and sorted because a
# set has no stable iteration order (the pattern text is now the same on every run).
words_regex = '|'.join(
    fr'\b{re.escape(word)}\b'
    for word in sorted(steward_decision_related_words, key=lambda word: (-len(word), word))
)
steward_decision_pattern = re.compile(words_regex, flags=re.IGNORECASE)

relevant_flairs = {':post-discussion: Discussion', ':post-technical: Technical', ':post-news: News'}

# `na=False`: a missing title counts as "no keyword" instead of raising.
has_related_words = f1_submissions_df['title'].str.contains(
    words_regex, flags=re.IGNORECASE, regex=True, na=False
)
has_relevant_flairs = f1_submissions_df['link_flair_text'].isin(relevant_flairs)
is_image_post = f1_submissions_df['post_hint'] == 'image'

steward_decision_submissions_df = f1_submissions_df[has_related_words & has_relevant_flairs & is_image_post].copy()

with display_full_dataframe():
    print(len(steward_decision_submissions_df))
    display(steward_decision_submissions_df.head(2))
    display(steward_decision_submissions_df['link_flair_text'].unique())

In [None]:
# Wider flair selection, used to see which keyword-matching image posts the narrow
# `relevant_flairs` selection leaves out.
relevant_flairs_extended = {
    ':post-technical: Technical',
    ':post-discussion: Discussion',
    ':post-timed: Timed',
    ':post-news: News',
    ':post-misc: Misc',
    ':post-photo: Photo',
    ':post-statistics: Statistics',
    ':post-video: Video',
    ':post-daily-discussion: Daily Discussion',
    ':post-formula-1: /r/Formula1',
    ':post-photo: Photo /r/all',
    ':post-news: News /r/all',
    ':post-grand-prix: Free Practice',
    ':post-grand-prix: Qualifying',
    ':post-highlight: Highlight',
    ':post-post-session: Post-Qualifying',
    ':post-pre-session: Pre-Race',
    ':post-grand-prix: Race',
    ':post-highlight: Highlight /r/all',
    ':post-post-session: Post-Race',
    ':post-post-session: Day after Debrief',
    ':post-featured: Featured',
    ':post-discussion: Discussion /r/all',
    ':post-grand-prix: Sprint',
    ':post-post-session: Post-Sprint',
    ':post-technical: Technical /r/all',
}

# Same three criteria as the narrow selection, only the flair set differs.
has_related_words = f1_submissions_df['title'].apply(
    lambda submission_title: bool(steward_decision_pattern.search(submission_title))
)
has_relevant_flairs = f1_submissions_df['link_flair_text'].isin(relevant_flairs_extended)
is_image_post = f1_submissions_df['post_hint'].eq('image')

extended_selection = has_related_words & has_relevant_flairs & is_image_post
_extended_steward_decision_submissions_df = f1_submissions_df.loc[extended_selection].copy()

# Left-join the extended selection against the narrow one; the indicator column
# records, per row, whether a matching row exists in the narrow selection.
diff = _extended_steward_decision_submissions_df.merge(
    steward_decision_submissions_df,
    how='left',
    indicator=True,
)

# Rows found only in the extended selection.
only_in_extended = diff['_merge'].eq('left_only')
result = diff.loc[only_in_extended].drop(columns='_merge')

with display_full_dataframe():
    print(f'{len(_extended_steward_decision_submissions_df)=}')
    print(f'{len(result)=}')
    display(result)
    display(_extended_steward_decision_submissions_df['link_flair_text'].unique())

# Pre-trained models

In [None]:
def to_sentiment_category(sentiment: float, threshold: float = 0.05) -> str:
    """Map a numeric sentiment score onto a 'Positive' / 'Negative' / 'Neutral' label.

    Args:
        sentiment: The score to classify.
        threshold: Non-negative cut-off. Scores at or above ``threshold`` are
            'Positive', scores at or below ``-threshold`` are 'Negative'. The
            default of 0.05 keeps the behaviour this function always had.

    Returns:
        'Positive', 'Negative' or 'Neutral'. A NaN score compares false against
        both bounds and therefore comes back as 'Neutral'.

    Raises:
        ValueError: If ``threshold`` is negative.
    """
    if threshold < 0:
        raise ValueError(f'threshold must be non-negative, got {threshold}')

    if sentiment >= threshold:
        return 'Positive'
    if sentiment <= -threshold:
        return 'Negative'
    return 'Neutral'

### VADER SENTIMENT ANALYSIS

In [55]:
# TODO: testing
# NOTE(review): this rebinds `steward_decision_submissions_df`, so every cell below
# works on this freshly loaded frame instead of the title/flair/post-type selection
# built earlier. The `limit=100` handed to `stream_ndjson` presumably caps the load
# at 100 records — confirm in src.data.loader. Remove this cell to analyse the real
# selection.
steward_decision_submissions_df = load_submissions_df(dataset_constants.RawFile.FORMULA1_SUBMISSIONS, partial(stream_ndjson, limit=100))

In [56]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single analyzer instance shared by the cells below, which read the 'compound'
# entry of `analyzer.polarity_scores(text)` for every comment body.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Score-weighted VADER sentiment per submission:
#     sum(compound_i * score_i) / sum(|score_i|)
# taken over the comments whose `link_id` is 't3_<submission id>'.
# Submissions without comments, or whose comment scores are all zero, get NaN.

# Double-quoted f-string: nesting the same quote type inside an f-string only
# parses on Python 3.12+.
link_ids = steward_decision_submissions_df['id'].map(lambda submission_id: f"t3_{submission_id}")

# Split the comments per submission once, instead of scanning the whole comments
# frame again for every submission.
related_comments_df = f1_comments_df[f1_comments_df['link_id'].isin(link_ids)]
comments_by_link_id = {
    link_id: thread_comments_df
    for link_id, thread_comments_df in related_comments_df.groupby('link_id')
}

average_sentiments = []
for link_id in link_ids:
    comments_df = comments_by_link_id.get(link_id)
    if comments_df is None or comments_df.empty:
        average_sentiments.append(np.nan)
        continue

    total_abs_score = comments_df['score'].abs().sum()
    if total_abs_score == 0:
        # No weight to normalise by.
        average_sentiments.append(np.nan)
        continue

    # One compound score (float) per comment body.
    compound = comments_df['body'].apply(
        lambda text: analyzer.polarity_scores(text)['compound']
    )
    average_sentiments.append((compound * comments_df['score']).sum() / total_abs_score)

# Values are in row order, so assign the whole column in one go.
steward_decision_submissions_df['average_sentiment_vader'] = average_sentiments

with display_full_dataframe():
    display(steward_decision_submissions_df.head(2))

In [None]:
# Same score-weighted VADER average as the per-submission loop, computed with a
# single groupby over the relevant comments.
# TODO: replace the per-group `apply` with grouped sums if efficiency is needed.

# Kept as a standalone Series so the submissions frame is not given a temporary
# column that has to be dropped again afterwards.
submission_link_ids = 't3_' + steward_decision_submissions_df['id'].astype(str)

filtered_comments_df = f1_comments_df[
    f1_comments_df['link_id'].isin(submission_link_ids)
].copy()

filtered_comments_df['compound'] = filtered_comments_df['body'].apply(
    lambda text: analyzer.polarity_scores(text)['compound']
)


def calculate_weighted_sentiment(group: pd.DataFrame) -> float:
    """Return the score-weighted mean of `compound` for one submission's comments.

    Args:
        group: Rows with at least a `compound` and a `score` column.

    Returns:
        sum(compound * score) / sum(|score|), or NaN when the absolute scores
        sum to zero.
    """
    total_abs_score = group['score'].abs().sum()
    if total_abs_score == 0:
        return np.nan
    return (group['compound'] * group['score']).sum() / total_abs_score


if filtered_comments_df.empty:
    # Nothing to aggregate: every submission maps to NaN below.
    average_sentiment = pd.Series(dtype=float)
else:
    # Select the value columns explicitly: applying over a frame that still
    # contains the grouping column is deprecated in recent pandas versions.
    average_sentiment = (
        filtered_comments_df
        .groupby('link_id')[['compound', 'score']]
        .apply(calculate_weighted_sentiment)
    )

# Submissions without any comment have no entry in `average_sentiment` -> NaN.
steward_decision_submissions_df['average_sentiment_vader'] = submission_link_ids.map(average_sentiment)


### BERT (RoBERTa) SENTIMENT ANALYSIS

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classifier are loaded from the same Hugging Face checkpoint.
sentiment_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(sentiment_checkpoint)
# Trailing semicolon keeps the model repr out of the cell output.
model.to(DEVICE);

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

def BERT_sentiment(text: str) -> int:
    """Classify `text` with the notebook-level `tokenizer` / `model` pair.

    Args:
        text: The text to classify. Input longer than 512 tokens is truncated.

    Returns:
        The index of the highest logit minus one, i.e. -1, 0 or 1 for a
        three-class model. Per the label order noted below this reads as
        negative / neutral / positive.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # The model is moved to DEVICE after loading; the encoded inputs have to live on
    # the same device or the forward pass fails as soon as DEVICE is not the CPU.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=1).item()  # 0: negative, 1: neutral, 2: positive
    return predicted_class - 1

# Score-weighted model sentiment per submission:
#     sum(label_i * score_i) / sum(|score_i|)
# where label_i is the BERT_sentiment output for comment i and the comments are
# those whose `link_id` is 't3_<submission id>'.
# Submissions without comments, or whose comment scores are all zero, get NaN.

# Double-quoted f-string: nesting the same quote type inside an f-string only
# parses on Python 3.12+.
link_ids = steward_decision_submissions_df['id'].map(lambda submission_id: f"t3_{submission_id}")

# Split the comments per submission once, instead of scanning the whole comments
# frame again for every submission.
related_comments_df = f1_comments_df[f1_comments_df['link_id'].isin(link_ids)]
comments_by_link_id = {
    link_id: thread_comments_df
    for link_id, thread_comments_df in related_comments_df.groupby('link_id')
}

average_sentiments = []
for link_id in link_ids:
    comments_df = comments_by_link_id.get(link_id)
    if comments_df is None or comments_df.empty:
        average_sentiments.append(np.nan)
        continue

    total_abs_score = comments_df['score'].abs().sum()
    if total_abs_score == 0:
        # No weight to normalise by.
        average_sentiments.append(np.nan)
        continue

    # One model label per comment body.
    labels = comments_df['body'].apply(BERT_sentiment)
    average_sentiments.append((labels * comments_df['score']).sum() / total_abs_score)

# Values are in row order, so assign the whole column in one go.
steward_decision_submissions_df['average_sentiment_bert'] = average_sentiments

In [None]:
# Mean absolute difference between the two sentiment estimates.
# `.mean()` averages over the submissions that have both values. The previous
# sum / len(index) skipped NaN rows in the sum but still counted them in the
# divisor, which pulled the figure towards zero.
sentiment_gap = (
    steward_decision_submissions_df['average_sentiment_bert']
    - steward_decision_submissions_df['average_sentiment_vader']
).abs()
print(sentiment_gap.mean())

with display_full_dataframe():
    display(steward_decision_submissions_df)