In [None]:
from collections.abc import Generator, Callable
from pathlib import Path
import typing
from typing import Any, TypeAlias
import pandas as pd
import numpy as np
import datetime as dt
import re
from functools import partial, reduce
from tqdm import tqdm
from IPython.display import (
    display, # type: ignore[reportUnknownVariableType]
    Markdown,
)

import importlib

from config.fastf1 import fastf1
from config import config
from src.data.loader import stream_ndjson, load_submissions_df, load_comments_df
from src.data.preprocessing import concatenate_submissions_and_comments
import src.data.constants as dataset_constants
import src.data.loader
importlib.reload(src.data.loader);
importlib.reload(dataset_constants);

from src.utils import (
    temporary_pandas_options,
    display_full_dataframe,
    hide_index,
    compose,
)
from src import utils
# Project helper; presumably seeds the random number generators so reruns are
# repeatable -- NOTE(review): confirm which libraries it covers.
utils.set_random_seeds()
# Device handle from the project helper; used further down in model.to(DEVICE).
DEVICE = utils.get_device()

import logging
# Raise the 'fastf1' logger threshold so only WARNING and above are emitted.
logging.getLogger('fastf1').setLevel(logging.WARNING)

In [23]:
f1_ndjson_streamer = partial(stream_ndjson, limit=10)
f1_submissions_df = load_submissions_df(dataset_constants.RawFile.FORMULA1_SUBMISSIONS, f1_ndjson_streamer)
f1_comments_df = load_comments_df(dataset_constants.RawFile.FORMULA1_COMMENTS, f1_ndjson_streamer)

f1_submissions_df = load_submissions_df(dataset_constants.RawFile.FORMULA1POINT5_SUBMISSIONS)
f1_comments_df = load_comments_df(dataset_constants.RawFile.FORMULA1POINT5_COMMENTS)

In [None]:
n = 4

with display_full_dataframe():
    display(Markdown('### r/formula1 submissions:'), f1_submissions_df.head(n))
    display(Markdown('### r/formula1 comments:'), f1_comments_df.head(n))
    display(Markdown('### r/formula1point5 submissions:'), f1_submissions_df.head(n))
    display(Markdown('### r/formula1point5 comments:'), f1_comments_df.head(n))

In [None]:
f1_df = concatenate_submissions_and_comments(f1_submissions_df, f1_comments_df)
f15_df = concatenate_submissions_and_comments(f1_submissions_df, f1_comments_df)

n = 8

with display_full_dataframe():
    display(Markdown('### r/formula1 posts:'), f1_df.head(n))
    display(Markdown('### r/formula1point5 posts:'), f15_df.head(n))

In [26]:
# Keywords used to spot steward-related discussion, grouped by theme.
Steward_keywords = {
    # Rulings and procedure
    'penalty',
    'steward',
    'decision',
    'appeal',
    'review',
    'ruling',
    'investigation',
    'regulation',
    # Time penalties
    'seconds',
    'sec',
    # On-track incidents
    'collision',
    'crash',
    'incident',
    'overtake',
    'virtual safety car',
    'blocking',
    'brake test',
    'contact',
    # Flags
    'red flag',
    'yellow flag',
    # Controversy and governance
    'controversial',
    'rigged',
    'corrupt',
    'bias',
    'protest',
    'FIA',
}

In [27]:
# Request the standard column sets plus the fields used below to tie comments to
# their thread: 'permalink' on submissions and 'link_id' on comments.
submission_columns = dataset_constants.SUBMISSION_COLUMNS | {'permalink'}
comment_columns = dataset_constants.COMMENT_COLUMNS | {'link_id'}

# No custom streamer is passed here, so the loaders use their default reader.
f1_submissions_df = load_submissions_df(dataset_constants.RawFile.FORMULA1_SUBMISSIONS, columns=submission_columns)
f1_comments_df = load_comments_df(dataset_constants.RawFile.FORMULA1_COMMENTS, columns=comment_columns)

In [None]:
# Pick one known thread (looked up by permalink) as a worked example.
sample_submission_permalink = '/r/formula1/comments/yfcxll/fia_document_of_stewards_decision_for_alpines/'
sample_submission_matches = f1_submissions_df[f1_submissions_df['permalink'] == sample_submission_permalink]
if sample_submission_matches.empty:
    # Same exception type .iloc[0] would raise, but with an actionable message.
    raise IndexError(f'No submission with permalink {sample_submission_permalink!r} in f1_submissions_df')
sample_submission = sample_submission_matches.iloc[0]

# Comments point at their thread through link_id, which is 't3_' + submission id.
# Double quotes outside so the f-string also parses on Python versions before 3.12.
sample_submission_link_id = f"t3_{sample_submission['id']}"

with display_full_dataframe():
    display(sample_submission)

# .copy() makes this an independent frame, so a later cell can add columns to it
# without pandas raising SettingWithCopyWarning.
comments_in_sample_submission = f1_comments_df[f1_comments_df['link_id'] == sample_submission_link_id].copy()

with display_full_dataframe():
    display(comments_in_sample_submission)

display(comments_in_sample_submission['body'])

In [30]:
# EXTRACT RELEVANT POSTS AND COMMENTS 


# VADER SENTIMENT ANALYSIS

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# VADER's 'compound' value summarises how positive or negative a text is,
# normalised to the range [-1, 1]. One value is computed per comment body.
# .assign returns a new frame instead of writing a column into a filtered slice,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
comments_in_sample_submission = comments_in_sample_submission.assign(
    compound=comments_in_sample_submission['body'].apply(
        lambda text: analyzer.polarity_scores(text)['compound']
    )
)

# NOEMER is Dutch for "denominator": the summed absolute comment scores, used to
# normalise the score-weighted sum of compound values.
NOEMER = np.abs(comments_in_sample_submission['score']).sum()
if NOEMER == 0:
    # No comments, or every score is 0: the weighted average is undefined.
    average_sentiment = np.nan
else:
    average_sentiment = (
        comments_in_sample_submission['compound'] * comments_in_sample_submission['score']
    ).sum() / NOEMER

print(average_sentiment)

def Vader_sentiment(average_sentiment):
    """Turn a compound sentiment value into a coarse text label.

    Returns "Positive" for values of 0.05 or more, "Negative" for values of
    -0.05 or less, and "Neutral" otherwise. NaN fails both comparisons and is
    therefore also labelled "Neutral".
    """
    if average_sentiment >= 0.05:
        return "Positive"
    if average_sentiment <= -0.05:
        return "Negative"
    return "Neutral"

# Print the text label for the sample thread's score-weighted average sentiment.
print(Vader_sentiment(average_sentiment))

In [32]:
# integrating sentiment_VADER 




# BERT SENTIMENT ANALYSIS

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The checkpoint id is named once so tokenizer and model always load the same one.
sentiment_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(sentiment_checkpoint)
# Trailing semicolon keeps the module repr out of the cell output.
model.to(DEVICE);


In [None]:
# Submissions whose title contains the standalone word "document" (any case).
document_pattern = re.compile(r'\bdocument\b', flags=re.IGNORECASE)
is_document_title = f1_submissions_df['title'].apply(
    lambda title: document_pattern.search(title) is not None
)
steward_document_submissions_df = f1_submissions_df[is_document_title].copy()

# Comments point at their thread through link_id, which is 't3_' + submission id.
submission_link_ids = 't3_' + steward_document_submissions_df['id'].astype(str)

# Select the comments of all matching threads in a single pass, rather than
# rescanning the whole comments frame once per submission.
steward_document_comments_df = f1_comments_df[f1_comments_df['link_id'].isin(submission_link_ids)].copy()

# VADER compound value per comment; astype(float) keeps a numeric dtype even
# when no comment matched at all.
steward_document_comments_df['compound'] = steward_document_comments_df['body'].apply(
    lambda text: analyzer.polarity_scores(text)['compound']
).astype(float)

# Per thread: the score-weighted sum of compound values and the summed absolute
# scores that normalise it.
thread_totals_df = (
    steward_document_comments_df
    .assign(
        weighted_compound=lambda df: df['compound'] * df['score'],
        abs_score=lambda df: df['score'].abs(),
    )
    .groupby('link_id')[['weighted_compound', 'abs_score']]
    .sum()
)

# A zero denominator becomes NaN, so the average is NaN instead of a division by zero.
nonzero_abs_score = thread_totals_df['abs_score'].where(thread_totals_df['abs_score'] != 0)
average_sentiment_by_link_id = thread_totals_df['weighted_compound'] / nonzero_abs_score

# Threads without any comments are missing from the lookup and map to NaN.
# The column is always created, even when no submission matched the title filter.
steward_document_submissions_df['average_sentiment'] = submission_link_ids.map(average_sentiment_by_link_id)

with display_full_dataframe():
    display(steward_document_submissions_df)
    display(steward_document_submissions_df['average_sentiment'])