In [None]:
from collections.abc import Generator, Callable
from pathlib import Path
import typing
from typing import Any, TypeAlias
import pandas as pd
import numpy as np
import datetime as dt
import re
from functools import partial, reduce
from tqdm import tqdm
from IPython.display import (
    display, # type: ignore[reportUnknownVariableType]
    Markdown,
)

import importlib

from config.fastf1 import fastf1
from config import config
import src.data.constants as dataset_constants
importlib.reload(dataset_constants);
import src.data.loader
importlib.reload(src.data.loader);
from src.data.loader import stream_ndjson, load_submissions_df, load_comments_df
from src.data.preprocessing import concatenate_submissions_and_comments

from src.utils import (
    temporary_pandas_options,
    display_full_dataframe,
    hide_index,
    compose,
)
# Project helpers from src/utils. Judging by their names they seed the random
# number generators and select the compute device — TODO confirm in src/utils.
from src import utils
utils.set_random_seeds()
DEVICE = utils.get_device()

# Raise the 'fastf1' logger threshold so only WARNING and above is emitted.
import logging
logging.getLogger('fastf1').setLevel(logging.WARNING)

In [9]:
f1_ndjson_streamer = partial(stream_ndjson, limit=10)
f1_submissions_df = load_submissions_df(dataset_constants.RawFile.FORMULA1_SUBMISSIONS, f1_ndjson_streamer)
f1_comments_df = load_comments_df(dataset_constants.RawFile.FORMULA1_COMMENTS, f1_ndjson_streamer)

f1_submissions_df = load_submissions_df(dataset_constants.RawFile.FORMULA1POINT5_SUBMISSIONS)
f1_comments_df = load_comments_df(dataset_constants.RawFile.FORMULA1POINT5_COMMENTS)

In [None]:
n = 4

with display_full_dataframe():
    display(Markdown('### r/formula1 submissions:'), f1_submissions_df.head(n))
    display(Markdown('### r/formula1 comments:'), f1_comments_df.head(n))
    display(Markdown('### r/formula1point5 submissions:'), f1_submissions_df.head(n))
    display(Markdown('### r/formula1point5 comments:'), f1_comments_df.head(n))

In [None]:
f1_df = concatenate_submissions_and_comments(f1_submissions_df, f1_comments_df)
f15_df = concatenate_submissions_and_comments(f1_submissions_df, f1_comments_df)

n = 8

with display_full_dataframe():
    display(Markdown('### r/formula1 posts:'), f1_df.head(n))
    display(Markdown('### r/formula1point5 posts:'), f15_df.head(n))

In [12]:
# Keywords that hint at a post being about a stewards' decision, grouped by theme.
Steward_keywords = {
    # rulings and procedure
    'penalty',
    'steward',
    'decision',
    'appeal',
    'review',
    'ruling',
    'investigation',
    'regulation',
    # time penalties
    'seconds',
    'sec',
    # on-track events
    'collision',
    'crash',
    'incident',
    'overtake',
    'virtual safety car',
    'blocking',
    'brake test',
    'contact',
    # flags
    'red flag',
    'yellow flag',
    # controversy
    'controversial',
    'rigged',
    'corrupt',
    'bias',
    'protest',
    'FIA',
}

In [13]:
# Reload the r/formula1 frames, this time without a custom streamer and with
# extra columns requested on top of the project defaults:
# 'permalink' and 'post_hint' for submissions, 'link_id' for comments.
f1_submission_columns = dataset_constants.DEFAULT_SUBMISSION_COLUMNS | {'permalink', 'post_hint'}
f1_submissions_df = load_submissions_df(
    dataset_constants.RawFile.FORMULA1_SUBMISSIONS, columns=f1_submission_columns
)

f1_comment_columns = dataset_constants.DEFAULT_COMMENT_COLUMNS | {'link_id'}
f1_comments_df = load_comments_df(
    dataset_constants.RawFile.FORMULA1_COMMENTS, columns=f1_comment_columns
)

In [None]:
# Keep only the submissions flagged as news, discussion or technical posts.
# NOTE(review): the tags look like flair class names while the column queried is
# 'link_flair_richtext' — confirm that this column really holds these strings.
relevant_tags = ['post-news', 'post-discussion', 'post-technical']
has_relevant_flair = f1_submissions_df['link_flair_richtext'].isin(relevant_tags)
filtered_f1_submissions_df = f1_submissions_df.loc[has_relevant_flair]

In [None]:
# Pick one known stewards'-decision thread by permalink and collect its comments.
sample_submission_permalink = '/r/formula1/comments/yfcxll/fia_document_of_stewards_decision_for_alpines/'
sample_submission = f1_submissions_df[f1_submissions_df['permalink'] == sample_submission_permalink].iloc[0]
# Comments are matched to the submission through link_id == 't3_<submission id>'.
# Double quotes outside, single quotes inside: reusing the same quote character
# inside an f-string is a SyntaxError on Python versions before 3.12.
sample_submission_link_id = f"t3_{sample_submission['id']}"

with display_full_dataframe():
    display(sample_submission)

# .copy() makes this an independent frame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning.
comments_in_sample_submission = f1_comments_df[f1_comments_df['link_id'] == sample_submission_link_id].copy()

with display_full_dataframe():
    display(comments_in_sample_submission)

display(comments_in_sample_submission['body'])

# VADER SENTIMENT ANALYSIS

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# VADER's 'compound' value summarises how positive or negative a text is,
# normalised to the range [-1, 1].
analyzer = SentimentIntensityAnalyzer()

# .assign returns a new frame instead of writing into what may be a slice of
# f1_comments_df, which avoids SettingWithCopyWarning and keeps the cell
# re-runnable.
comments_in_sample_submission = comments_in_sample_submission.assign(
    compound=comments_in_sample_submission['body'].apply(
        lambda text: analyzer.polarity_scores(text)['compound']
    )
)

# Score-weighted average sentiment: each comment's compound value is weighted by
# its score and normalised by the total absolute score.
# NOEMER is Dutch for "denominator".
NOEMER = np.abs(comments_in_sample_submission['score']).sum()
weighted_compound_sum = (
    comments_in_sample_submission['compound'] * comments_in_sample_submission['score']
).sum()
# Without any votes the weighted average is undefined; report NaN instead of
# dividing by zero.
average_sentiment = weighted_compound_sum / NOEMER if NOEMER != 0 else np.nan

print(average_sentiment)

def Vader_sentiment(average_sentiment: float) -> str:
    """Map a compound sentiment value to a coarse label.

    Returns "Positive" for values of at least 0.05, "Negative" for values of at
    most -0.05 and "Neutral" for everything else (including NaN, for which both
    comparisons are false).
    """
    threshold = 0.05
    if average_sentiment >= threshold:
        return "Positive"
    return "Negative" if average_sentiment <= -threshold else "Neutral"

# Show the Positive / Negative / Neutral label for the sample thread's average.
print(Vader_sentiment(average_sentiment))

# BERT SENTIMENT ANALYSIS

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classifier are loaded from the same checkpoint name.
SENTIMENT_CHECKPOINT = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT)
# Trailing ';' suppresses the model repr in the cell output.
model.to(DEVICE);


In [None]:
import json
from itertools import islice
import pandas as pd
from pathlib import Path

# Raw r/formula1 comments dump, relative to the current working directory.
data = Path("data/raw/formula1_comments.ndjson")

# Maximum number of records to parse for this quick inspection.
n = 1000

with data.open('r', encoding='utf-8') as file:
    # NDJSON stores one JSON document per line. Blank lines are skipped so they
    # neither crash json.loads nor count towards the limit; islice stops reading
    # as soon as n records have been parsed.
    records = (json.loads(line) for line in file if line.strip())
    limited_data = list(islice(records, n))

df = pd.DataFrame(limited_data)

# Print the columns of the DataFrame to understand the structure
print("Columns in DataFrame:", df.columns)

# Dtypes for the comment columns inspected here. They live in a local mapping on
# purpose: assigning them to dataset_constants.SUBMISSION_COLUMN_DTYPES would
# replace the shared module constant for every other piece of code that reads it.
comment_column_dtypes = {
    'author': 'str',
    'created_utc': 'float64',
    'body': 'str',
}
df = df.astype(comment_column_dtypes)

with display_full_dataframe():
    display(df)
    # These two probes were written for another dump; only run them when the
    # column exists (and there is a first row to look at) instead of raising.
    if 'preview' in df.columns and not df.empty:
        display(type(df['preview'].iloc[0]))
    if 'author_flair_background_color' in df.columns:
        display(type(df.dtypes['author_flair_background_color']))
    display(df.dtypes)


Older version of the code above, kept below for reference (do not delete).

In [25]:
# import json
# # data = config.DATA_DIR / 'raw'  'formula1_submissions'
# data = Path("data/raw/formula1_comments.ndjson")
# with data.open('r') as file:
#     df = pd.DataFrame((json.load(file),)).astype(dataset_constants.SUBMISSION_COLUMN_DTYPES)

# with display_full_dataframe():
#     display(df)
#     display(type(df['preview'].iloc[0]))
#     display(type(df.dtypes['author_flair_background_color']))
#     display(df.dtypes)





In [None]:
steward_decision_related_words = {
    'penalty', 'steward', 'decision', 'appeal', 'review', 'ruling', 'investigation', 'regulation',
    'seconds', 'sec',
    'collision', 'crash', 'incident', 'overtake', 'virtual safety car', 'blocking', 'brake test', 'contact',
    'red flag', 'yellow flag',
    'controversial', 'rigged', 'corrupt', 'bias', 'protest', 'FIA', 'document'}

# Case-insensitive alternation that matches any keyword as a whole word/phrase.
# - '|'.join builds the alternation directly (no trailing '|' to slice off);
# - re.escape keeps a keyword containing regex metacharacters literal;
# - sorted() makes the pattern text reproducible, since set order is arbitrary.
words_regex = '|'.join(fr'\b{re.escape(word)}\b' for word in sorted(steward_decision_related_words))
steward_decision_pattern = re.compile(words_regex, flags=re.IGNORECASE)

# Titles that are not strings (e.g. missing values) count as "no match" instead
# of making pattern.search raise a TypeError.
title_mentions_steward_topic = f1_submissions_df['title'].apply(
    lambda title: isinstance(title, str) and steward_decision_pattern.search(title) is not None
)
# .copy() so that columns can be added to the selection later on.
steward_decision_submissions_df = f1_submissions_df[title_mentions_steward_topic].copy()

with display_full_dataframe():
    print(len(steward_decision_submissions_df))
    display(steward_decision_submissions_df.head(2))

In [None]:



# Score-weighted VADER sentiment per stewards'-decision submission:
#   sum(compound * score) / sum(|score|)   over the submission's comments.

# A comment references its submission through link_id == 't3_<submission id>'.
steward_link_ids = 't3_' + steward_decision_submissions_df['id'].astype(str)

# Select and group the relevant comments once, instead of scanning the whole of
# f1_comments_df again for every single submission.
steward_comments_df = f1_comments_df[f1_comments_df['link_id'].isin(steward_link_ids)]
steward_comments_by_link_id = {
    link_id: thread_comments_df
    for link_id, thread_comments_df in steward_comments_df.groupby('link_id', sort=False)
}

vader_averages = []
for submission_link_id in steward_link_ids:
    comments_df = steward_comments_by_link_id.get(submission_link_id)
    total_abs_score = 0 if comments_df is None else np.abs(comments_df['score']).sum()

    if total_abs_score == 0:
        # No comments, or no votes to weight them by: the average is undefined.
        vader_averages.append(np.nan)
        continue

    # One compound value (float) per comment body.
    compound = comments_df['body'].apply(
        lambda text: analyzer.polarity_scores(text)['compound']
    )
    vader_averages.append((compound * comments_df['score']).sum() / total_abs_score)

# vader_averages is in row order, so a positional column assignment lines up.
steward_decision_submissions_df['average_sentiment_vader'] = vader_averages

with display_full_dataframe():
    display(steward_decision_submissions_df)
    display(steward_decision_submissions_df['average_sentiment_vader'])


In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

def BERT_sentiment(text) -> int:
    """Classify `text` with the module-level `tokenizer` / `model` pair.

    The text is truncated to at most 512 tokens. The class with the highest
    logit is taken (0: negative, 1: neutral, 2: positive) and shifted by one.

    Returns:
        -1 for negative, 0 for neutral, 1 for positive.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # The model may have been moved to an accelerator; the encoded inputs must
    # live on the same device or the forward pass raises a device-mismatch error.
    inputs = inputs.to(model.device)
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return predicted_class - 1

# Score-weighted transformer sentiment per stewards'-decision submission:
#   sum(label * score) / sum(|score|)   with label in {-1, 0, 1} from BERT_sentiment.

# A comment references its submission through link_id == 't3_<submission id>'.
bert_link_ids = 't3_' + steward_decision_submissions_df['id'].astype(str)

# Select and group the relevant comments once, instead of scanning the whole of
# f1_comments_df again for every single submission.
bert_comments_df = f1_comments_df[f1_comments_df['link_id'].isin(bert_link_ids)]
bert_comments_by_link_id = {
    link_id: thread_comments_df
    for link_id, thread_comments_df in bert_comments_df.groupby('link_id', sort=False)
}

bert_averages = []
for submission_link_id in bert_link_ids:
    comments_df = bert_comments_by_link_id.get(submission_link_id)
    total_abs_score = 0 if comments_df is None else np.abs(comments_df['score']).sum()

    if total_abs_score == 0:
        # No comments, or no votes to weight them by: the average is undefined.
        # Checking this first also avoids running the model on such threads.
        bert_averages.append(np.nan)
        continue

    labels = comments_df['body'].apply(BERT_sentiment)
    bert_averages.append((labels * comments_df['score']).sum() / total_abs_score)

# bert_averages is in row order, so a positional column assignment lines up.
steward_decision_submissions_df['average_sentiment_bert'] = bert_averages

In [None]:
# Mean absolute difference between the two sentiment estimates.
# Submissions without a usable average hold NaN in at least one column; .mean()
# leaves those rows out of both the sum and the count. Dividing the NaN-skipping
# sum by the total number of rows would bias the result towards zero.
sentiment_gap = (
    steward_decision_submissions_df['average_sentiment_bert']
    - steward_decision_submissions_df['average_sentiment_vader']
).abs()
print(sentiment_gap.mean())

with display_full_dataframe():
    display(steward_decision_submissions_df)