# Sentiment Analysis Notebook

In [None]:
import polars as pl
import os
import sys
sys.path.append("../src/")

from adding_metadata.replies import add_reply_list
from adding_metadata.reply_sentiments import *

In [None]:
# Directory layout used by the rest of the notebook.

# Root folder; reddit.parquet is expected to live here.
base_dir = "../../"

# Folder that receives the data splits.
data_dir = os.path.join(base_dir, "data/")

# Two sub-folders of data_dir: the first holds the sentiment analysis
# results, the second holds the splits once they have been updated with
# those results.
results_dir, processed_dir = (
    os.path.join(data_dir, leaf) for leaf in ("results/", "processed/")
)

In [None]:
# Split the full dataset into at most `num_partitions` parquet files of equal
# row count (the last one may be shorter). The original note for this cell
# says this gives roughly 50k rows per part.
df = pl.read_parquet(os.path.join(base_dir, "reddit.parquet"))
num_partitions = 100

# Ceiling division so every row lands in some split. Clamp to 1 so that an
# empty frame does not hand range() a zero step (which raises ValueError).
chunk_size = max(1, (len(df) + num_partitions - 1) // num_partitions)

# The splits are written into data_dir, so make sure it exists first.
os.makedirs(data_dir, exist_ok=True)

small_dfs = [
    df[i:min(i + chunk_size, len(df))]
    for i in range(0, len(df), chunk_size)
]

# Files are numbered from 1: split_1.parquet, split_2.parquet, ...
for idx, small_df in enumerate(small_dfs, start=1):
    output_path = os.path.join(data_dir, f'split_{idx}.parquet')
    small_df.write_parquet(output_path)

In [None]:
# Run sentiment analysis on each split and save the results.
#
# For every file returned by get_all_files(data_dir):
#   1. run the model over the file in batches of 50 and collect the logits,
#   2. pickle the collected logits to <results_dir>/<file stem>.npy,
#   3. move the input file into processed_dir,
#   4. release memory before the next file.

# Results go to results_dir (not a cwd-relative 'results/' folder) because
# the later update step reads them back from results_dir.
os.makedirs(results_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)

all_files = get_all_files(data_dir)
for file in all_files:
    dataset = TextLoader(file=file, tokenizer=tokenizer)
    # shuffle=False keeps the outputs in the same order as the input rows.
    train_dataloader = DataLoader(dataset, batch_size=50, shuffle=False)
    out = []

    # Inference only: no_grad avoids building autograd state for each batch.
    with torch.no_grad():
        for batch in train_dataloader:
            res = model(batch.to(device_staging))
            out.append(res['logits'].detach().cpu())
            # Drop the per-batch references right away so they never
            # outlive the loop (and so cleanup below cannot hit an
            # undefined name when a split produces no batches).
            del batch, res

    output_file = os.path.join(results_dir, file.stem + '.npy')
    # NOTE: despite the .npy extension, the payload is a pickled list of
    # per-batch logits, not a NumPy array file.
    with open(output_file, 'wb') as f:
        pickle.dump(out, f)

    # Move the split only after its results have been written.
    shutil.move(file, os.path.join(processed_dir, file.name))

    del dataset, train_dataloader, out
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Merge the sentiment results back into each processed split.
# Splits are numbered 1..num_partitions; every split has a parquet file in
# processed_dir and a matching results file in results_dir.
for part_no in range(1, num_partitions + 1):
    split_name = f"split_{part_no}"
    parquet_path = f"{processed_dir}/{split_name}.parquet"
    npy_file_path = f"{results_dir}/{split_name}.npy"
    update_dataframe_with_sentiments(base_dir, parquet_path, npy_file_path)

In [None]:
# Combine the updated split parquet files from processed_dir into one file
# (presumably the second argument is the output path -- confirm against
# combine_parquet_files).
combined_path = f"{base_dir}/reddit_updated_with_sentiments.parquet"
combine_parquet_files(processed_dir, combined_path)

In [None]:
# Load the updated DataFrame and add reply list and summed sentiments.
df_new = pl.read_parquet(f"{base_dir}/reddit_updated_with_sentiments.parquet")
# add_reply_list is imported by name from adding_metadata.replies; this
# notebook never imports a `replies` module object, so call it directly.
df_new = add_reply_list(df_new)
df_new = add_summed_sentiments(df_new)

In [None]:
# Persist the final DataFrame (with summed sentiments) using zstd compression.
# The target is the same path df_new was loaded from, so that file is
# overwritten.
final_path = f"{base_dir}/reddit_updated_with_sentiments.parquet"
df_new.write_parquet(final_path, compression='zstd')