# Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np
from transformers import pipeline
import os
import torch

  from .autonotebook import tqdm as notebook_tqdm


Loading FinBERT model

In [2]:
# Build a `transformers` sentiment-analysis pipeline backed by the
# "ProsusAI/finbert" checkpoint. The resulting callable is shared by every
# call to get_sentiment_score, so the model is only loaded once.
sentiment_analyzer = pipeline("sentiment-analysis", model="ProsusAI/finbert")

Device set to use cpu


In [3]:
# Symbols to process. Each one is interpolated into the input filename
# (reddit_data_<symbol>_24_25.csv) and the output filename by the main loop.
cryptos = ['BTC', 'ETH', 'XRP']

In [4]:
def get_sentiment_score(text):
    """Map a piece of text to a signed sentiment score.

    Parameters
    ----------
    text : Any
        The text to classify. Anything that is not a ``str``, or a string
        that is empty once whitespace is stripped, is scored as 0.0 without
        calling the model.

    Returns
    -------
    float
        ``+score`` when the top prediction's label is ``'positive'``,
        ``-score`` when it is ``'negative'``, and ``0.0`` for any other
        label (e.g. a neutral class), where ``score`` is the value the
        pipeline reports for its top prediction.
    """
    has_content = isinstance(text, str) and text.strip() != ""
    if not has_content:
        return 0.0

    # The input is truncated to at most 512 tokens; only the first
    # prediction returned by the pipeline is used.
    prediction = sentiment_analyzer(text, truncation=True, max_length=512)[0]
    sentiment = prediction['label']
    confidence = prediction['score']

    if sentiment == 'positive':
        return confidence
    if sentiment == 'negative':
        return -confidence
    return 0.0

In [5]:
def weighted_average(group):
    """Compute the weighted mean sentiment for one group of rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Must provide the columns ``'weighted_sentiment_value'`` and
        ``'weight'``.

    Returns
    -------
    float
        ``sum(weighted_sentiment_value) / sum(weight)``, or ``0.0`` when the
        summed weight is not strictly positive (which also avoids dividing
        by zero for an empty or zero-weight group).
    """
    numerator = group['weighted_sentiment_value'].sum()
    denominator = group['weight'].sum()

    if denominator > 0:
        return numerator / denominator
    return 0.0

In [None]:
# Input/output directories, defined once so the loop body only has to build
# filenames.
# NOTE(review): these are machine-specific absolute paths; point them at your
# own data folders before running on another machine.
INPUT_DIR = r'C:\Users\madha\Desktop\Dissertation\Data\Reddit API'
OUTPUT_DIR = r'C:\Users\madha\Desktop\Dissertation\Data\Sentiment Scores'

# For every symbol: load its Reddit export, score each row with
# get_sentiment_score, aggregate to one weighted score per date, and write the
# daily table to CSV.
for crypto in cryptos:
    print(f"Starting processing for {crypto}")

    input_filename = f"reddit_data_{crypto}_24_25.csv"
    input_filepath = os.path.join(INPUT_DIR, input_filename)

    df = pd.read_csv(input_filepath)

    # Missing titles/bodies become empty strings so the concatenation below
    # yields text instead of NaN.
    df['title'] = df['title'].fillna('')
    df['body'] = df['body'].fillna('')

    # Rows whose type is 'post' are scored on title + body; every other row
    # is scored on its body alone.
    df['text_to_analyze'] = np.where(
        df['type'] == 'post',
        df['title'] + ' ' + df['body'],
        df['body']
    )

    print(f"Analyzing sentiment for {crypto}.")
    df['sentiment_score'] = df['text_to_analyze'].apply(get_sentiment_score)

    print(f"Aggregating daily sentiment scores for {crypto}.")

    # Each row is weighted by its 'score' column, floored at 1 so rows with a
    # zero or negative score still contribute with unit weight.
    df['weight'] = np.maximum(1, df['score'])
    df['weighted_sentiment_value'] = df['sentiment_score'] * df['weight']

    daily_groups = df.groupby('date')

    # Hand weighted_average only the two columns it reads. Applying it to the
    # whole group frame also passes the grouping column ('date'), which
    # pandas deprecates and warns about on every run.
    daily_sentiment = (
        daily_groups[['weighted_sentiment_value', 'weight']]
        .apply(weighted_average)
        .to_frame(name='daily_weighted_sentiment')
    )
    # Number of non-null texts per date; aligned to daily_sentiment by its
    # 'date' index, so this must happen before the index is reset.
    daily_sentiment['comment_volume'] = daily_groups['text_to_analyze'].count()
    daily_sentiment = daily_sentiment.reset_index()

    # Two-digit first and last years of the covered dates, used in the
    # output filename. The date column is parsed once and reused.
    parsed_dates = pd.to_datetime(daily_sentiment['date'])
    start_year = parsed_dates.min().strftime('%y')
    end_year = parsed_dates.max().strftime('%y')

    output_filename = f"sentiment_scores_{crypto}_{start_year}_{end_year}.csv"
    output_filepath = os.path.join(OUTPUT_DIR, output_filename)

    daily_sentiment.to_csv(output_filepath, index=False, encoding='utf-8-sig')

    print(f"\n Final daily scores for {crypto} saved.")

print("\n\n All files have been processed successfully!")


Starting processing for BTC
Analyzing sentiment for BTC.
Aggregating daily sentiment scores for BTC.

 Success! Final daily scores for BTC saved.
Starting processing for ETH
Analyzing sentiment for ETH.


  daily_sentiment = daily_groups.apply(weighted_average).to_frame(name='daily_weighted_sentiment')


Aggregating daily sentiment scores for ETH.

 Success! Final daily scores for ETH saved.
Starting processing for XRP
Analyzing sentiment for XRP.


  daily_sentiment = daily_groups.apply(weighted_average).to_frame(name='daily_weighted_sentiment')


Aggregating daily sentiment scores for XRP.

 Success! Final daily scores for XRP saved.


 All files have been processed successfully!


  daily_sentiment = daily_groups.apply(weighted_average).to_frame(name='daily_weighted_sentiment')
