# VADER Experiments

## Import Libraries

In [2]:
import pandas as pd
import numpy as np

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import KFold

from sklearn.metrics import f1_score, accuracy_score

from scipy import stats

In [None]:
# Fetch the lexicon resource that SentimentIntensityAnalyzer is built from.
nltk.download("vader_lexicon")

In [None]:
# Select which PerceptSent CSV to evaluate.
#
# Every dataset file follows the same naming scheme, so the file is chosen
# by setting the three parameters below instead of (un)commenting one of
# many hard-coded paths:
#
#   <DATA_DIR>/[gpt4-openai-classify/]percept_dataset_alpha<ALPHA>_<PROBLEM>.csv
#
# ALPHA:   3, 4 or 5
# PROBLEM: "p5", "p3", "p2plus" or "p2neg"
# USE_GPT4_SUBDIR: read the file from the "gpt4-openai-classify"
#   sub-directory instead of the top-level data directory.
#   NOTE(review): presumably that directory holds the GPT-4 generated
#   variant of each dataset -- confirm against the repository layout.
DATA_DIR = "/home/neemias/PerceptSent-LLM-approach/data"
ALPHA = 5
PROBLEM = "p3"
USE_GPT4_SUBDIR = True

dataset_subdir = "gpt4-openai-classify/" if USE_GPT4_SUBDIR else ""
dataset_path = (
    f"{DATA_DIR}/{dataset_subdir}percept_dataset_alpha{ALPHA}_{PROBLEM}.csv"
)

df = pd.read_csv(dataset_path)

df.head()

## Sentiment Analyzer

In [None]:
# Evaluate VADER on each validation fold of a shuffled 5-fold split and
# collect accuracy and weighted F1 per fold in `df_metrics`.
#
# VADER is not fitted on the data, so the training part of each split is
# not used; the folds only provide several validation subsets whose scores
# can be averaged afterwards.

# Keys of `polarity_scores` that take part in the prediction (the
# "compound" entry is deliberately left out). The order fixes how ties are
# broken: the first key with the highest score wins.
vader_keys = ("neg", "neu", "pos")

# VADER key -> dataset label for the three-class datasets.
three_class_labels = {"neg": 1, "neu": 0, "pos": 2}

# VADER key -> dataset label for the two-class datasets. The variant has to
# be chosen by hand so that it matches the CSV that was loaded.
# NOTE(review): presumably "P2+" / "P2-" correspond to the *_p2plus.csv and
# *_p2neg.csv files -- confirm which label encoding each file uses.
binary_labels_by_variant = {
    "P2+": {"neg": 1, "neu": 0, "pos": 0},
    "P2-": {"neg": 0, "neu": 0, "pos": 1},
}
binary_variant = "P2-"

# Decide the mapping once from the whole dataset: a single validation fold
# may happen to miss a class, which would silently select the wrong mapping.
n_classes = df["sentiment"].nunique()
if n_classes == 3:
    sent_dic = three_class_labels
elif n_classes == 2:
    sent_dic = binary_labels_by_variant[binary_variant]
else:
    raise ValueError(
        f"No VADER label mapping for a dataset with {n_classes} sentiment "
        "classes; only 2-class and 3-class datasets are supported."
    )

# One analyzer is enough; it is reused for every fold.
model = SentimentIntensityAnalyzer()

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
fold_metrics = []
for _, val_idx in kfold.split(df):
    val_fold = df.iloc[val_idx]
    text = val_fold["text"].to_list()
    target = val_fold["sentiment"].to_list()

    pred = []
    for t in text:
        scores = model.polarity_scores(t)
        # Predict the category with the largest neg/neu/pos score.
        # NOTE(review): VADER's documentation recommends thresholding the
        # "compound" score for a single overall label; the neutral ratio may
        # dominate this argmax -- consider comparing both strategies.
        max_key = max(vader_keys, key=scores.get)
        pred.append(sent_dic[max_key])

    fold_metrics.append(
        {
            "accuracy": accuracy_score(target, pred),
            "f1_score": f1_score(target, pred, average="weighted"),
        }
    )

# Build the frame once instead of concatenating inside the loop; this also
# gives each fold its own row index.
df_metrics = pd.DataFrame(fold_metrics, columns=["accuracy", "f1_score"])
df_metrics.head()

In [None]:
# Summarise the per-fold F1 scores: best fold, mean, and a Student-t
# confidence interval built from the standard error of the fold scores.
f1_scores = df_metrics["f1_score"].to_list()
mean_f1 = np.mean(f1_scores)
max_f1 = max(f1_scores)

# define the confidence level
confidence_level = 0.95
degrees_freedom = len(f1_scores) - 1
# Standard error of the mean, shared by both intervals below.
standard_error = stats.sem(f1_scores)

confidence_interval = stats.t.interval(
    confidence_level,
    degrees_freedom,
    loc=mean_f1,
    scale=standard_error,
)

print(f"Max F1-score: {max_f1}")
print(f"Average F1-score: {mean_f1}")
print(f"Confidence interval {confidence_level:.0%}: {confidence_interval}")
# Distance from the mean to the lower and upper bound of the interval.
print(f"Interval: {abs(confidence_interval[0]-mean_f1)} - Interval: {abs(confidence_interval[1]-mean_f1)}")

# Same interval re-centred on the best fold: only `loc` changes, the scale
# and degrees of freedom are the ones used above.
confidence_interval = stats.t.interval(
    confidence_level,
    degrees_freedom,
    loc=max_f1,
    scale=standard_error,
)

print(f"Interval: {abs(confidence_interval[0]-max_f1)} - Interval: {abs(confidence_interval[1]-max_f1)}")