# Vader Experiments

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os

from tqdm.notebook import tqdm
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import KFold

from sklearn.metrics import f1_score, accuracy_score

from scipy import stats

In [None]:
# Fetch the 'vader_lexicon' resource through NLTK's downloader.
# NOTE(review): presumably this is the lexicon that SentimentIntensityAnalyzer
# (imported above) loads when it is constructed — confirm against the NLTK docs.
nltk.download('vader_lexicon')

In [None]:
# Directories that are scanned (non-recursively) for input CSV files.
data_path_open_source = "/home/neemias/PerceptSent-LLM-approach/data/"
data_path_open_ai = "/home/neemias/PerceptSent-LLM-approach/data/gpt4-openai-classify/"

# Collect the CSV files from data_path_open_source first, then those from
# data_path_open_ai, preserving the order in which os.listdir reports them.
collected = []
for folder in (data_path_open_source, data_path_open_ai):
    for name in os.listdir(folder):
        if name.endswith('.csv'):
            collected.append(os.path.join(folder, name))
data_paths = collected

# Drop the entry at position 6.
# NOTE(review): os.listdir returns names in arbitrary order, so which file this
# removes depends on the filesystem — confirm the intended file.
del data_paths[6]

# Notebook display of the first ten collected paths.
data_paths[:10]

## Sentiment Analyzer

In [None]:
# Evaluate VADER on every dataset in `data_paths`.
#
# For each CSV (expected columns: "text" and "sentiment") the rows are split
# into 5 shuffled folds; VADER is scored on the validation part of each fold
# (it is not trained, so the training part is not used). The per-fold accuracy
# and weighted F1 are written to one CSV per dataset in `output_dir`.

# Mapping from VADER's score keys to the dataset's integer labels, selected by
# the number of distinct labels found in the dataset.
label_maps = {
    3: {"neg": 1, "neu": 0, "pos": 2},
    # Binary problem, "P2 -" variant. For the "P2+" variant use
    # {"neg": 1, "neu": 0, "pos": 0} instead.
    2: {"neg": 0, "neu": 0, "pos": 1},
}
output_dir = "/home/neemias/PerceptSent-LLM-approach/experiments/vader-experiment"

# The analyzer takes no dataset- or fold-specific arguments, so it is built
# once instead of once per fold.
model = SentimentIntensityAnalyzer()

for data_path in tqdm(data_paths):
    df = pd.read_csv(data_path)

    # Decide the label mapping from the whole dataset rather than from a single
    # validation fold: a fold that happens to miss one class must not switch a
    # 3-class problem to the binary mapping.
    sentiments = np.unique(df["sentiment"])
    sent_dic = label_maps.get(len(sentiments))
    if sent_dic is None:
        # No mapping is defined for this number of classes. Skip the dataset
        # explicitly instead of failing with a NameError or silently reusing
        # the mapping left over from the previous dataset.
        continue

    fold_metrics = []
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for _, val_idx in kfold.split(df):
        val_rows = df.iloc[val_idx]
        text = val_rows["text"].to_list()
        target = val_rows["sentiment"].to_list()

        pred = []
        for t in text:
            result = model.polarity_scores(t)
            # Only the per-class scores take part in the arg-max; the
            # aggregated "compound" score is discarded.
            del result["compound"]
            # On ties, max() keeps whichever key comes first in the dict.
            max_key = max(result, key=result.get)
            pred.append(sent_dic[max_key])

        fold_metrics.append({
            "accuracy": accuracy_score(target, pred),
            "f1_score": f1_score(target, pred, average="weighted"),
        })

    df_metrics = pd.DataFrame(fold_metrics, columns=["accuracy", "f1_score"])

    # Files whose parent directory name has more than two '-'-separated parts
    # (e.g. "gpt4-openai-classify") are tagged "openai"; all others are tagged
    # "opensource".
    parent_dir = os.path.basename(os.path.dirname(data_path))
    flag = "openai" if len(parent_dir.split('-')) > 2 else "opensource"

    # display(df_metrics.head())
    df_metrics.to_csv(
        os.path.join(output_dir, f"{flag}-{os.path.basename(data_path)}"),
        index=False,
    )

In [None]:
# Report, for every metrics CSV in the experiment folder, the best and mean
# per-fold F1-score together with a 95% Student-t confidence interval.
results_dir = "/home/neemias/PerceptSent-LLM-approach/experiments/vader-experiment"
csv_files = [
    os.path.join(results_dir, name)
    for name in os.listdir(results_dir)
    if name.endswith(".csv")
]


def _t_interval(scores, center, confidence=0.95):
    """Return the Student-t interval around *center* for the given scores.

    Uses len(scores) - 1 degrees of freedom and the standard error of the
    mean of *scores* as the scale.
    """
    return stats.t.interval(
        confidence,
        len(scores) - 1,
        loc=center,
        scale=stats.sem(scores),
    )


for csv_file in csv_files:
    fold_scores = pd.read_csv(csv_file)["f1_score"].to_list()
    mean_f1 = np.mean(fold_scores)

    # Interval centred on the mean F1-score.
    mean_ci = _t_interval(fold_scores, mean_f1)

    # The problem name is the file name up to its first '.'.
    problem = csv_file.split('/')[-1].split('.')[0]
    print(f"\n\nProblem: {problem}")

    best_f1 = max(fold_scores)
    print(f"Max F1-score: {best_f1}")
    print(f"Average F1-score: {mean_f1}")
    print(f"Confidence interval 95%: {mean_ci}")

    lower_gap = abs(mean_ci[0] - mean_f1)
    upper_gap = abs(mean_ci[1] - mean_f1)
    print(f"Inteval: {lower_gap} - Interval: {upper_gap}")

    # Same interval re-centred on the best fold score.
    # NOTE(review): only the centre changes, so the half-widths printed here
    # should match the ones above up to rounding — confirm this is intended.
    best_ci = _t_interval(fold_scores, best_f1)
    lower_gap = abs(best_ci[0] - best_f1)
    upper_gap = abs(best_ci[1] - best_f1)
    print(f"Inteval: {lower_gap} - Interval: {upper_gap}")