# Preprocess Raw Data to be ready to Train

## Purpose

This notebook prepares the dataset for the deep neural network architectures that will be used to classify sentiment and to compare the results with the PerceptSent paper.

## Import Libraries

In [11]:
import os
import shutil
import pandas as pd
import nltk
import gensim
from tqdm.notebook import tqdm
# Deep learning packages
import torch
from sklearn.model_selection import train_test_split

from collections import Counter
# from textaugment import Word2vec
# from textaugment import Fasttext
# from textaugment import Wordnet
# from textaugment import Translate
# from textaugment import Word2vec, Fasttext
# from textaugment import Translate

# Visualization packages
import matplotlib.pyplot as plt
import seaborn

In [None]:
# Fetch the NLTK resources named below (WordNet, the Punkt tokenizer models and
# the averaged-perceptron POS tagger) into the local NLTK data directory.
# NOTE(review): nothing visible in this notebook uses them; presumably they were
# needed by the commented-out textaugment imports -- confirm before removing.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
# Pick the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

## Load DataFrame

In [None]:
# IPython shell escape: list the parent directory to check that the relative
# "../data/..." paths used below resolve from the current working directory.
!ls ..

In [None]:
# Caption sources that were tried before and are currently disabled:
# df = pd.read_csv("../data/raw/image_caption_mini_GPT4.csv") # captioning data for miniGPT-4
# df = pd.read_csv("../data/raw/image_caption_blip.csv") # captioning data for Blip Large model

# miniGPT-4 captions (loaded, but not referenced by the cells visible below).
df_mini = pd.read_csv("../data/raw/image_caption_minigpt4_completed.csv")

# Semicolon-separated captions; judging by the file name these come from
# GPT-4 (OpenAI), not miniGPT-4.
df = pd.read_csv("../data/raw/image_caption_gpt4_openai.csv", sep=';')

# One Google Drive view link per row, built from the image id.
df["url"] = [
    "https://drive.google.com/uc?export=view&id=" + str(image_id)
    for image_id in df["id"]
]

# Show both ends of the table to sanity-check the load.
display(df.head(), df.tail())

In [None]:
df.shape

In [6]:
# df["id"] = df["image_path"].apply(lambda x: x.split('/')[-1].split('.')[0]) # for miniGPT4

In [None]:
df.shape

In [None]:
df.head()

In [9]:
# df["caption"] = df["caption"].apply(lambda x: x.replace("The image shows", '').replace("This is", '').replace("This image", '').replace("The image", '').replace("\u200b", '').replace("\n", ' ').replace("<Img>", ''))

In [10]:
# model = gensim.models.KeyedVectors.load_word2vec_format('../models/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [25]:
import json

# Parse the annotation file into a dictionary. The context manager closes the
# handle even when parsing fails, and the encoding is explicit because JSON
# text is UTF-8 while the default of open() depends on the platform.
with open("../data/raw/dataset.json", encoding="utf-8") as f:
    data = json.load(f)

In [26]:
# Maps each of the five annotated sentiment labels to a grouped class: the
# "Slightly*" labels are merged into their strong counterpart (three classes).
# The commented entries keep the "Slightly*" labels as classes of their own.
# NOTE(review): no cell visible in this notebook reads `sentiment_dict`; the
# loops below use `simple_sentiment` instead -- confirm whether it is still needed.
sentiment_dict = {
    "SlightlyNegative": "Negative",
    # "SlightlyNegative": "SlightlyNegative",
    "SlightlyPositive": "Positive",
    # "SlightlyPositive": "SlightlyPositive",
    "Neutral": "Neutral",
    "Positive": "Positive",
    "Negative": "Negative"
}

In [None]:
def ia_calculation(sg):
    """
    Compute the IA (Image Agreement) metric from a grouped sentiment vector.

    Parameters:
    sg (list or tuple): Grouped sentiment vector; each element is the number
        of votes that one sentiment category received.

    Returns:
    float: Share of all votes that went to the most voted category, or 0.0
        when the vector holds no votes at all.
    """
    # The number of evaluators is the number of votes cast, i.e. the sum of the
    # per-category counts -- not the number of slots in the vector.
    total_votes = sum(sg)
    if total_votes == 0:
        # Covers both an empty vector and a vector of zeros.
        return 0.0
    return max(sg) / total_votes

# Per-image record keyed by image id; the lists are filled by the loops below.
sentiment_data = {}

for image_id, caption in tqdm(zip(df["id"], df["caption"]), total=len(df), desc="Create supervised dataset"):
    sentiment_data[image_id] = {
        "sparse sentiment": [],    # raw labels, one per annotator vote
        "cluster sentiment": [],   # the same votes after `simple_sentiment` grouping
        "perceptions": [],
        "caption": caption,
        "image_agreement": None,   # set by the agreement loop at the end of this cell
    }

# Grouping applied to every raw vote (three-class problem). The commented
# entries keep the "Slightly*" labels as classes of their own.
simple_sentiment = {
    "Positive": "Positive",
    "SlightlyPositive": "Positive",
    # "SlightlyPositive": "SlightlyPositive",
    "Neutral": "Neutral",
    "SlightlyNegative": "Negative",
    # "SlightlyNegative": "SlightlyNegative",
    "Negative": "Negative"
}
# sentiment_idx = {
#     "Positive": 4,
#     "SlightlyPositive": 3,
#     "Neutral": 2,
#     "SlightlyNegative": 1,
#     "Negative": 0
# }
# Position of each grouped class. The agreement loop below does not index with
# this table (the raw "Slightly*" labels are not keys of it, which used to raise
# KeyError); the name stays defined in case code outside this cell reads it.
sentiment_idx = {
    "Positive": 2,
    "Neutral": 0,
    "Negative": 1
}


# Attach every annotator vote found in the JSON tasks to its captioned image.
for samples in tqdm(data["tasks"], desc="Image perceptions"):
    for sample in samples["images"]:
        entry = sentiment_data.get(sample["id"])
        if entry is None:
            # Annotation for an image that has no caption in `df`.
            continue

        sentiment = sample["sentiment"]
        entry["sparse sentiment"].append(sentiment)
        entry["cluster sentiment"].append(simple_sentiment[sentiment])
        entry["perceptions"].extend(str(per) for per in sample["perceptions"])

# Image agreement: share of the votes received by the most voted grouped class.
for entry in tqdm(sentiment_data.values(), desc="Image agreement calculation"):
    votes = entry["cluster sentiment"]
    if not votes:
        # Captioned image without any annotation: there is nothing to agree on.
        entry["image_agreement"] = 0.0
        continue

    counts = Counter(votes)
    # Votes per grouped class, zero-padded to one slot per vote so that both
    # len(sg) and sum(sg) equal the number of annotators.
    sg = list(counts.values()) + [0] * (len(votes) - len(counts))
    entry["image_agreement"] = ia_calculation(sg)

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Minimum number of annotators that must agree on the majority sentiment for an
# image to be kept: 3 -> alpha3; 4 -> alpha4; 5 -> alpha5
MIN_AGREEING_VOTES = 5

# NOTE: `data` held the parsed JSON until here and is rebound to the training
# table, so the vote-collection cell cannot be re-run after this one without
# loading the JSON again.
data = {
    "text": [],
    "target": [],
}

# sentiment_values = {
#     "Negative": 0,
#     "SlightlyNegative": 1,
#     "Neutral": 2,
#     "SlightlyPositive": 3,
#     "Positive": 4,
# }
# Target encoding of the majority class.
# NOTE(review): Neutral and Negative share target 0, yet they are still counted
# as separate classes when the majority is computed -- confirm this is intended.
sentiment_values = {
    "Negative": 0,
    "Neutral": 0,
    "Positive": 1,
}

for entry in tqdm(sentiment_data.values(), desc="Zero-shot-classification progress"):
    votes = entry["cluster sentiment"]
    if not votes:
        # No annotation for this caption, hence no majority class to read.
        continue

    most_common_sentiment, frequency = Counter(votes).most_common(1)[0]
    if frequency >= MIN_AGREEING_VOTES:
        data["text"].append(entry["caption"])
        data["target"].append(sentiment_values[most_common_sentiment])

In [None]:
data["text"][0], data["target"][0]

## Analyze the Target Distribution

In [35]:
def plot_sentiment_distribution(df, mapping=None):
    """
    Plot the distribution of sentiments as a bar chart.

    Parameters:
        df (DataFrame): DataFrame containing the "sentiment" column with the
            encoded targets. It is not modified.
        mapping (dict, optional): Maps each encoded target to the label shown
            on the x axis. Defaults to {0: "Negative", 1: "Positive"}. Values
            missing from the mapping are left out of the plot.

    Returns:
        None
    """
    if mapping is None:
        mapping = {0: "Negative", 1: "Positive"}
        # Other encodings used in this notebook:
        # mapping = {1: "Negative", 0: "Neutral", 2: "Positive"}
        # mapping = {0: "Negative", 1: "SlightlyNegative", 2: "Neutral", 3: "SlightlyPositive", 4: "Positive"}

    # Map on a separate Series so the caller's column keeps its numeric codes.
    sentiment_counts = df["sentiment"].map(mapping).value_counts()

    plt.figure(figsize=(6, 4))
    sentiment_counts.plot(kind="bar", color="skyblue")
    plt.title("Sentiment Distribution")
    plt.xlabel("Sentiment")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

In [None]:
plot_sentiment_distribution(pd.DataFrame({"sentiment": data["target"]}))

In [None]:
data["text"][:5], data["target"][:5]

In [None]:
len(data["target"])

## Train & Test Split

## Save DataFrames

In [55]:
# train_df.to_csv("../data/train/train.csv", index=False)
# # aug_train_df.to_csv("../data/train/aug_train.csv", index=False)
# val_df.to_csv("../data/validation/val.csv", index=False)
# test_df.to_csv("../data/test/test.csv", index=False)

In [51]:
# pd.DataFrame({"text": data["text"], "sentiment": data["target"]}).to_csv("../data/percept_dataset_alpha3_p5.csv", index=False)
# pd.DataFrame({"text": data["text"], "sentiment": data["target"]}).to_csv("../data/percept_dataset_alpha3_p3.csv", index=False)
# pd.DataFrame({"text": data["text"], "sentiment": data["target"]}).to_csv("../data/percept_dataset_alpha3_p2plus.csv", index=False)
# pd.DataFrame({"text": data["text"], "sentiment": data["target"]}).to_csv("../data/percept_dataset_alpha3_p2neg.csv", index=False)

In [95]:
# pd.DataFrame({"text": data["text"], "sentiment": data["target"]}).to_csv("../data/percept_dataset_alpha4_p5.csv", index=False)
# pd.DataFrame({"text": data["text"], "sentiment": data["target"]}).to_csv("../data/percept_dataset_alpha4_p3.csv", index=False)
# pd.DataFrame({"text": data["text"], "sentiment": data["target"]}).to_csv("../data/percept_dataset_alpha4_p2plus.csv", index=False)
# pd.DataFrame({"text": data["text"], "sentiment": data["target"]}).to_csv("../data/percept_dataset_alpha4_p2neg.csv", index=False)

In [39]:
# pd.DataFrame({"text": data["text"], "sentiment": data["target"]}).to_csv("../data/percept_dataset_alpha5_p5.csv", index=False)
# pd.DataFrame({"text": data["text"], "sentiment": data["target"]}).to_csv("../data/percept_dataset_alpha5_p3.csv", index=False)
# pd.DataFrame({"text": data["text"], "sentiment": data["target"]}).to_csv("../data/percept_dataset_alpha5_p2plus.csv", index=False)
# pd.DataFrame({"text": data["text"], "sentiment": data["target"]}).to_csv("../data/percept_dataset_alpha5_p2neg.csv", index=False)

# Generate Data for MiniGPT-4 to Classify Sentiment

In [6]:
import pandas as pd
import warnings
from tqdm.notebook import tqdm
warnings.filterwarnings("ignore")

In [None]:
# Alternative caption source, currently disabled:
# df = pd.read_csv("../data/raw/image_caption_minigpt4_completed.csv") # captioning data for miniGPT-4

# Semicolon-separated captions; judging by the file name these come from
# GPT-4 (OpenAI), not miniGPT-4.
df = pd.read_csv("../data/raw/image_caption_gpt4_openai.csv", sep=';')

# One Google Drive view link per row, built from the image id.
df["url"] = [
    "https://drive.google.com/uc?export=view&id=" + str(image_id)
    for image_id in df["id"]
]
df.head()

In [8]:
import json

# Parse the annotation file. The context manager closes the handle even when
# parsing fails, and the encoding is explicit because JSON text is UTF-8 while
# the default of open() depends on the platform.
with open("../data/raw/dataset.json", encoding="utf-8") as f:
    data_json = json.load(f)

In [9]:
def ia_calculation(sg):
    """
    Compute the IA (Image Agreement) metric from a grouped sentiment vector.

    Parameters:
    sg (list or tuple): Grouped sentiment vector; each element is the number
        of votes that one sentiment category received.

    Returns:
    float: Share of all votes that went to the most voted category, or 0.0
        when the vector holds no votes at all.
    """
    # The number of evaluators is the number of votes cast, i.e. the sum of the
    # per-category counts -- not the number of slots in the vector.
    total_votes = sum(sg)
    if total_votes == 0:
        # Covers both an empty vector and a vector of zeros.
        return 0.0
    return max(sg) / total_votes

In [None]:
# Folder that receives one CSV per (agreement threshold, problem) pair. It is
# created up front so that to_csv does not fail on a fresh checkout.
output_dir = "../data/gpt4-openai-classify"
os.makedirs(output_dir, exist_ok=True)

# Each approach regroups the five annotated labels into a different problem:
#   P5  - the five labels are kept as they are
#   P3  - "Slightly*" labels are merged into their strong counterpart
#   P2+ - as P3, with Neutral counted as Positive
#   P2- - as P3, with Neutral counted as Negative
# `simple_sentiment` performs the grouping and `sentiment_values` encodes the
# majority class as the integer target written to the CSV.
for approach in range(0, 4):
    if approach == 0: ## P5
        p = 'p5'
        sentiment_dict = {
            "SlightlyNegative": "SlightlyNegative",
            "SlightlyPositive": "SlightlyPositive",
            "Neutral": "Neutral",
            "Positive": "Positive",
            "Negative": "Negative"
        }
        simple_sentiment = {
            "Positive": "Positive",
            "SlightlyPositive": "SlightlyPositive",
            "Neutral": "Neutral",
            "SlightlyNegative": "SlightlyNegative",
            "Negative": "Negative"
        }
        sentiment_idx = {
            "Positive": 4,
            "SlightlyPositive": 3,
            "Neutral": 2,
            "SlightlyNegative": 1,
            "Negative": 0
        }
        sentiment_values = {
            "Negative": 0,
            "SlightlyNegative": 1,
            "Neutral": 2,
            "SlightlyPositive": 3,
            "Positive": 4,
        }
    elif approach == 1: ## P3
        p = 'p3'
        sentiment_dict = {
            "SlightlyNegative": "Negative",
            "SlightlyPositive": "Positive",
            "Neutral": "Neutral",
            "Positive": "Positive",
            "Negative": "Negative"
        }
        simple_sentiment = {
            "Positive": "Positive",
            "SlightlyPositive": "Positive",
            "Neutral": "Neutral",
            "SlightlyNegative": "Negative",
            "Negative": "Negative"
        }
        sentiment_idx = {
            "SlightlyPositive": 2,
            "Positive": 2,
            "Neutral": 0,
            "Negative": 1,
            "SlightlyNegative": 1,
        }
        sentiment_values = {
            "Negative": 1,
            "Neutral": 0,
            "Positive": 2,
        }
    elif approach == 2: ## P2+
        p = 'p2plus'
        sentiment_dict = {
            "SlightlyNegative": "Negative",
            "SlightlyPositive": "Positive",
            "Neutral": "Positive",
            "Positive": "Positive",
            "Negative": "Negative"
        }
        simple_sentiment = {
            "Positive": "Positive",
            "SlightlyPositive": "Positive",
            "Neutral": "Positive",
            "SlightlyNegative": "Negative",
            "Negative": "Negative"
        }
        sentiment_idx = {
            "SlightlyPositive": 0,
            "Positive": 0,
            "Neutral": 0,
            "Negative": 1,
            "SlightlyNegative": 1,
        }
        sentiment_values = {
            "Negative": 1,
            "Neutral": 0,
            "Positive": 0,
        }
    elif approach == 3: ## P2-
        p = 'p2neg'
        sentiment_dict = {
            "SlightlyNegative": "Negative",
            "SlightlyPositive": "Positive",
            "Neutral": "Negative",
            "Positive": "Positive",
            "Negative": "Negative"
        }
        simple_sentiment = {
            "Positive": "Positive",
            "SlightlyPositive": "Positive",
            "Neutral": "Negative",
            "SlightlyNegative": "Negative",
            "Negative": "Negative"
        }
        sentiment_idx = {
            "SlightlyPositive": 1,
            "Positive": 1,
            "Neutral": 0,
            "Negative": 0,
            "SlightlyNegative": 0,
        }
        sentiment_values = {
            "Negative": 0,
            "Neutral": 0,
            "Positive": 1,
        }

    # Per-image record keyed by image id; the lists are filled by the loops below.
    sentiment_data = {}

    for image_id, caption in tqdm(zip(df["id"], df["caption"]), total=len(df), desc="Create supervised dataset"):
        sentiment_data[image_id] = {
            "sparse sentiment": [],    # raw labels, one per annotator vote
            "cluster sentiment": [],   # the same votes after `simple_sentiment` grouping
            "perceptions": [],
            "caption": caption,
            "image_agreement": None,   # set by the agreement loop below
        }

    # Attach every annotator vote found in the JSON tasks to its captioned image.
    for samples in tqdm(data_json["tasks"], desc="Image perceptions"):
        for sample in samples["images"]:
            entry = sentiment_data.get(sample["id"])
            if entry is None:
                # Annotation for an image that has no caption in `df`.
                continue

            sentiment = sample["sentiment"]
            entry["sparse sentiment"].append(sentiment)
            entry["cluster sentiment"].append(simple_sentiment[sentiment])
            entry["perceptions"].extend(str(per) for per in sample["perceptions"])

    # Image agreement: share of the votes received by the most voted grouped
    # class. Counting the already-grouped votes accumulates raw labels that fall
    # in the same class; assigning by `sentiment_idx` position let the last
    # label overwrite the others and could index past the end of the vector.
    for entry in tqdm(sentiment_data.values(), desc="Image agreement calculation"):
        votes = entry["cluster sentiment"]
        if not votes:
            # Captioned image without any annotation: there is nothing to agree on.
            entry["image_agreement"] = 0.0
            continue

        counts = Counter(votes)
        # Zero-padded to one slot per vote so that both len(sg) and sum(sg)
        # equal the number of annotators.
        sg = list(counts.values()) + [0] * (len(votes) - len(counts))
        entry["image_agreement"] = ia_calculation(sg)

    # One dataset per agreement threshold: 3 -> alpha3; 4 -> alpha4; 5 -> alpha5
    for min_votes in range(3, 6):

        data = {
            "text": [],
            "target": [],
            "id": []
        }
        for image_id, entry in tqdm(sentiment_data.items(), desc="Generate datasets"):
            votes = entry["cluster sentiment"]
            if not votes:
                # No annotation for this caption, hence no majority class to read.
                continue

            most_common_sentiment, frequency = Counter(votes).most_common(1)[0]
            if frequency >= min_votes:
                data["text"].append(entry["caption"])
                data["target"].append(sentiment_values[most_common_sentiment])
                data["id"].append(image_id)

        pd.DataFrame({"id": data["id"],
                      "text": data["text"],
                      "sentiment": data["target"]}).to_csv(
                          f"{output_dir}/percept_dataset_alpha{min_votes}_{p}.csv", index=False)
    
