In [None]:
import pandas as pd
import random
from sklearn.utils import resample

In [None]:
# Load the original annotated dataset
df = pd.read_csv("absa_final_annotated_dataset.csv")

# Split into one sub-frame per sentiment label
labels = df['label']
df_positive = df.loc[labels == 'positive']
df_negative = df.loc[labels == 'negative']
df_neutral = df.loc[labels == 'neutral']

# Balancing target: the size of the positive class. The two counts below are
# the difference between that target and the size of each other class.
target_count = df_positive.shape[0]
num_to_generate_negative = target_count - df_negative.shape[0]
num_to_generate_neutral = target_count - df_neutral.shape[0]

In [None]:
# Simple template-based paraphrase in English
def simple_paraphrase_en(sentence, aspect, sentiment, rng=None):
    """Return a templated English sentence about ``aspect`` for a sentiment.

    For ``'negative'`` and ``'neutral'`` one of five fixed templates is chosen
    at random and filled with ``aspect``; ``sentence`` is not used in that
    case. For any other ``sentiment`` the original ``sentence`` is returned
    unchanged and ``aspect`` is never touched.

    Args:
        sentence: Original sentence, returned as-is when ``sentiment`` is
            neither ``'negative'`` nor ``'neutral'``.
        aspect: Aspect term (a string) inserted into the chosen template.
        sentiment: ``'negative'``, ``'neutral'``, or any other value.
        rng: Optional object exposing ``choice(seq)``, e.g. a seeded
            ``random.Random``, for reproducible template selection. When
            ``None`` the global ``random`` module is used.

    Returns:
        The generated template sentence, or ``sentence`` itself.
    """
    # Pass-through first: no template is built, so a non-string aspect
    # cannot raise when the sentence is returned unchanged anyway.
    if sentiment not in ('negative', 'neutral'):
        return sentence

    # Upper-case only the first character. str.capitalize() would also
    # lower-case the remainder and turn "USB port" into "Usb port".
    aspect_initial_cap = aspect[:1].upper() + aspect[1:]

    if sentiment == 'negative':
        templates = [
            f"I am not happy with the {aspect}.",
            f"The {aspect} is problematic.",
            f"{aspect_initial_cap} was truly disappointing.",
            f"I had a bad experience with the {aspect}.",
            f"The {aspect} did not meet my expectations."
        ]
    else:
        templates = [
            f"The {aspect} was fine.",
            f"I have no strong feelings about the {aspect}.",
            f"{aspect_initial_cap} was okay, nothing special.",
            f"The {aspect} is acceptable.",
            f"My opinion on the {aspect} is neutral."
        ]

    chooser = random if rng is None else rng
    return chooser.choice(templates)

In [None]:
# Generate synthetic negative rows
# simple_paraphrase_en picks its template through the global `random` module,
# so seed it here to make the generated sentences reproducible.
random.seed(42)

# A non-positive deficit means the class already meets the target: generate nothing.
n_missing_negative = max(num_to_generate_negative, 0)

generated_negative = []
if n_missing_negative > 0:
    if df_negative.empty:
        raise ValueError(
            "Cannot augment the 'negative' class: there are no source rows to sample from."
        )

    # Draw every source row in a single call instead of building a new RNG
    # per row; sampling is with replacement because more rows than exist
    # may be needed.
    sampled_negative = df_negative.sample(
        n=n_missing_negative, replace=True, random_state=42
    )

    for i, (_, row) in enumerate(sampled_negative.iterrows()):
        new_sentence = simple_paraphrase_en(row['sentence'], row['aspect'], 'negative')
        # The synthetic row keeps the source row's aspect and metadata;
        # only the id and the sentence are new.
        generated_negative.append({
            "review_id": f"gen_neg_{i}",
            "sentence": new_sentence,
            "aspect": row['aspect'],
            "original_rating": row['original_rating'],
            "sentiment_initial": row['sentiment_initial'],
            "label": "negative"
        })

In [None]:
# Generate synthetic neutral rows
# simple_paraphrase_en picks its template through the global `random` module,
# so seed it here to make the generated sentences reproducible.
random.seed(999)

# A non-positive deficit means the class already meets the target: generate nothing.
n_missing_neutral = max(num_to_generate_neutral, 0)

generated_neutral = []
if n_missing_neutral > 0:
    if df_neutral.empty:
        raise ValueError(
            "Cannot augment the 'neutral' class: there are no source rows to sample from."
        )

    # Draw every source row in a single call instead of building a new RNG
    # per row; sampling is with replacement because more rows than exist
    # may be needed.
    sampled_neutral = df_neutral.sample(
        n=n_missing_neutral, replace=True, random_state=999
    )

    for i, (_, row) in enumerate(sampled_neutral.iterrows()):
        new_sentence = simple_paraphrase_en(row['sentence'], row['aspect'], 'neutral')
        # The synthetic row keeps the source row's aspect and metadata;
        # only the id and the sentence are new.
        generated_neutral.append({
            "review_id": f"gen_neu_{i}",
            "sentence": new_sentence,
            "aspect": row['aspect'],
            "original_rating": row['original_rating'],
            "sentiment_initial": row['sentiment_initial'],
            "label": "neutral"
        })

In [None]:
# Convert each list of synthetic row dicts into its own DataFrame
df_generated_negative, df_generated_neutral = map(
    pd.DataFrame, (generated_negative, generated_neutral)
)

In [None]:
# Final merge: stack the original and synthetic frames, then shuffle
parts = [
    df_positive,
    df_negative,
    df_generated_negative,
    df_neutral,
    df_generated_neutral,
]
stacked = pd.concat(parts)
# frac=1.0 returns every row in random order; the fixed seed keeps the
# order stable, and the old index is dropped afterwards
shuffled = stacked.sample(frac=1.0, random_state=42)
df_final = shuffled.reset_index(drop=True)

In [None]:
# Save the balanced dataset to disk, without writing the index
df_final.to_csv(path_or_buf="absa_balanced_augmented_dataset_en.csv", index=False)

print("Dataset generated and saved as 'absa_balanced_augmented_dataset_en.csv'")
# Report the number of rows per label
label_counts = df_final['label'].value_counts()
print("Class distribution:", label_counts, sep="\n")

Dataset generated and saved as 'absa_balanced_augmented_dataset_en.csv'
Class distribution:
label
negative    5990
neutral     5990
positive    5990
Name: count, dtype: int64


In [None]:
# Preview the first five rows of the final dataset
df_final.head(n=5)

Unnamed: 0,review_id,sentence,aspect,original_rating,sentiment_initial,label
0,2077,Poor fabric quality Looks great in the picture...,mail,1,negative,negative
1,gen_neu_3928,My opinion on the reference is neutral.,reference,2,negative,neutral
2,15794,So disappointing I love the photo of this and ...,reviews,1,negative,positive
3,14696,the fabric is pretty.,pretty,3,neutral,positive
4,gen_neg_629,The large is problematic.,large,5,positive,negative
