In [1]:
import pandas as pd
import nlpaug.augmenter.word as naw
from collections import Counter

In [2]:
df = pd.read_excel('DATA_POLICY.xlsx')

In [4]:
min_samples = 500 #Nombre minimum d'enregistrements souhaité pour chaque classe

# Augmentateur NLPAug basé sur BERT pour le paraphrasage
augmenter = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action='substitute', top_k=10
)

In [5]:
# Fonction pour générer des données augmentées
def augment_class(df, category, augmenter, target_size):
    class_data = df[df['Category'] == category]
    augmented_texts = []
    
    while len(class_data) + len(augmented_texts) < target_size:
        for _, row in class_data.iterrows():
            if len(class_data) + len(augmented_texts) >= target_size:
                break
            augmented_text = augmenter.augment(row['Content'])
            augmented_texts.append({'Category': category, 'Content': augmented_text})
    
    return pd.DataFrame(augmented_texts)

In [6]:
# Compter le nombre d'échantillons par classe
class_counts = Counter(df['Category'])
print("Répartition des classes avant augmentation :", class_counts)

Répartition des classes avant augmentation : Counter({'Coronavirus': 5008, 'Economy': 4992, 'Europe': 4501, 'Africa': 3305, 'Sports': 2342, 'Science & Technology': 523, 'Climate Crisis': 502, 'Middle East': 501, 'Asia': 271, 'Opinion': 14})


In [None]:
# Appliquer l'augmentation pour les classes sous-représentées
augmented_data = []
for category, count in class_counts.items():
    if count < min_samples:
        print(f"Augmenting class: {category}")
        augmented_data.append(augment_class(df, category, augmenter, min_samples))

Augmenting class: Middle East


In [11]:
augmented_data

[    Category                                            Content
 0    Opinion  [" " the far east : goodbye america, hello chi...
 1    Opinion  [russia faces a new neighbourhood threat : chi...
 2    Opinion  [" " kazakhstan is a bridge between the north,...
 3    Opinion  [" " in somalia, the rains have come but the c...
 4    Opinion  [" " sanctions on russian may not be working, ...
 ..       ...                                                ...
 481  Opinion  [america and the lula and right on global affa...
 482  Opinion  [‘ king ’ modi ’ s sceptre and the wrestlers w...
 483  Opinion  [change is coming to capitol hill and israel w...
 484  Opinion  [why ‘ white ’ supremacists but not always whi...
 485  Opinion  [and post - pandemic homage to catalonia, " " ...
 
 [486 rows x 2 columns],
     Category                                            Content
 0       Asia  [new trials show increased survival rates for ...
 1       Asia  [dozens of bodies remain unclaimed days after ..

In [12]:
# Combiner les données originales et augmentées
augmented_df = pd.concat([df] + augmented_data, ignore_index=True)

In [13]:
# Vérifier la nouvelle répartition des classes
new_class_counts = Counter(augmented_df['Category'])
print("Répartition des classes après augmentation :", new_class_counts)

Répartition des classes après augmentation : Counter({'Coronavirus': 5008, 'Economy': 4992, 'Europe': 4501, 'Africa': 3305, 'Sports': 2342, 'Science & Technology': 523, 'Climate Crisis': 502, 'Middle East': 501, 'Opinion': 500, 'Asia': 500})


In [14]:
# Sauvegarder le dataset augmenté
augmented_data_path = "augmented_dataset.csv"
augmented_df.to_csv(augmented_data_path, index=False)
print(f"Dataset augmenté sauvegardé sous : {augmented_data_path}")

Dataset augmenté sauvegardé sous : augmented_dataset.csv
