In [None]:
!pip install nlpaug



In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import time
from tqdm import tqdm

data = pd.read_csv('/content/copopsofinaldataset.csv')
data = data[['CO Description', 'PO/PSO Description', 'Score (0-3)']]

In [None]:
score_counts = data['Score (0-3)'].value_counts()
print(score_counts)

Score (0-3)
0    1134
3     580
2      84
1      42
Name: count, dtype: int64


In [None]:
#Augmentation
import pandas as pd
import nlpaug.augmenter.word as naw
import random

data = pd.read_csv('/content/copopsofinaldataset.csv')

aug = naw.SynonymAug(aug_p=0.3)

def augment_text_with_synonyms(text, n=1):
    augmented_texts = []
    for _ in range(n):
        augmented_text = aug.augment(text)
        augmented_texts.append(augmented_text[0])
    return augmented_texts

balanced_data = pd.DataFrame()

target_samples = 2500

for score in range(4):
    class_data = data[data['Score (0-3)'] == score]
    num_needed = target_samples - len(class_data)

    augmented_samples = []
    if num_needed > 0:
        for _, row in class_data.iterrows():
            samples_per_row = max(1, num_needed // len(class_data))
            augmented_texts = augment_text_with_synonyms(row['CO Description'], n=samples_per_row)

            for text in augmented_texts:
                new_row = row.copy()
                new_row['CO Description'] = text
                augmented_samples.append(new_row)

            num_needed -= len(augmented_texts)
            if num_needed <= 0:
                break

    if num_needed > 0:
        for _, row in class_data.iterrows():
            samples_per_row = max(1, num_needed // len(class_data))
            augmented_texts = augment_text_with_synonyms(row['PO/PSO Description'], n=samples_per_row)

            for text in augmented_texts:
                new_row = row.copy()
                new_row['PO/PSO Description'] = text
                augmented_samples.append(new_row)

            num_needed -= len(augmented_texts)
            if num_needed <= 0:
                break

    augmented_df = pd.DataFrame(augmented_samples[:target_samples - len(class_data)])
    balanced_class_data = pd.concat([class_data, augmented_df])

    balanced_data = pd.concat([balanced_data, balanced_class_data])

balanced_data = balanced_data.sample(frac=1).reset_index(drop=True)

print(balanced_data['Score (0-3)'].value_counts())

balanced_data.to_csv('/content/balanced_copopsofinaldataset_new1.csv', index=False)

Score (0-3)
0    2500
1    2158
3    2151
2    2140
Name: count, dtype: int64


In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
#Augmentation (Without Shuffling).
import pandas as pd
import nlpaug.augmenter.word as naw
import random

data = pd.read_csv('/content/copopsofinaldataset.csv')

aug = naw.SynonymAug(aug_p=0.3)

def augment_text_with_synonyms(text, n=1):
    augmented_texts = []
    for _ in range(n):
        augmented_text = aug.augment(text)
        augmented_texts.append(augmented_text[0])
    return augmented_texts

balanced_data = pd.DataFrame()

target_samples = 2500

for score in range(4):
    class_data = data[data['Score (0-3)'] == score]
    num_needed = target_samples - len(class_data)

    augmented_samples = []
    if num_needed > 0:
        for _, row in class_data.iterrows():
            samples_per_row = max(1, num_needed // len(class_data))
            augmented_texts = augment_text_with_synonyms(row['CO Description'], n=samples_per_row)

            for text in augmented_texts:
                new_row = row.copy()
                new_row['CO Description'] = text
                augmented_samples.append(new_row)

            num_needed -= len(augmented_texts)
            if num_needed <= 0:
                break

    if num_needed > 0:
        for _, row in class_data.iterrows():
            samples_per_row = max(1, num_needed // len(class_data))
            augmented_texts = augment_text_with_synonyms(row['PO/PSO Description'], n=samples_per_row)

            for text in augmented_texts:
                new_row = row.copy()
                new_row['PO/PSO Description'] = text
                augmented_samples.append(new_row)

            num_needed -= len(augmented_texts)
            if num_needed <= 0:
                break

    augmented_df = pd.DataFrame(augmented_samples[:target_samples - len(class_data)])
    balanced_class_data = pd.concat([class_data, augmented_df])

    balanced_data = pd.concat([balanced_data, balanced_class_data])

# No shuffling, keeping the original order intact

print(balanced_data['Score (0-3)'].value_counts())

balanced_data.to_csv('/content/balanced_copopsofinaldataset_new1_withoutshuffling.csv', index=False)

Score (0-3)
0    2500
1    2158
3    2151
2    2140
Name: count, dtype: int64
