In [60]:
# to find if you can use GPU for augment data
import torch
torch.cuda.is_available()
## cuda installation path: C:\Users\Admin\AppData\Local\Temp\CUDA

True

In [1]:
# import all libraries
import re
import string
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# this is the main library that we use to augment our text data
import nlpaug.augmenter.word.context_word_embs as aug

augmenter = aug.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert", device='cuda')

  from .autonotebook import tqdm as notebook_tqdm


In [62]:
# upload data to environment
# this data is generated from the scraper.ipynb file
all_data_original = pd.read_pickle('all_data_transformed_simplified.pkl')
all_data_original.rename(columns=lambda x: x.strip(),inplace=True)
all_data_original.head()

## Steps to augment texts data:
1. Split the data into train and test/validation
    * We need to split the data before augmenting them. This ensures that the model used during the test/validation stage has not yet seen the data.
    * We split the data to train versus test by 80:20 for both class 0 and class 1
2. We then augment ONLY the class 1 to match the quantity of class 0 data
    * We augment the data up to 1,500
    * Except GoodHealth. We only augment 700 new data for GoodHealth. This is because GoodHealth already has a high number of actual class 1 data.

In [75]:
# create a custom function to augment the data specifically for NoPoverty
def augmentNoPoverty(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['NoPoverty'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'NoPoverty': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df


# split the data into train vs test
NoPoverty_df = all_data_original.drop(['ZeroHunger','GoodHealth','QualityEducation','GenderEquality','CleanWater','AffordableCleanEnergy','DecentWork','IndustryInnovation','ReduceInequality','SustainableCities','ResponsibleConsumptionProduction','ClimateAction','LifeBelowWater','LifeonLand','PeaceJustice','Partnerships'], axis=1)
NoPoverty_train, NoPoverty_test = train_test_split(NoPoverty_df, 
                                                   random_state=10, 
                                                   test_size=0.20,
                                                   stratify=NoPoverty_df['NoPoverty'])

# run the data augmenter 
NoPoverty__train_aug = augmentNoPoverty(NoPoverty_train, augmenter, samples=1500)
NoPoverty__train_aug.head()

 44%|████▎     | 654/1500 [01:27<01:53,  7.45it/s]


KeyboardInterrupt: 

In [None]:
# save the augmented data into train file, and the test data into test file
NoPoverty__train_aug.to_csv('NoPoverty_train_aug_simplified.csv')
NoPoverty__train_aug.to_pickle('NoPoverty_train_aug_simplified.pkl')
NoPoverty_test.to_csv('NoPoverty_test_simplified.csv')
NoPoverty_test.to_pickle('NoPoverty_test_simplified.pkl')
print(NoPoverty_test['NoPoverty'].value_counts())
print(NoPoverty__train_aug['NoPoverty'].value_counts())

0    394
1      5
Name: NoPoverty, dtype: int64
0    1574
1    1520
Name: NoPoverty, dtype: int64


In [None]:
# create a custom function to augment the data specifically for ZeroHunger
def augmentZeroHunger(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['ZeroHunger'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'ZeroHunger': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

ZeroHunger_df = all_data_original.drop(['NoPoverty','GoodHealth','QualityEducation','GenderEquality','CleanWater','AffordableCleanEnergy','DecentWork','IndustryInnovation','ReduceInequality','SustainableCities','ResponsibleConsumptionProduction','ClimateAction','LifeBelowWater','LifeonLand','PeaceJustice','Partnerships'], axis=1)
ZeroHunger_train, ZeroHunger_test = train_test_split(ZeroHunger_df, 
                                                   random_state=11, 
                                                   test_size=0.20,
                                                   stratify=ZeroHunger_df['ZeroHunger'])
ZeroHunger_train_aug = augmentZeroHunger(ZeroHunger_train, augmenter, samples=1500)
ZeroHunger_train_aug.head()

100%|██████████| 1500/1500 [04:47<00:00,  5.21it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,ZeroHunger,transcripts_simplified
1434,0,gender gender heritage heritage voice land cul...
1822,1,[pay medical medical healthcare pay medical me...
1088,0,pay medical medical healthcare pay medical med...
1344,0,decision secondary decision health decision co...
474,0,education natural primary work work cultural n...


In [None]:
ZeroHunger_train_aug.to_csv('ZeroHunger_train_aug_simplified.csv')
ZeroHunger_train_aug.to_pickle('ZeroHunger_train_aug_simplified.pkl')
ZeroHunger_test.to_csv('ZeroHunger_test_simplified.csv')
ZeroHunger_test.to_pickle('ZeroHunger_test_simplified.pkl')
print(ZeroHunger_test['ZeroHunger'].value_counts())
print(ZeroHunger_train_aug['ZeroHunger'].value_counts())

0    392
1      7
Name: ZeroHunger, dtype: int64
0    1565
1    1529
Name: ZeroHunger, dtype: int64


In [None]:
# create a custom function to augment the data specifically for GoodHealth
def augmentGoodHealth(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['GoodHealth'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'GoodHealth': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

GoodHealth_df = all_data_original.drop(['NoPoverty','ZeroHunger','QualityEducation','GenderEquality','CleanWater','AffordableCleanEnergy','DecentWork','IndustryInnovation','ReduceInequality','SustainableCities','ResponsibleConsumptionProduction','ClimateAction','LifeBelowWater','LifeonLand','PeaceJustice','Partnerships'], axis=1)
GoodHealth_train, GoodHealth_test = train_test_split(GoodHealth_df, 
                                                   random_state=12, 
                                                   test_size=0.20,
                                                   stratify=GoodHealth_df['GoodHealth'])

# notice we uses sample=700 instead of 1500 for GoodHealth here
GoodHealth_train_aug = augmentGoodHealth(GoodHealth_train, augmenter, samples=700)
GoodHealth_train_aug.head()

100%|██████████| 700/700 [02:06<00:00,  5.52it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,GoodHealth,transcripts_simplified
1248,1,coronavirus covid resource resource coronaviru...
178,1,coronavirus covid resource resource coronaviru...
832,0,entrepreneurship entrepreneurship leadership e...
1725,1,[resource farm covid covid medicine education ...
617,0,industry autonomy economic network security so...


In [None]:
GoodHealth_train_aug.to_csv('GoodHealth_train_aug_simplified.csv')
GoodHealth_train_aug.to_pickle('GoodHealth_train_aug_simplified.pkl')
GoodHealth_test.to_csv('GoodHealth_test_simplified.csv')
GoodHealth_test.to_pickle('GoodHealth_test_simplified.pkl')
print(GoodHealth_test['GoodHealth'].value_counts())
print(GoodHealth_train_aug['GoodHealth'].value_counts())

0    212
1    187
Name: GoodHealth, dtype: int64
1    1445
0     849
Name: GoodHealth, dtype: int64


In [None]:
# create a custom function to augment the data specifically for QualityEducation
def augmentQualityEducation(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['QualityEducation'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'QualityEducation': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

QualityEducation_df = all_data_original.drop(['NoPoverty','ZeroHunger','GoodHealth','GenderEquality','CleanWater','AffordableCleanEnergy','DecentWork','IndustryInnovation','ReduceInequality','SustainableCities','ResponsibleConsumptionProduction','ClimateAction','LifeBelowWater','LifeonLand','PeaceJustice','Partnerships'], axis=1)
QualityEducation_train, QualityEducation_test = train_test_split(QualityEducation_df, 
                                                   random_state=13, 
                                                   test_size=0.20,
                                                   stratify=QualityEducation_df['QualityEducation'])
QualityEducation_train_aug = augmentQualityEducation(QualityEducation_train, augmenter, samples=1500)
QualityEducation_train_aug.head()

100%|██████████| 1500/1500 [03:55<00:00,  6.37it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,QualityEducation,transcripts_simplified
1040,0,health health health health health health heal...
1054,0,coronavirus covid resource resource coronaviru...
1145,0,coronavirus covid resource resource coronaviru...
1305,0,health health health health health health heal...
2149,1,[law applied law law law education skill educa...


In [None]:
QualityEducation_train_aug.to_csv('QualityEducation_train_aug_simplified.csv')
QualityEducation_train_aug.to_pickle('QualityEducation_train_aug_simplified.pkl')
QualityEducation_test.to_csv('QualityEducation_test_simplified.csv')
QualityEducation_test.to_pickle('QualityEducation_test_simplified.pkl')
print(QualityEducation_test['QualityEducation'].value_counts())
print(QualityEducation_train_aug['QualityEducation'].value_counts())

0    370
1     29
Name: QualityEducation, dtype: int64
1    1615
0    1479
Name: QualityEducation, dtype: int64


In [None]:
# create a custom function to augment the data specifically for GenderEquality
def augmentGenderEquality(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['GenderEquality'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'GenderEquality': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

GenderEquality_df = all_data_original.drop(['NoPoverty','ZeroHunger','GoodHealth','QualityEducation','CleanWater','AffordableCleanEnergy','DecentWork','IndustryInnovation','ReduceInequality','SustainableCities','ResponsibleConsumptionProduction','ClimateAction','LifeBelowWater','LifeonLand','PeaceJustice','Partnerships'], axis=1)
GenderEquality_train, GenderEquality_test = train_test_split(GenderEquality_df, 
                                                   random_state=14, 
                                                   test_size=0.20,
                                                   stratify=GenderEquality_df['GenderEquality'])
GenderEquality_train_aug = augmentGenderEquality(GenderEquality_train, augmenter, samples=1500)
GenderEquality_train_aug.head()

100%|██████████| 1500/1500 [04:29<00:00,  5.56it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,GenderEquality,transcripts_simplified
1489,0,work education work education medicine medicin...
442,0,financial leadership education inclusion resou...
772,0,city resource primary city medicine medicine p...
1168,0,medicine education medical medical medicine ch...
636,0,resource farm covid covid medicine medicine ed...


In [None]:
GenderEquality_train_aug.to_csv('GenderEquality_train_aug_simplified.csv')
GenderEquality_train_aug.to_pickle('GenderEquality_train_aug_simplified.pkl')
GenderEquality_test.to_csv('GenderEquality_test_simplified.csv')
GenderEquality_test.to_pickle('GenderEquality_test_simplified.pkl')
print(GenderEquality_test['GenderEquality'].value_counts())
print(GenderEquality_train_aug['GenderEquality'].value_counts())

0    378
1     21
Name: GenderEquality, dtype: int64
1    1582
0    1512
Name: GenderEquality, dtype: int64


In [None]:
# create a custom function to augment the data specifically for CleanWater
def augmentCleanWater(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['CleanWater'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'CleanWater': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

CleanWater_df = all_data_original.drop(['NoPoverty','ZeroHunger','GoodHealth','QualityEducation','GenderEquality','AffordableCleanEnergy','DecentWork','IndustryInnovation','ReduceInequality','SustainableCities','ResponsibleConsumptionProduction','ClimateAction','LifeBelowWater','LifeonLand','PeaceJustice','Partnerships'], axis=1)
CleanWater_train, CleanWater_test = train_test_split(CleanWater_df, 
                                                   random_state=15, 
                                                   test_size=0.20,
                                                   stratify=CleanWater_df['CleanWater'])
CleanWater_train_aug = augmentCleanWater(CleanWater_train, augmenter, samples=1500)
CleanWater_train_aug.head()

100%|██████████| 1500/1500 [02:14<00:00, 11.13it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,CleanWater,transcripts_simplified
1947,1,[resource work work work physical chemical phy...
2922,1,[voice building education education energy con...
1677,1,[resource work work work physical chemical phy...
2629,1,[covid justice city preservation urban energy ...
477,0,work leadership welfare work partnership welfa...


In [None]:
CleanWater_train_aug.to_csv('CleanWater_train_aug_simplified.csv')
CleanWater_train_aug.to_pickle('CleanWater_train_aug_simplified.pkl')
CleanWater_test.to_csv('CleanWater_test_simplified.csv')
CleanWater_test.to_pickle('CleanWater_test_simplified.pkl')
print(CleanWater_test['CleanWater'].value_counts())
print(CleanWater_train_aug['CleanWater'].value_counts())

0    396
1      3
Name: CleanWater, dtype: int64
0    1584
1    1510
Name: CleanWater, dtype: int64


In [None]:
# create a custom function to augment the data specifically for AffordableCleanEnergy
def augmentAffordableCleanEnergy(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['AffordableCleanEnergy'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'AffordableCleanEnergy': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

AffordableCleanEnergy_df = all_data_original.drop(['NoPoverty','ZeroHunger','GoodHealth','QualityEducation','GenderEquality','CleanWater','DecentWork','IndustryInnovation','ReduceInequality','SustainableCities','ResponsibleConsumptionProduction','ClimateAction','LifeBelowWater','LifeonLand','PeaceJustice','Partnerships'], axis=1)
AffordableCleanEnergy_train, AffordableCleanEnergy_test = train_test_split(AffordableCleanEnergy_df, 
                                                   random_state=17, 
                                                   test_size=0.20,
                                                   stratify=AffordableCleanEnergy_df['AffordableCleanEnergy'])
AffordableCleanEnergy_train_aug = augmentAffordableCleanEnergy(AffordableCleanEnergy_train, augmenter, samples=1500)
AffordableCleanEnergy_train_aug.head()

100%|██████████| 1500/1500 [02:38<00:00,  9.48it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,AffordableCleanEnergy,transcripts_simplified
2461,1,[covid justice city preservation urban energy ...
2648,1,[building building building labor ecology ener...
351,0,coronavirus covid resource resource coronaviru...
3019,1,[covid justice city preservation urban energy ...
1860,1,[law law law law education skill education law...


In [None]:
AffordableCleanEnergy_train_aug.to_csv('AffordableCleanEnergy_train_aug_simplified.csv')
AffordableCleanEnergy_train_aug.to_pickle('AffordableCleanEnergy_train_aug_simplified.pkl')
AffordableCleanEnergy_test.to_csv('AffordableCleanEnergy_test_simplified.csv')
AffordableCleanEnergy_test.to_pickle('AffordableCleanEnergy_test_simplified.pkl')
print(AffordableCleanEnergy_test['AffordableCleanEnergy'].value_counts())
print(AffordableCleanEnergy_train_aug['AffordableCleanEnergy'].value_counts())

0    392
1      7
Name: AffordableCleanEnergy, dtype: int64
0    1568
1    1526
Name: AffordableCleanEnergy, dtype: int64


In [None]:
# create a custom function to augment the data specifically for DecentWork
def augmentDecentWork(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['DecentWork'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'DecentWork': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

DecentWork_df = all_data_original.drop(['NoPoverty','ZeroHunger','GoodHealth','QualityEducation','GenderEquality','CleanWater','AffordableCleanEnergy','IndustryInnovation','ReduceInequality','SustainableCities','ResponsibleConsumptionProduction','ClimateAction','LifeBelowWater','LifeonLand','PeaceJustice','Partnerships'], axis=1)
DecentWork_train, DecentWork_test = train_test_split(DecentWork_df, 
                                                   random_state=18, 
                                                   test_size=0.20,
                                                   stratify=DecentWork_df['DecentWork'])
DecentWork_train_aug = augmentDecentWork(DecentWork_train, augmenter, samples=1500)
DecentWork_train_aug.head()

100%|██████████| 1500/1500 [03:21<00:00,  7.44it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,DecentWork,transcripts_simplified
379,0,coronavirus covid resource resource coronaviru...
1950,1,[secondary job job resource job justice health...
3052,1,[education education risk educational educatio...
2307,1,[youth entrepreneurship entrepreneurship job p...
848,0,medicine medicine medicine medicine medicine m...


In [None]:
DecentWork_train_aug.to_csv('DecentWork_train_aug_simplified.csv')
DecentWork_train_aug.to_pickle('DecentWork_train_aug_simplified.pkl')
DecentWork_test.to_csv('DecentWork_test_simplified.csv')
DecentWork_test.to_pickle('DecentWork_test_simplified.pkl')
print(DecentWork_test['DecentWork'].value_counts())
print(DecentWork_train_aug['DecentWork'].value_counts())

0    374
1     25
Name: DecentWork, dtype: int64
1    1598
0    1496
Name: DecentWork, dtype: int64


In [None]:
# create a custom function to augment the data specifically for IndustryInnovation
def augmentIndustryInnovation(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['IndustryInnovation'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'IndustryInnovation': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

IndustryInnovation_df = all_data_original.drop(['NoPoverty','ZeroHunger','GoodHealth','QualityEducation','GenderEquality','CleanWater','AffordableCleanEnergy','DecentWork','ReduceInequality','SustainableCities','ResponsibleConsumptionProduction','ClimateAction','LifeBelowWater','LifeonLand','PeaceJustice','Partnerships'], axis=1)
IndustryInnovation_train, IndustryInnovation_test = train_test_split(IndustryInnovation_df, 
                                                   random_state=19, 
                                                   test_size=0.20,
                                                   stratify=IndustryInnovation_df['IndustryInnovation'])
IndustryInnovation_train_aug = augmentIndustryInnovation(IndustryInnovation_train, augmenter, samples=1500)
IndustryInnovation_train_aug.head()

100%|██████████| 1500/1500 [04:10<00:00,  6.00it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,IndustryInnovation,transcripts_simplified
644,0,coronavirus covid resource resource coronaviru...
2545,1,[secondary network internet access network acc...
1669,1,[work work building work hiv violence hiv crim...
775,0,resource farm covid covid medicine medicine me...
2138,1,[education work leader entrepreneurship enterp...


In [None]:
IndustryInnovation_train_aug.to_csv('IndustryInnovation_train_aug_simplified.csv')
IndustryInnovation_train_aug.to_pickle('IndustryInnovation_train_aug_simplified.pkl')
IndustryInnovation_test.to_csv('IndustryInnovation_test_simplified.csv')
IndustryInnovation_test.to_pickle('IndustryInnovation_test_simplified.pkl')
print(IndustryInnovation_test['IndustryInnovation'].value_counts())
print(IndustryInnovation_train_aug['IndustryInnovation'].value_counts())

0    355
1     44
Name: IndustryInnovation, dtype: int64
1    1676
0    1418
Name: IndustryInnovation, dtype: int64


In [None]:
# create a custom function to augment the data specifically for ReduceInequality
def augmentReduceInequality(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['ReduceInequality'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'ReduceInequality': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

ReduceInequality_df = all_data_original.drop(['NoPoverty','ZeroHunger','GoodHealth','QualityEducation','GenderEquality','CleanWater','AffordableCleanEnergy','DecentWork','IndustryInnovation','SustainableCities','ResponsibleConsumptionProduction','ClimateAction','LifeBelowWater','LifeonLand','PeaceJustice','Partnerships'], axis=1)
ReduceInequality_train, ReduceInequality_test = train_test_split(ReduceInequality_df, 
                                                   random_state=20, 
                                                   test_size=0.20,
                                                   stratify=ReduceInequality_df['ReduceInequality'])
ReduceInequality_train_aug = augmentReduceInequality(ReduceInequality_train, augmenter, samples=1500)
ReduceInequality_train_aug.head()

100%|██████████| 1500/1500 [03:57<00:00,  6.30it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,ReduceInequality,transcripts_simplified
1951,1,[covid justice city preservation urban energy ...
637,0,coronavirus covid resource resource coronaviru...
2341,1,[entrepreneurship network labor work work ineq...
2350,1,[crisis asset abuse asset abuse asset losses a...
214,0,chemical chemical education young chemical nat...


In [None]:
ReduceInequality_train_aug.to_csv('ReduceInequality_train_aug_simplified.csv')
ReduceInequality_train_aug.to_pickle('ReduceInequality_train_aug_simplified.pkl')
ReduceInequality_test.to_csv('ReduceInequality_test_simplified.csv')
ReduceInequality_test.to_pickle('ReduceInequality_test_simplified.pkl')
print(ReduceInequality_test['ReduceInequality'].value_counts())
print(ReduceInequality_train_aug['ReduceInequality'].value_counts())

0    353
1     46
Name: ReduceInequality, dtype: int64
1    1682
0    1412
Name: ReduceInequality, dtype: int64


In [None]:
# create a custom function to augment the data specifically for SustainableCities
def augmentSustainableCities(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['SustainableCities'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'SustainableCities': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

SustainableCities_df = all_data_original.drop(['NoPoverty','ZeroHunger','GoodHealth','QualityEducation','GenderEquality','CleanWater','AffordableCleanEnergy','DecentWork','IndustryInnovation','ReduceInequality','ResponsibleConsumptionProduction','ClimateAction','LifeBelowWater','LifeonLand','PeaceJustice','Partnerships'], axis=1)
SustainableCities_train, SustainableCities_test = train_test_split(SustainableCities_df, 
                                                   random_state=21, 
                                                   test_size=0.20,
                                                   stratify=SustainableCities_df['SustainableCities'])
SustainableCities_train_aug = augmentSustainableCities(SustainableCities_train, augmenter, samples=1500)
SustainableCities_train_aug.head()

100%|██████████| 1500/1500 [03:34<00:00,  6.99it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,SustainableCities,transcripts_simplified
2612,1,[covid justice city preservation urban energy ...
1402,0,physical education energy resource physical ed...
2935,1,[education financial financial resource covid ...
2298,1,[education financial financial resource covid ...
2597,1,[education city urban credit network commodity...


In [None]:
SustainableCities_train_aug.to_csv('SustainableCities_train_aug_simplified.csv')
SustainableCities_train_aug.to_pickle('SustainableCities_train_aug_simplified.pkl')
SustainableCities_test.to_csv('SustainableCities_test_simplified.csv')
SustainableCities_test.to_pickle('SustainableCities_test_simplified.pkl')
print(SustainableCities_test['SustainableCities'].value_counts())
print(SustainableCities_train_aug['SustainableCities'].value_counts())

0    387
1     12
Name: SustainableCities, dtype: int64
1    1549
0    1545
Name: SustainableCities, dtype: int64


In [None]:
# create a custom function to augment the data specifically for ResponsibleConsumptionProduction
def augmentResponsibleConsumptionProduction(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['ResponsibleConsumptionProduction'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'ResponsibleConsumptionProduction': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

ResponsibleConsumptionProduction_df = all_data_original.drop(['NoPoverty','ZeroHunger','GoodHealth','QualityEducation','GenderEquality','CleanWater','AffordableCleanEnergy','DecentWork','IndustryInnovation','ReduceInequality','SustainableCities','ClimateAction','LifeBelowWater','LifeonLand','PeaceJustice','Partnerships'], axis=1)
ResponsibleConsumptionProduction_train, ResponsibleConsumptionProduction_test = train_test_split(ResponsibleConsumptionProduction_df, 
                                                   random_state=22, 
                                                   test_size=0.20,
                                                   stratify=ResponsibleConsumptionProduction_df['ResponsibleConsumptionProduction'])
ResponsibleConsumptionProduction_train_aug = augmentResponsibleConsumptionProduction(ResponsibleConsumptionProduction_train, augmenter, samples=1500)
ResponsibleConsumptionProduction_train_aug.head()

100%|██████████| 1500/1500 [04:03<00:00,  6.16it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,ResponsibleConsumptionProduction,transcripts_simplified
1940,1,[abuse abuse abuse abusive abuse use abuse]
726,0,inequality work culture culture culture work i...
711,0,resource education resource resource resource ...
3079,1,[youth decision decision decision financial ed...
1233,0,coronavirus covid resource resource coronaviru...


In [None]:
ResponsibleConsumptionProduction_train_aug.to_csv('ResponsibleConsumptionProduction_train_aug_simplified.csv')
ResponsibleConsumptionProduction_train_aug.to_pickle('ResponsibleConsumptionProduction_train_aug_simplified.pkl')
ResponsibleConsumptionProduction_test.to_csv('ResponsibleConsumptionProduction_test_simplified.csv')
ResponsibleConsumptionProduction_test.to_pickle('ResponsibleConsumptionProduction_test_simplified.pkl')
print(ResponsibleConsumptionProduction_test['ResponsibleConsumptionProduction'].value_counts())
print(ResponsibleConsumptionProduction_train_aug['ResponsibleConsumptionProduction'].value_counts())

0    391
1      8
Name: ResponsibleConsumptionProduction, dtype: int64
0    1562
1    1532
Name: ResponsibleConsumptionProduction, dtype: int64


In [None]:
# create a custom function to augment the data specifically for ClimateAction
def augmentClimateAction(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['ClimateAction'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'ClimateAction': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

ClimateAction_df = all_data_original.drop(['NoPoverty','ZeroHunger','GoodHealth','QualityEducation','GenderEquality','CleanWater','AffordableCleanEnergy','DecentWork','IndustryInnovation','ReduceInequality','SustainableCities','ResponsibleConsumptionProduction','LifeBelowWater','LifeonLand','PeaceJustice','Partnerships'], axis=1)
ClimateAction_train, ClimateAction_test = train_test_split(ClimateAction_df, 
                                                   random_state=23, 
                                                   test_size=0.20,
                                                   stratify=ClimateAction_df['ClimateAction'])
ClimateAction_train_aug = augmentClimateAction(ClimateAction_train, augmenter, samples=1500)
ClimateAction_train_aug.head()

100%|██████████| 1500/1500 [03:06<00:00,  8.05it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,ClimateAction,transcripts_simplified
610,0,resource natural natural civil fundamental lev...
298,0,medicine education medical medical partnership...
1766,1,[digital secondary hybrid green building solut...
587,0,education financial financial resource covid l...
170,0,medicine education medical medical medicine bu...


In [None]:
ClimateAction_train_aug.to_csv('ClimateAction_train_aug_simplified.csv')
ClimateAction_train_aug.to_pickle('ClimateAction_train_aug_simplified.pkl')
ClimateAction_test.to_csv('ClimateAction_test_simplified.csv')
ClimateAction_test.to_pickle('ClimateAction_test_simplified.pkl')
print(ClimateAction_test['ClimateAction'].value_counts())
print(ClimateAction_train_aug['ClimateAction'].value_counts())

0    392
1      7
Name: ClimateAction, dtype: int64
0    1566
1    1528
Name: ClimateAction, dtype: int64


In [None]:
# create a custom function to augment the data specifically for LifeBelowWater
def augmentLifeBelowWater(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['LifeBelowWater'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'LifeBelowWater': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

LifeBelowWater_df = all_data_original.drop(['NoPoverty','ZeroHunger','GoodHealth','QualityEducation','GenderEquality','CleanWater','AffordableCleanEnergy','DecentWork','IndustryInnovation','ReduceInequality','SustainableCities','ResponsibleConsumptionProduction','ClimateAction','LifeonLand','PeaceJustice','Partnerships'], axis=1)
LifeBelowWater_train, LifeBelowWater_test = train_test_split(LifeBelowWater_df, 
                                                   random_state=24, 
                                                   test_size=0.20,
                                                   stratify=LifeBelowWater_df['LifeBelowWater'])
LifeBelowWater_train_aug = augmentLifeBelowWater(LifeBelowWater_train, augmenter, samples=1500)
LifeBelowWater_train_aug.head()

100%|██████████| 1500/1500 [04:35<00:00,  5.45it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,LifeBelowWater,transcripts_simplified
686,0,coronavirus covid resource resource coronaviru...
2746,1,[ecology biodiversity plant genetics education...
215,0,medicine education medical medical medicine he...
2520,1,[covid justice city preservation urban energy ...
1296,0,coronavirus covid resource resource coronaviru...


In [None]:
LifeBelowWater_train_aug.to_csv('LifeBelowWater_train_aug_simplified.csv')
LifeBelowWater_train_aug.to_pickle('LifeBelowWater_train_aug_simplified.pkl')
LifeBelowWater_test.to_csv('LifeBelowWater_test_simplified.csv')
LifeBelowWater_test.to_pickle('LifeBelowWater_test_simplified.pkl')
print(LifeBelowWater_test['LifeBelowWater'].value_counts())
print(LifeBelowWater_train_aug['LifeBelowWater'].value_counts())

0    397
1      2
Name: LifeBelowWater, dtype: int64
0    1587
1    1507
Name: LifeBelowWater, dtype: int64


In [None]:
# create a custom function to augment the data specifically for LifeonLand
def augmentLifeonLand(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['LifeonLand'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'LifeonLand': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

LifeonLand_df = all_data_original.drop(['NoPoverty','ZeroHunger','GoodHealth','QualityEducation','GenderEquality','CleanWater','AffordableCleanEnergy','DecentWork','IndustryInnovation','ReduceInequality','SustainableCities','ResponsibleConsumptionProduction','ClimateAction','LifeBelowWater','PeaceJustice','Partnerships'], axis=1)
LifeonLand_train, LifeonLand_test = train_test_split(LifeonLand_df, 
                                                   random_state=25, 
                                                   test_size=0.20,
                                                   stratify=LifeonLand_df['LifeonLand'])
LifeonLand_train_aug = augmentLifeonLand(LifeonLand_train, augmenter, samples=1500)
LifeonLand_train_aug.head()

100%|██████████| 1500/1500 [04:22<00:00,  5.72it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,LifeonLand,transcripts_simplified
2523,1,[youth decision primary access level biomedica...
53,0,covid justice city preservation urban energy u...
2786,1,[resource farm covid covid wildlife wildlife m...
567,0,energy building education resource energy buil...
1397,0,law law law law education skill education law ...


In [None]:
LifeonLand_train_aug.to_csv('LifeonLand_train_aug_simplified.csv')
LifeonLand_train_aug.to_pickle('LifeonLand_train_aug_simplified.pkl')
LifeonLand_test.to_csv('LifeonLand_test_simplified.csv')
LifeonLand_test.to_pickle('LifeonLand_test_simplified.pkl')
print(LifeonLand_test['LifeonLand'].value_counts())
print(LifeonLand_train_aug['LifeonLand'].value_counts())

0    395
1      4
Name: LifeonLand, dtype: int64
0    1577
1    1517
Name: LifeonLand, dtype: int64


In [None]:
# create a custom function to augment the data specifically for PeaceJustice
def augmentPeaceJustice(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['PeaceJustice'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'PeaceJustice': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

PeaceJustice_df = all_data_original.drop(['NoPoverty','ZeroHunger','GoodHealth','QualityEducation','GenderEquality','CleanWater','AffordableCleanEnergy','DecentWork','IndustryInnovation','ReduceInequality','SustainableCities','ResponsibleConsumptionProduction','ClimateAction','LifeBelowWater','LifeonLand','Partnerships'], axis=1)
PeaceJustice_train, PeaceJustice_test = train_test_split(PeaceJustice_df, 
                                                   random_state=26, 
                                                   test_size=0.20,
                                                   stratify=PeaceJustice_df['PeaceJustice'])
PeaceJustice_train_aug = augmentPeaceJustice(PeaceJustice_train, augmenter, samples=1500)
PeaceJustice_train_aug.head()

100%|██████████| 1500/1500 [04:22<00:00,  5.72it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,PeaceJustice,transcripts_simplified
2420,1,[population socioeconomic society inequality h...
923,0,youth credit primary pay work work work work w...
207,1,network inequality network health inequality n...
609,0,work network work network work network work ne...
2215,1,[law law law law education skill education law...


In [None]:
PeaceJustice_train_aug.to_csv('PeaceJustice_train_aug_simplified.csv')
PeaceJustice_train_aug.to_pickle('PeaceJustice_train_aug_simplified.pkl')
PeaceJustice_test.to_csv('PeaceJustice_test_simplified.csv')
PeaceJustice_test.to_pickle('PeaceJustice_test_simplified.pkl')
print(PeaceJustice_test['PeaceJustice'].value_counts())
print(PeaceJustice_train_aug['PeaceJustice'].value_counts())

0    375
1     24
Name: PeaceJustice, dtype: int64
1    1595
0    1499
Name: PeaceJustice, dtype: int64


In [None]:
# create a custom function to augment the data specifically for Partnerships
def augmentPartnerships(df, augmenter, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    positive_df = df[df['Partnerships'] == 1].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(positive_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(positive_df['transcripts_simplified'].loc[i])
            augmented_texts.append(augmented_text)
    
    data = {
        'Partnerships': 1,
        'transcripts_simplified': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

Partnerships_df = all_data_original.drop(['NoPoverty','ZeroHunger','GoodHealth','QualityEducation','GenderEquality','CleanWater','AffordableCleanEnergy','DecentWork','IndustryInnovation','ReduceInequality','SustainableCities','ResponsibleConsumptionProduction','ClimateAction','LifeBelowWater','LifeonLand','PeaceJustice'], axis=1)
Partnerships_train, Partnerships_test = train_test_split(Partnerships_df, 
                                                   random_state=26, 
                                                   test_size=0.20,
                                                   stratify=Partnerships_df['Partnerships'])
Partnerships_train_aug = augmentPartnerships(Partnerships_train, augmenter, samples=1500)
Partnerships_train_aug.head()

100%|██████████| 1500/1500 [03:48<00:00,  6.56it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


Unnamed: 0,Partnerships,transcripts_simplified
2756,1,[financial leadership education inclusion reso...
1084,1,law law law law education skill education law ...
947,0,youth entrepreneurship entrepreneurship job pr...
744,0,resource security
2990,1,[youth micro job micro job resource primary as...


In [None]:
Partnerships_train_aug.to_csv('Partnerships_train_aug_simplified.csv')
Partnerships_train_aug.to_pickle('Partnerships_train_aug_simplified.pkl')
Partnerships_test.to_csv('Partnerships_test_simplified.csv')
Partnerships_test.to_pickle('Partnerships_test_simplified.pkl')
print(Partnerships_test['Partnerships'].value_counts())
print(Partnerships_train_aug['Partnerships'].value_counts())

0    383
1     16
Name: Partnerships, dtype: int64
1    1564
0    1530
Name: Partnerships, dtype: int64
