In [1]:
import pandas as pd
import xlsxwriter
import numpy as np

import nlpaug.augmenter.word.context_word_embs as aug

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


from tqdm.auto import tqdm



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#read the data into a pandas dataframe
df = pd.read_csv("bert_data_withoutClassWord.csv")
print(df.shape)
df.head(5)

(7115, 3)


Unnamed: 0.1,Unnamed: 0,text,class
0,0,this article is about the herbivorous mammals....,Antelope
1,1,"one new world species, the pronghorn of north ...",Antelope
2,2,"the english word ""animal"" first appeared in 14...",Antelope
3,3,"the word talopus and calopus, from latin, came...",Antelope
4,4,animal are not a cladistic or taxonomically de...,Antelope


In [3]:
#Add the new column which gives a unique number to each of these labels 

j = 0
for i in df['class'].unique():
    df.loc[df['class'] == i, ['class_num']] = j
    j += 1

#checking the results 
df.head(50000)

Unnamed: 0.1,Unnamed: 0,text,class,class_num
0,0,this article is about the herbivorous mammals....,Antelope,0.0
1,1,"one new world species, the pronghorn of north ...",Antelope,0.0
2,2,"the english word ""animal"" first appeared in 14...",Antelope,0.0
3,3,"the word talopus and calopus, from latin, came...",Antelope,0.0
4,4,animal are not a cladistic or taxonomically de...,Antelope,0.0
...,...,...,...,...
7110,7110,technology to use sponges as mouth protection ...,dolphin,49.0
7111,7111,"pesticides, heavy metals, plastics, and other ...",dolphin,49.0
7112,7112,"hundreds of orcas, animals and other members o...",dolphin,49.0
7113,7113,captured orcas and animals are confined to tan...,dolphin,49.0


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    df.text, 
    df.class_num, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df.class_num
)

In [5]:
embbeding_size = 8000
clustering_count = 3
numof_embb = 2
att_size = embbeding_size * clustering_count

In [6]:
vectorizer = TfidfVectorizer(max_features = embbeding_size).fit(X_train)
        
X_train_vectorized = vectorizer.transform(X_train)
    
clf = RandomForestClassifier()
clf.fit(X_train_vectorized, y_train)
    
predictions = clf.predict(vectorizer.transform(X_test))

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.29      0.39      0.34        38
         1.0       0.40      0.26      0.32        23
         2.0       0.70      0.57      0.63        28
         3.0       0.45      0.54      0.49        37
         4.0       0.11      0.04      0.06        23
         5.0       0.27      0.46      0.34        37
         6.0       0.39      0.61      0.47        44
         7.0       0.36      0.61      0.45        31
         8.0       0.45      0.43      0.44        23
         9.0       0.35      0.28      0.31        25
        10.0       0.45      0.24      0.31        21
        11.0       0.33      0.20      0.25        20
        12.0       0.39      0.36      0.38        36
        13.0       0.68      0.66      0.67        29
        14.0       0.56      0.34      0.43        29
        15.0       0.52      0.56      0.54        27
        16.0       0.33      0.08      0.13        12
        17.0       0.31    

In [7]:
augmenter = aug.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")

In [8]:
df['class_num'].value_counts()


23.0    224
6.0     219
29.0    204
31.0    196
18.0    193
0.0     190
28.0    185
42.0    184
5.0     184
3.0     182
12.0    180
44.0    176
38.0    165
49.0    165
39.0    164
19.0    162
33.0    156
22.0    155
7.0     154
37.0    154
14.0    145
13.0    143
40.0    143
30.0    142
48.0    141
2.0     140
47.0    139
15.0    136
24.0    131
36.0    130
32.0    128
45.0    127
46.0    124
9.0     123
35.0    122
27.0    118
8.0     117
1.0     117
4.0     116
43.0    112
25.0    110
26.0    107
17.0    107
21.0    107
10.0    106
41.0    101
11.0     98
34.0     73
20.0     61
16.0     59
Name: class_num, dtype: int64

In [9]:
def augmentMyData(df, augmenter, class_number, repetitions=1, samples=200):
    augmented_texts = []
    # select only the minority class samples
    spam_df = df[df['class_num'] == class_number].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(spam_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            augmented_text = augmenter.augment(spam_df['text'].iloc[i])
            #print(augmented_text)
            #print(str(augmented_text)[2:-2])
            augmented_texts.append(str(augmented_text)[2:-2])
    
    data = {
        'class_num': class_number,
        'text': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

In [10]:
new_df = df[['text', 'class_num']]
new_df.head(5)

Unnamed: 0,text,class_num
0,this article is about the herbivorous mammals....,0.0
1,"one new world species, the pronghorn of north ...",0.0
2,"the english word ""animal"" first appeared in 14...",0.0
3,"the word talopus and calopus, from latin, came...",0.0
4,animal are not a cladistic or taxonomically de...,0.0


In [11]:
class_count = new_df['class_num'].unique().shape[0]
class_with_maxCount = new_df['class_num'].value_counts().max()

for i in range(class_count):
    
    samples = class_with_maxCount - new_df['class_num'].value_counts()[i]
    
    print(f'augmenting {samples} for class of {i}')
    new_df = augmentMyData(new_df, augmenter, class_number=i, samples=samples)

augmenting 34 for class of 0


100%|██████████████████████████████████████████████████████████████████████████████████| 34/34 [00:27<00:00,  1.23it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 107 for class of 1


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [01:28<00:00,  1.21it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 84 for class of 2


100%|██████████████████████████████████████████████████████████████████████████████████| 84/84 [01:09<00:00,  1.21it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 42 for class of 3


100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [00:34<00:00,  1.22it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 108 for class of 4


100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [01:29<00:00,  1.20it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 40 for class of 5


100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [00:35<00:00,  1.13it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 5 for class of 6


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.20it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 70 for class of 7


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [01:06<00:00,  1.05it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 107 for class of 8


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [01:37<00:00,  1.09it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 101 for class of 9


100%|████████████████████████████████████████████████████████████████████████████████| 101/101 [01:25<00:00,  1.18it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 118 for class of 10


100%|████████████████████████████████████████████████████████████████████████████████| 118/118 [01:43<00:00,  1.14it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 126 for class of 11


100%|████████████████████████████████████████████████████████████████████████████████| 126/126 [01:46<00:00,  1.18it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 44 for class of 12


100%|██████████████████████████████████████████████████████████████████████████████████| 44/44 [00:38<00:00,  1.15it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 81 for class of 13


100%|██████████████████████████████████████████████████████████████████████████████████| 81/81 [01:09<00:00,  1.16it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 79 for class of 14


100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [01:04<00:00,  1.22it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 88 for class of 15


100%|██████████████████████████████████████████████████████████████████████████████████| 88/88 [01:14<00:00,  1.18it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 165 for class of 16


100%|████████████████████████████████████████████████████████████████████████████████| 165/165 [02:19<00:00,  1.19it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 117 for class of 17


100%|████████████████████████████████████████████████████████████████████████████████| 117/117 [01:36<00:00,  1.21it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 31 for class of 18


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [00:26<00:00,  1.17it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 62 for class of 19


100%|██████████████████████████████████████████████████████████████████████████████████| 62/62 [00:52<00:00,  1.19it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 163 for class of 20


100%|████████████████████████████████████████████████████████████████████████████████| 163/163 [02:17<00:00,  1.18it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 117 for class of 21


100%|████████████████████████████████████████████████████████████████████████████████| 117/117 [01:37<00:00,  1.20it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 69 for class of 22


100%|██████████████████████████████████████████████████████████████████████████████████| 69/69 [01:01<00:00,  1.12it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 0 for class of 23


0it [00:00, ?it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 93 for class of 24


100%|██████████████████████████████████████████████████████████████████████████████████| 93/93 [01:25<00:00,  1.09it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 114 for class of 25


100%|████████████████████████████████████████████████████████████████████████████████| 114/114 [01:45<00:00,  1.08it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 117 for class of 26


100%|████████████████████████████████████████████████████████████████████████████████| 117/117 [01:50<00:00,  1.06it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 106 for class of 27


100%|████████████████████████████████████████████████████████████████████████████████| 106/106 [01:36<00:00,  1.09it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 39 for class of 28


100%|██████████████████████████████████████████████████████████████████████████████████| 39/39 [00:38<00:00,  1.01it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 20 for class of 29


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.02s/it]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 82 for class of 30


100%|██████████████████████████████████████████████████████████████████████████████████| 82/82 [01:12<00:00,  1.12it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 28 for class of 31


100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:26<00:00,  1.07it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 96 for class of 32


100%|██████████████████████████████████████████████████████████████████████████████████| 96/96 [01:20<00:00,  1.20it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 68 for class of 33


100%|██████████████████████████████████████████████████████████████████████████████████| 68/68 [00:57<00:00,  1.19it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 151 for class of 34


100%|████████████████████████████████████████████████████████████████████████████████| 151/151 [02:05<00:00,  1.20it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 102 for class of 35


100%|████████████████████████████████████████████████████████████████████████████████| 102/102 [01:22<00:00,  1.24it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 94 for class of 36


100%|██████████████████████████████████████████████████████████████████████████████████| 94/94 [01:16<00:00,  1.23it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 70 for class of 37


100%|██████████████████████████████████████████████████████████████████████████████████| 70/70 [00:59<00:00,  1.18it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 59 for class of 38


100%|██████████████████████████████████████████████████████████████████████████████████| 59/59 [00:52<00:00,  1.12it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 60 for class of 39


100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:51<00:00,  1.18it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 81 for class of 40


100%|██████████████████████████████████████████████████████████████████████████████████| 81/81 [01:16<00:00,  1.06it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 123 for class of 41


100%|████████████████████████████████████████████████████████████████████████████████| 123/123 [01:47<00:00,  1.14it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 40 for class of 42


100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [00:36<00:00,  1.09it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 112 for class of 43


100%|████████████████████████████████████████████████████████████████████████████████| 112/112 [01:49<00:00,  1.02it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 48 for class of 44


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:52<00:00,  1.10s/it]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 97 for class of 45


100%|██████████████████████████████████████████████████████████████████████████████████| 97/97 [01:43<00:00,  1.07s/it]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 100 for class of 46


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:31<00:00,  1.09it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 85 for class of 47


100%|██████████████████████████████████████████████████████████████████████████████████| 85/85 [01:16<00:00,  1.11it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 83 for class of 48


100%|██████████████████████████████████████████████████████████████████████████████████| 83/83 [01:16<00:00,  1.08it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


augmenting 59 for class of 49


100%|██████████████████████████████████████████████████████████████████████████████████| 59/59 [00:52<00:00,  1.12it/s]
  df = shuffle(df.append(aug_df).reset_index(drop=True))


In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    new_df.text, 
    new_df.class_num, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=new_df.class_num
)


embbeding_size = 8000
clustering_count = 3
numof_embb = 2
att_size = embbeding_size * clustering_count



vectorizer = TfidfVectorizer(max_features = embbeding_size)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_train_vectorized = X_train_vectorized.toarray()
            
clf = RandomForestClassifier()
clf.fit(X_train_vectorized, y_train)
    
predictions = clf.predict(vectorizer.transform(X_test))

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.64      0.62      0.63        45
         1.0       0.75      0.84      0.79        45
         2.0       0.88      0.82      0.85        45
         3.0       0.55      0.66      0.60        44
         4.0       0.97      0.62      0.76        45
         5.0       0.55      0.73      0.63        45
         6.0       0.48      0.48      0.48        44
         7.0       0.56      0.86      0.68        44
         8.0       0.78      0.84      0.81        45
         9.0       0.76      0.77      0.76        44
        10.0       0.95      0.84      0.89        45
        11.0       0.89      0.91      0.90        45
        12.0       0.79      0.51      0.62        45
        13.0       0.97      0.82      0.89        45
        14.0       0.80      0.62      0.70        45
        15.0       0.69      0.82      0.75        45
        16.0       0.80      0.98      0.88        45
        17.0       0.85    

In [19]:
type(y_train)
y_train
t

Unnamed: 0,text,class_num
1888,animales tend typically to have an average lit...,21.0
2347,tests recorded using immature mice ( notably t...,43.0
3853,animal were one of the first and most successf...,22.0
10084,suppose a typical animal has a large head shap...,41.0
7176,according mostly to those journalists who also...,45.0
...,...,...
215,animals communicate through more than just cli...,49.0
1140,there are many names for gray wolves besides t...,31.0
9233,animals are in a group of breeds classed as ‘c...,7.0
1601,these pups are born hairless and blind in a ne...,11.0


In [None]:
# split the dataset before augmenting to avoid augmented data in valid set
X_train, _, y_train, _ = train_test_split(aug_df['text'], aug_df['class_num'].values, test_size=0.2)
_, X_test, _, y_test = train_test_split(df['text'], df['class_num'].values, test_size=0.5)
