# Data Augmentation
Experimenting different data augmentation techniques
* Synonym
* Embedding
* (Stacked) Synonym + Embedding
* (Stacked) Embedding + Synonym

## Install and import required libraries

In [None]:
!pip install nlpaug
!pip install textattack

In [None]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action
from textattack.augmentation import EmbeddingAugmenter

In [3]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


## Original

In [199]:
df = pd.read_csv('/content/drive/MyDrive/IR project/classifier/XGBoost/label_dataset_final.csv')

df.head()

Unnamed: 0,Datetime,Quarter,Likes,NFT,Text,clean_text,manual_label
0,2022-06-27,22,2,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#27894...,mutant ape yacht club mayc nft sold eth k,pos
1,2023-02-11,31,2,Mutant Ape Yacht Club,🐳1 Mutant Ape Yacht Club bought for Ξ15.377\n\...,mutant ape yacht club bought floor h chg floor...,pos
2,2022-12-30,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#77 so...,mutant ape yacht club mayc nft sold eth k,pos
3,2022-12-29,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#23452...,mutant ape yacht club mayc nft sold eth k,pos
4,2022-12-29,24,1,Mutant Ape Yacht Club,Mutant Ape Yacht Club #27908 sold for 17 ETH (...,mutant ape yacht club sold eth nft collection ...,pos


In [200]:
df['manual_label'] = df['manual_label'].replace('neg',0)

In [201]:
df['manual_label'] = df['manual_label'].replace('pos',1)

In [202]:
df['manual_label'] = df['manual_label'].replace('neu',2)

In [203]:
df.head()

Unnamed: 0,Datetime,Quarter,Likes,NFT,Text,clean_text,manual_label
0,2022-06-27,22,2,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#27894...,mutant ape yacht club mayc nft sold eth k,1
1,2023-02-11,31,2,Mutant Ape Yacht Club,🐳1 Mutant Ape Yacht Club bought for Ξ15.377\n\...,mutant ape yacht club bought floor h chg floor...,1
2,2022-12-30,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#77 so...,mutant ape yacht club mayc nft sold eth k,1
3,2022-12-29,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#23452...,mutant ape yacht club mayc nft sold eth k,1
4,2022-12-29,24,1,Mutant Ape Yacht Club,Mutant Ape Yacht Club #27908 sold for 17 ETH (...,mutant ape yacht club sold eth nft collection ...,1


In [204]:
df['manual_label'].value_counts()

1    1374
2     517
0     109
Name: manual_label, dtype: int64

In [205]:
df = df[['clean_text','manual_label']]

df.head()

Unnamed: 0,clean_text,manual_label
0,mutant ape yacht club mayc nft sold eth k,1
1,mutant ape yacht club bought floor h chg floor...,1
2,mutant ape yacht club mayc nft sold eth k,1
3,mutant ape yacht club mayc nft sold eth k,1
4,mutant ape yacht club sold eth nft collection ...,1


In [206]:
df.isnull().values.any()

False

In [207]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['manual_label'], test_size=0.2,random_state=42)

In [208]:
y_train.value_counts()

1    1097
2     415
0      88
Name: manual_label, dtype: int64

In [209]:
vectorizer = TfidfVectorizer(min_df = 1,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

In [210]:
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized=vectorizer.transform(X_test)

In [211]:
XGB_classifier = XGBClassifier()

t0 = time.time()
XGB_classifier.fit(X_train_vectorized,y_train)
t1 = time.time()

y_pred=XGB_classifier.predict(X_test_vectorized)

time_linear_train = t1-t0

In [212]:
print("Training time: %fs" % (time_linear_train))

Training time: 1.675013s


In [213]:
print(accuracy_score(y_test, y_pred))

0.8325


In [214]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.64      0.43      0.51        21
           1       0.91      0.89      0.90       277
           2       0.68      0.76      0.72       102

    accuracy                           0.83       400
   macro avg       0.74      0.69      0.71       400
weighted avg       0.84      0.83      0.83       400



## Synonym

In [215]:
df = pd.read_csv('/content/drive/MyDrive/IR project/classifier/XGBoost/label_dataset_final.csv')

df.head()

Unnamed: 0,Datetime,Quarter,Likes,NFT,Text,clean_text,manual_label
0,2022-06-27,22,2,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#27894...,mutant ape yacht club mayc nft sold eth k,pos
1,2023-02-11,31,2,Mutant Ape Yacht Club,🐳1 Mutant Ape Yacht Club bought for Ξ15.377\n\...,mutant ape yacht club bought floor h chg floor...,pos
2,2022-12-30,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#77 so...,mutant ape yacht club mayc nft sold eth k,pos
3,2022-12-29,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#23452...,mutant ape yacht club mayc nft sold eth k,pos
4,2022-12-29,24,1,Mutant Ape Yacht Club,Mutant Ape Yacht Club #27908 sold for 17 ETH (...,mutant ape yacht club sold eth nft collection ...,pos


In [216]:
df['manual_label'] = df['manual_label'].replace('neg',0)

In [217]:
df['manual_label'] = df['manual_label'].replace('pos',1)

In [218]:
df['manual_label'] = df['manual_label'].replace('neu',2)

In [219]:
df.head()

Unnamed: 0,Datetime,Quarter,Likes,NFT,Text,clean_text,manual_label
0,2022-06-27,22,2,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#27894...,mutant ape yacht club mayc nft sold eth k,1
1,2023-02-11,31,2,Mutant Ape Yacht Club,🐳1 Mutant Ape Yacht Club bought for Ξ15.377\n\...,mutant ape yacht club bought floor h chg floor...,1
2,2022-12-30,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#77 so...,mutant ape yacht club mayc nft sold eth k,1
3,2022-12-29,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#23452...,mutant ape yacht club mayc nft sold eth k,1
4,2022-12-29,24,1,Mutant Ape Yacht Club,Mutant Ape Yacht Club #27908 sold for 17 ETH (...,mutant ape yacht club sold eth nft collection ...,1


In [220]:
df['manual_label'].value_counts()

1    1374
2     517
0     109
Name: manual_label, dtype: int64

In [221]:
df = df[['clean_text','manual_label']]

df.head()

Unnamed: 0,clean_text,manual_label
0,mutant ape yacht club mayc nft sold eth k,1
1,mutant ape yacht club bought floor h chg floor...,1
2,mutant ape yacht club mayc nft sold eth k,1
3,mutant ape yacht club mayc nft sold eth k,1
4,mutant ape yacht club sold eth nft collection ...,1


In [222]:
df.isnull().values.any()

False

In [223]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['manual_label'], test_size=0.2,random_state=42)

In [224]:
y_train.value_counts()

1    1097
2     415
0      88
Name: manual_label, dtype: int64

In [225]:
vectorizer = TfidfVectorizer(min_df = 1,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

In [226]:
aug = naw.SynonymAug(aug_src='wordnet',aug_max=3)

In [227]:
augmented_sentences=[]
augmented_sentences_labels=[]
for i in X_train.index:
  if y_train[i]==0:
    temps1=aug.augment(X_train[i],n=3)
    for sent in temps1:
      augmented_sentences.append(sent)
      augmented_sentences_labels.append(0)

In [228]:
X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)

print(X_train.shape)
print(y_train.shape)

(1864,)
(1864,)


  X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
  y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)


In [229]:
y_train.value_counts()

1    1097
2     415
0     352
dtype: int64

In [230]:
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized=vectorizer.transform(X_test)

In [231]:
XGB_classifier = XGBClassifier()

t0 = time.time()
XGB_classifier.fit(X_train_vectorized,y_train)
t1 = time.time()

y_pred=XGB_classifier.predict(X_test_vectorized)

time_linear_train = t1-t0

In [232]:
print("Training time: %fs" % (time_linear_train))

Training time: 2.069492s


In [233]:
print(accuracy_score(y_test, y_pred))

0.8125


In [234]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.43      0.47        21
           1       0.89      0.88      0.89       277
           2       0.66      0.71      0.68       102

    accuracy                           0.81       400
   macro avg       0.69      0.67      0.68       400
weighted avg       0.81      0.81      0.81       400



## Embedding

In [235]:
df = pd.read_csv('/content/drive/MyDrive/IR project/classifier/XGBoost/label_dataset_final.csv')

df.head()

Unnamed: 0,Datetime,Quarter,Likes,NFT,Text,clean_text,manual_label
0,2022-06-27,22,2,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#27894...,mutant ape yacht club mayc nft sold eth k,pos
1,2023-02-11,31,2,Mutant Ape Yacht Club,🐳1 Mutant Ape Yacht Club bought for Ξ15.377\n\...,mutant ape yacht club bought floor h chg floor...,pos
2,2022-12-30,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#77 so...,mutant ape yacht club mayc nft sold eth k,pos
3,2022-12-29,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#23452...,mutant ape yacht club mayc nft sold eth k,pos
4,2022-12-29,24,1,Mutant Ape Yacht Club,Mutant Ape Yacht Club #27908 sold for 17 ETH (...,mutant ape yacht club sold eth nft collection ...,pos


In [236]:
df['manual_label'] = df['manual_label'].replace('neg',0)

In [237]:
df['manual_label'] = df['manual_label'].replace('pos',1)

In [238]:
df['manual_label'] = df['manual_label'].replace('neu',2)

In [239]:
df.head()

Unnamed: 0,Datetime,Quarter,Likes,NFT,Text,clean_text,manual_label
0,2022-06-27,22,2,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#27894...,mutant ape yacht club mayc nft sold eth k,1
1,2023-02-11,31,2,Mutant Ape Yacht Club,🐳1 Mutant Ape Yacht Club bought for Ξ15.377\n\...,mutant ape yacht club bought floor h chg floor...,1
2,2022-12-30,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#77 so...,mutant ape yacht club mayc nft sold eth k,1
3,2022-12-29,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#23452...,mutant ape yacht club mayc nft sold eth k,1
4,2022-12-29,24,1,Mutant Ape Yacht Club,Mutant Ape Yacht Club #27908 sold for 17 ETH (...,mutant ape yacht club sold eth nft collection ...,1


In [240]:
df['manual_label'].value_counts()

1    1374
2     517
0     109
Name: manual_label, dtype: int64

In [241]:
df = df[['clean_text','manual_label']]

df.head()

Unnamed: 0,clean_text,manual_label
0,mutant ape yacht club mayc nft sold eth k,1
1,mutant ape yacht club bought floor h chg floor...,1
2,mutant ape yacht club mayc nft sold eth k,1
3,mutant ape yacht club mayc nft sold eth k,1
4,mutant ape yacht club sold eth nft collection ...,1


In [242]:
df.isnull().values.any()

False

In [243]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['manual_label'], test_size=0.2,random_state=42)

In [244]:
y_train.value_counts()

1    1097
2     415
0      88
Name: manual_label, dtype: int64

In [245]:
vectorizer = TfidfVectorizer(min_df = 1,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

In [246]:
embed_aug = EmbeddingAugmenter()

In [247]:
augmented_sentences=[]
augmented_sentences_labels=[]
for i in X_train.index:
  if y_train[i]==0:
    temps3=embed_aug.augment(X_train[i])
    for sent in temps3:
      augmented_sentences.append(sent)
      augmented_sentences_labels.append(0)

In [248]:
X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)

print(X_train.shape)
print(y_train.shape)

(1688,)
(1688,)


  X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
  y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)


In [249]:
y_train.value_counts()

1    1097
2     415
0     176
dtype: int64

In [250]:
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized=vectorizer.transform(X_test)

In [251]:
XGB_classifier = XGBClassifier()

t0 = time.time()
XGB_classifier.fit(X_train_vectorized,y_train)
t1 = time.time()

y_pred=XGB_classifier.predict(X_test_vectorized)

time_linear_train = t1-t0

In [252]:
print("Training time: %fs" % (time_linear_train))

Training time: 3.900810s


In [253]:
print(accuracy_score(y_test, y_pred))

0.8175


In [254]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.38      0.46        21
           1       0.89      0.88      0.89       277
           2       0.67      0.75      0.70       102

    accuracy                           0.82       400
   macro avg       0.71      0.67      0.68       400
weighted avg       0.82      0.82      0.82       400



## Synonym + Embedding

In [255]:
df = pd.read_csv('/content/drive/MyDrive/IR project/classifier/XGBoost/label_dataset_final.csv')

df.head()

Unnamed: 0,Datetime,Quarter,Likes,NFT,Text,clean_text,manual_label
0,2022-06-27,22,2,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#27894...,mutant ape yacht club mayc nft sold eth k,pos
1,2023-02-11,31,2,Mutant Ape Yacht Club,🐳1 Mutant Ape Yacht Club bought for Ξ15.377\n\...,mutant ape yacht club bought floor h chg floor...,pos
2,2022-12-30,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#77 so...,mutant ape yacht club mayc nft sold eth k,pos
3,2022-12-29,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#23452...,mutant ape yacht club mayc nft sold eth k,pos
4,2022-12-29,24,1,Mutant Ape Yacht Club,Mutant Ape Yacht Club #27908 sold for 17 ETH (...,mutant ape yacht club sold eth nft collection ...,pos


In [256]:
df['manual_label'] = df['manual_label'].replace('neg',0)

In [257]:
df['manual_label'] = df['manual_label'].replace('pos',1)

In [258]:
df['manual_label'] = df['manual_label'].replace('neu',2)

In [259]:
df.head()

Unnamed: 0,Datetime,Quarter,Likes,NFT,Text,clean_text,manual_label
0,2022-06-27,22,2,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#27894...,mutant ape yacht club mayc nft sold eth k,1
1,2023-02-11,31,2,Mutant Ape Yacht Club,🐳1 Mutant Ape Yacht Club bought for Ξ15.377\n\...,mutant ape yacht club bought floor h chg floor...,1
2,2022-12-30,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#77 so...,mutant ape yacht club mayc nft sold eth k,1
3,2022-12-29,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#23452...,mutant ape yacht club mayc nft sold eth k,1
4,2022-12-29,24,1,Mutant Ape Yacht Club,Mutant Ape Yacht Club #27908 sold for 17 ETH (...,mutant ape yacht club sold eth nft collection ...,1


In [260]:
df['manual_label'].value_counts()

1    1374
2     517
0     109
Name: manual_label, dtype: int64

In [261]:
df = df[['clean_text','manual_label']]

df.head()

Unnamed: 0,clean_text,manual_label
0,mutant ape yacht club mayc nft sold eth k,1
1,mutant ape yacht club bought floor h chg floor...,1
2,mutant ape yacht club mayc nft sold eth k,1
3,mutant ape yacht club mayc nft sold eth k,1
4,mutant ape yacht club sold eth nft collection ...,1


In [262]:
df.isnull().values.any()

False

In [263]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['manual_label'], test_size=0.2,random_state=42)

In [264]:
y_train.value_counts()

1    1097
2     415
0      88
Name: manual_label, dtype: int64

In [265]:
vectorizer = TfidfVectorizer(min_df = 1,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

In [266]:
aug = naw.SynonymAug(aug_src='wordnet',aug_max=3)

In [267]:
augmented_sentences=[]
augmented_sentences_labels=[]
for i in X_train.index:
  if y_train[i]==0:
    temps1=aug.augment(X_train[i],n=3)
    for sent in temps1:
      augmented_sentences.append(sent)
      augmented_sentences_labels.append(0)

In [268]:
X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)

print(X_train.shape)
print(y_train.shape)

(1864,)
(1864,)


  X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
  y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)


In [269]:
y_train.value_counts()

1    1097
2     415
0     352
dtype: int64

In [270]:
embed_aug = EmbeddingAugmenter()

In [271]:
augmented_sentences=[]
augmented_sentences_labels=[]
for i in X_train.index:
  if y_train[i]==0:
    temps3=embed_aug.augment(X_train[i])
    for sent in temps3:
      augmented_sentences.append(sent)
      augmented_sentences_labels.append(0)

In [272]:
X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)

print(X_train.shape)
print(y_train.shape)

(2216,)
(2216,)


  X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
  y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)


In [273]:
y_train.value_counts()

1    1097
0     704
2     415
dtype: int64

In [274]:
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized=vectorizer.transform(X_test)

In [275]:
XGB_classifier = XGBClassifier()

t0 = time.time()
XGB_classifier.fit(X_train_vectorized,y_train)
t1 = time.time()

y_pred=XGB_classifier.predict(X_test_vectorized)

time_linear_train = t1-t0

In [276]:
print("Training time: %fs" % (time_linear_train))

Training time: 2.892705s


In [277]:
print(accuracy_score(y_test, y_pred))

0.82


In [278]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.52      0.55        21
           1       0.89      0.89      0.89       277
           2       0.67      0.70      0.68       102

    accuracy                           0.82       400
   macro avg       0.71      0.70      0.71       400
weighted avg       0.82      0.82      0.82       400



## Embedding + Synonym

In [279]:
df = pd.read_csv('/content/drive/MyDrive/IR project/classifier/XGBoost/label_dataset_final.csv')

df.head()

Unnamed: 0,Datetime,Quarter,Likes,NFT,Text,clean_text,manual_label
0,2022-06-27,22,2,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#27894...,mutant ape yacht club mayc nft sold eth k,pos
1,2023-02-11,31,2,Mutant Ape Yacht Club,🐳1 Mutant Ape Yacht Club bought for Ξ15.377\n\...,mutant ape yacht club bought floor h chg floor...,pos
2,2022-12-30,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#77 so...,mutant ape yacht club mayc nft sold eth k,pos
3,2022-12-29,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#23452...,mutant ape yacht club mayc nft sold eth k,pos
4,2022-12-29,24,1,Mutant Ape Yacht Club,Mutant Ape Yacht Club #27908 sold for 17 ETH (...,mutant ape yacht club sold eth nft collection ...,pos


In [280]:
df['manual_label'] = df['manual_label'].replace('neg',0)

In [281]:
df['manual_label'] = df['manual_label'].replace('pos',1)

In [282]:
df['manual_label'] = df['manual_label'].replace('neu',2)

In [283]:
df.head()

Unnamed: 0,Datetime,Quarter,Likes,NFT,Text,clean_text,manual_label
0,2022-06-27,22,2,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#27894...,mutant ape yacht club mayc nft sold eth k,1
1,2023-02-11,31,2,Mutant Ape Yacht Club,🐳1 Mutant Ape Yacht Club bought for Ξ15.377\n\...,mutant ape yacht club bought floor h chg floor...,1
2,2022-12-30,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#77 so...,mutant ape yacht club mayc nft sold eth k,1
3,2022-12-29,24,0,Mutant Ape Yacht Club,🧪 Mutant Ape Yacht Club | #MAYC #NFT\n\n#23452...,mutant ape yacht club mayc nft sold eth k,1
4,2022-12-29,24,1,Mutant Ape Yacht Club,Mutant Ape Yacht Club #27908 sold for 17 ETH (...,mutant ape yacht club sold eth nft collection ...,1


In [284]:
df['manual_label'].value_counts()

1    1374
2     517
0     109
Name: manual_label, dtype: int64

In [285]:
df = df[['clean_text','manual_label']]

df.head()

Unnamed: 0,clean_text,manual_label
0,mutant ape yacht club mayc nft sold eth k,1
1,mutant ape yacht club bought floor h chg floor...,1
2,mutant ape yacht club mayc nft sold eth k,1
3,mutant ape yacht club mayc nft sold eth k,1
4,mutant ape yacht club sold eth nft collection ...,1


In [286]:
df.isnull().values.any()

False

In [287]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['manual_label'], test_size=0.2,random_state=42)

In [288]:
y_train.value_counts()

1    1097
2     415
0      88
Name: manual_label, dtype: int64

In [289]:
vectorizer = TfidfVectorizer(min_df = 1,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

In [290]:
embed_aug = EmbeddingAugmenter()

In [291]:
augmented_sentences=[]
augmented_sentences_labels=[]
for i in X_train.index:
  if y_train[i]==0:
    temps3=embed_aug.augment(X_train[i])
    for sent in temps3:
      augmented_sentences.append(sent)
      augmented_sentences_labels.append(0)

In [292]:
X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)

print(X_train.shape)
print(y_train.shape)

(1688,)
(1688,)


  X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
  y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)


In [293]:
y_train.value_counts()

1    1097
2     415
0     176
dtype: int64

In [294]:
aug = naw.SynonymAug(aug_src='wordnet',aug_max=3)

In [295]:
augmented_sentences=[]
augmented_sentences_labels=[]
for i in X_train.index:
  if y_train[i]==0:
    temps1=aug.augment(X_train[i],n=3)
    for sent in temps1:
      augmented_sentences.append(sent)
      augmented_sentences_labels.append(0)

In [296]:
X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)

print(X_train.shape)
print(y_train.shape)

(2216,)
(2216,)


  X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
  y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)


In [297]:
y_train.value_counts()

1    1097
0     704
2     415
dtype: int64

In [298]:
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized=vectorizer.transform(X_test)

In [299]:
XGB_classifier = XGBClassifier()

t0 = time.time()
XGB_classifier.fit(X_train_vectorized,y_train)
t1 = time.time()

y_pred=XGB_classifier.predict(X_test_vectorized)

time_linear_train = t1-t0

In [300]:
print("Training time: %fs" % (time_linear_train))

Training time: 6.281934s


In [301]:
print(accuracy_score(y_test, y_pred))

0.8025


In [302]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.48      0.50        21
           1       0.89      0.87      0.88       277
           2       0.64      0.69      0.66       102

    accuracy                           0.80       400
   macro avg       0.68      0.68      0.68       400
weighted avg       0.80      0.80      0.80       400

