In [2]:
from functions import *
from aug import *

import numpy as np
import pandas as pd
import pickle
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.manifold import TSNE



from keras import backend as K
from keras.layers import Dense, Dropout, LSTM, Bidirectional
from keras.models import Sequential, load_model, Model
from keras.callbacks import EarlyStopping


import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  #get rid of warnings

2022-12-06 22:55:04.487433: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-06 22:55:04.788119: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-12-06 22:55:05.430231: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-12-06 22:55:05.430279: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

[H[2J

<input type='checkbox'> Train model on Original + Augmented data (with different methods) and test it </input> 

In [None]:
# load data
datasets = ['pc','cr','subj']
aug_methods = ['eda_augmenter','wordnet_augmenter','aeda_augmenter','backtranslation_augmenter','clare_augmenter']
n_samples = [1,2,4,8,10]
#path = f'data/{dataset}/train_{dataset}_{aug_method}_n_samples_{n_sample}.csv'


In [None]:

def create_X_matrix(dataset, w2v,word2vec_len=300, batch_size=25):
    dataset_size = len(dataset)
    x_matrix = np.zeros((dataset_size, batch_size, word2vec_len))    
    for i, line in enumerate(dataset):
        words = line.split()
        words = words[:batch_size] #cut off if too long
        for j, word in enumerate(words):
            if word in w2v:
                x_matrix[i, j, :] = w2v[word]

    return x_matrix


def build_model(batch_size=25, word2vec_len=300):
    model = None
    model = Sequential()
    model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(batch_size, word2vec_len)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(32, return_sequences=False)))
    model.add(Dropout(0.5))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    #print(model.summary())
    return model

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    # print(f'Accuracy: {acc}')
    # print(f'F1: {f1}')
    # print(f'Precision: {precision}')
    # print(f'Recall: {recall}')
    return acc, f1, precision, recall


def run(dataset_name,aug_method,n_sample):
    # Load data
    path_train_original = f'data/{dataset_name}/train.txt'
    path_train_aug = f'data/{dataset_name}/train_{dataset_name}_{aug_method}_n_sample_{n_sample}.csv'
    path_test = f'data/{dataset_name}/test.txt'
    train_aug = load_data(path_train_aug)
    train_original = load_data(path_train_original)
    test_data = load_data(path_test)
    X_train_aug, y_train_aug = train_aug['text'].values, train_aug['class'].values
    X_train_original, y_train_original = train_original['text'].values, train_original['class'].values
    X_test, y_test = test_data['text'].values, test_data['class'].values

    # load wor2vec pickle
    path_w2v = f'data/{dataset_name}/word2vec.p'
    w2v = pickle.load(open(path_w2v, 'rb'))


    # create matrices
    X_train_aug = create_X_matrix(X_train_aug, w2v)
    X_train_original = create_X_matrix(X_train_original, w2v)
    X_test = create_X_matrix(X_test, w2v)

    # Train model
    model_aug = build_model()
    model_original = build_model()

    callbacks = [EarlyStopping(monitor='val_loss', patience=5)]

    model_aug.fit(X_train_aug, y_train_aug, epochs=1000,
                 callbacks=callbacks, validation_split=0.1,
                  batch_size=1024,shuffle=True, verbose=0)
    model_original.fit(X_train_original, y_train_original, epochs=1000,
                    callbacks=callbacks, validation_split=0.1,
                    batch_size=1024,shuffle=True, verbose=0)

    # Evaluate model
    acc_aug, f1_aug, precision_aug, recall_aug = evaluate_model(model_aug, X_test, y_test)
    acc_original, f1_original, precision_original, recall_original = evaluate_model(model_original, X_test, y_test)
    print(f'original model: \n acc: {acc_original} \n f1: {f1_original} \n precision: {precision_original} \n recall: {recall_original}')
    print(f'augmented model: \n acc: {acc_aug} \n f1: {f1_aug} \n precision: {precision_aug} \n recall: {recall_aug}')
    

    # Save model
    #model.save('model.h5')


In [None]:
run('cr','eda_augmenter',4)

In [None]:
df = load_data('data/cr/train_cr_eda_augmenter_n_sample_4.csv')

In [None]:
X_train_aug, y_train_aug = df['text'].values, df['class'].values


In [None]:
y_train_aug

In [None]:
from keras.utils.np_utils import to_categorical
y_train=to_categorical(y_train_aug)

In [None]:
y_train

In [None]:
from functions import *
from aug import *

import numpy as np
import pandas as pd
import pickle
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.manifold import TSNE

from keras import backend as K
from keras.layers import Dense, Dropout, LSTM, Bidirectional
from keras.models import Sequential, load_model, Model
from keras.callbacks import EarlyStopping
from keras.utils.np_utils import to_categorical

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  #get rid of warnings


def create_X_matrix(dataset, w2v,word2vec_len=300, batch_size=25):
    dataset_size = len(dataset)
    x_matrix = np.zeros((dataset_size, batch_size, word2vec_len))    
    for i, line in enumerate(dataset):
        words = line.split()
        words = words[:batch_size] #cut off if too long
        for j, word in enumerate(words):
            if word in w2v:
                x_matrix[i, j, :] = w2v[word]

    return x_matrix

from keras.optimizers import RMSprop
def build_model(batch_size=25, word2vec_len=300):
    model = None
    model = Sequential()
    model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(batch_size, word2vec_len)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(32, return_sequences=False)))
    model.add(Dropout(0.5))
    model.add(Dense(20, activation='relu'))

    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))              
    model.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=1e-4), metrics=['acc'] )
    #print(model.summary())
    return model

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    #y_pred = to_categorical(y_pred.argmax(axis=1))
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    # print(f'Accuracy: {acc}')
    # print(f'F1: {f1}')
    # print(f'Precision: {precision}')
    # print(f'Recall: {recall}')
    return acc, f1, precision, recall


def run(dataset_name,aug_method,n_sample):
    # Load data
    print('Loading data...')
    path_train_original = f'data/{dataset_name}/train.txt'
    path_train_aug = f'data/{dataset_name}/train_{dataset_name}_{aug_method}_n_sample_{n_sample}.csv'
    path_test = f'data/{dataset_name}/test.txt'
    train_aug = load_data(path_train_aug)
    train_original = load_data(path_train_original)
    test_data = load_data(path_test)
    X_train_aug, y_train_aug = train_aug['text'].values, train_aug['class'].values.astype(float)
    X_train_original, y_train_original = train_original['text'].values, train_original['class'].values.astype(float)
    X_test, y_test = test_data['text'].values, test_data['class'].values.astype(float)

    # y_train_aug = to_categorical(y_train_aug)
    # y_train_original = to_categorical(y_train_original)
    # y_test = to_categorical(y_test)


    # load wor2vec pickle
    path_w2v = f'data/{dataset_name}/word2vec.p'
    w2v = pickle.load(open(path_w2v, 'rb'))


    # create matrices
    print('Creating matrices...')
    X_train_aug = create_X_matrix(X_train_aug, w2v)
    X_train_original = create_X_matrix(X_train_original, w2v)
    X_test = create_X_matrix(X_test, w2v)

    # Train model
    print('Training model...')
    model_aug = build_model()
    model_original = build_model()

    callbacks = [EarlyStopping(monitor='val_loss', patience=5)]

    model_aug.fit(X_train_aug, y_train_aug, epochs=1000,
                 callbacks=callbacks, validation_split=0.1,
                  batch_size=1024,shuffle=True, verbose=0)
    model_original.fit(X_train_original, y_train_original, epochs=1000,
                    callbacks=callbacks, validation_split=0.1,
                    batch_size=1024,shuffle=True, verbose=0)

    # Evaluate model
    print('Evaluating model...')

    acc_aug, f1_aug, precision_aug, recall_aug = evaluate_model(model_aug, X_test, y_test)
    acc_original, f1_original, precision_original, recall_original = evaluate_model(model_original, X_test, y_test)
    print(f'original model: \n acc: {acc_original:.4f} \n f1: {f1_original:.4f} \n precision: {precision_original:.4f} \n recall: {recall_original:.4f}')
    print(f'augmented model: \n acc: {acc_aug:.4f} \n f1: {f1_aug:.4f} \n precision: {precision_aug:.4f} \n recall: {recall_aug:.4f}')

    print('finished')

# if __name__ == '__main__':

#     run('cr','eda_augmenter',4)


In [None]:
dataset_name,aug_method,n_sample = 'cr','eda_augmenter',4

print('Loading data...')
path_train_original = f'data/{dataset_name}/train.txt'
path_train_aug = f'data/{dataset_name}/train_{dataset_name}_{aug_method}_n_sample_{n_sample}.csv'
path_test = f'data/{dataset_name}/test.txt'
train_aug = load_data(path_train_aug)
train_original = load_data(path_train_original)
test_data = load_data(path_test)
X_train_aug, y_train_aug = train_aug['text'].values, train_aug['class'].values.astype(float)
X_train_original, y_train_original = train_original['text'].values, train_original['class'].values.astype(float)
X_test, y_test = test_data['text'].values, test_data['class'].values.astype(float)

# y_train_aug = to_categorical(y_train_aug)
# y_train_original = to_categorical(y_train_original)
# y_test = to_categorical(y_test)


# load wor2vec pickle
path_w2v = f'data/{dataset_name}/word2vec.p'
w2v = pickle.load(open(path_w2v, 'rb'))


# create matrices
print('Creating matrices...')
X_train_aug = create_X_matrix(X_train_aug, w2v)
X_train_original = create_X_matrix(X_train_original, w2v)
X_test = create_X_matrix(X_test, w2v)

# Train model
print('Training model...')
model_aug = build_model()
model_original = build_model()

callbacks = [EarlyStopping(monitor='val_loss', patience=5)]

model_aug.fit(X_train_aug, y_train_aug, epochs=1000,
                callbacks=callbacks, validation_split=0.1,
                batch_size=1024,shuffle=True, verbose=0)
model_original.fit(X_train_original, y_train_original, epochs=1000,
                callbacks=callbacks, validation_split=0.1,
                batch_size=1024,shuffle=True, verbose=0)

# Evaluate model
print('Evaluating model...')

# acc_aug, f1_aug, precision_aug, recall_aug = evaluate_model(model_aug, X_test, y_test)
# acc_original, f1_original, precision_original, recall_original = evaluate_model(model_original, X_test, y_test)
# print(f'original model: \n acc: {acc_original:.4f} \n f1: {f1_original:.4f} \n precision: {precision_original:.4f} \n recall: {recall_original:.4f}')
# print(f'augmented model: \n acc: {acc_aug:.4f} \n f1: {f1_aug:.4f} \n precision: {precision_aug:.4f} \n recall: {recall_aug:.4f}')

print('finished')

In [None]:
pred = model_original.predict(X_test)
y = to_categorical(y_test)

In [None]:
for i, j in zip(pred, y):
    print(i, j)

In [None]:
'HANDY AND : YOU CAN USE IT ALMOST ANYWHERE'.split()

In [None]:
from functions import *
import numpy as np
import pandas as pd
import pickle
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.manifold import TSNE

from keras import backend as K
from keras.layers import Dense, Dropout, LSTM, Bidirectional
from keras.models import Sequential, load_model, Model
from keras.callbacks import EarlyStopping
from keras.utils.np_utils import to_categorical
from keras.metrics import Precision, Recall, AUC
from keras.optimizers import RMSprop, Adam

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  #get rid of warnings



def create_y_matrix(y_data):
  y_matrix = np.zeros((len(y_data),2))
  for count,i in enumerate(y_data):
    if i == 1:
      y_matrix[count][1] = 1.0
    else:
      y_matrix[count][0] = 1.0
  return y_matrix

def create_X_matrix(dataset, w2v,word2vec_len=300, batch_size=25):
    dataset_size = len(dataset)
    x_matrix = np.zeros((dataset_size, batch_size, word2vec_len))    
    for i, line in enumerate(dataset):
        # print(line)
        # print(i)
        try:
            words = line.split()
            words = words[:batch_size] #cut off if too long
            for j, word in enumerate(words):
                if word in w2v:
                    x_matrix[i, j, :] = w2v[word]
        except:
            pass
    return x_matrix


def build_model(batch_size=25, word2vec_len=300):
    model = None
    model = Sequential()
    model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(batch_size, word2vec_len)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(32, return_sequences=False)))
    model.add(Dropout(0.5))
    model.add(Dense(20, activation='relu'))

    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))              
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc',Precision(),Recall(),AUC()] )
    return model

def evaluate_model(model, X_test, y_test):
    eval = model.evaluate(X_test, y_test)
    loss = eval[0]
    accuracy = eval[1]
    precision = eval[2]
    recall = eval[3]
    auc = eval[4]    
    f1_score = (2*precision*recall)/(precision+recall)
    # print("Loss: ", loss)
    # print("Accuracy: ", accuracy)
    # print("Precision: ", precision)
    # print("Recall: ", recall)
    # print("AUC: ", auc)
    # print ('f1 score: ', f1_score)
    return loss, accuracy, precision, recall, auc, f1_score


def run(dataset_name,aug_method,n_sample):
    # Load data
    print('Loading data...')
    path_train_original = f'data/{dataset_name}/train.txt'
    path_train_aug = f'data/{dataset_name}/train_{dataset_name}_{aug_method}_n_sample_{n_sample}.csv'
    path_test = f'data/{dataset_name}/test.txt'
    train_aug = load_data(path_train_aug)
    train_original = load_data(path_train_original)
    test_data = load_data(path_test)
    X_train_aug, y_train_aug = train_aug['text'].values, train_aug['class'].values.astype(float)
    X_train_original, y_train_original = train_original['text'].values, train_original['class'].values.astype(float)
    X_test, y_test = test_data['text'].values, test_data['class'].values.astype(float)

    # y_train_aug = to_categorical(y_train_aug)
    # y_train_original = to_categorical(y_train_original)
    # y_test = to_categorical(y_test)


    # load wor2vec pickle
    path_w2v = f'data/{dataset_name}/word2vec.p'
    w2v = pickle.load(open(path_w2v, 'rb'))


    # create matrices
    print('Creating matrices...')
    X_train_aug = create_X_matrix(X_train_aug, w2v)
    X_train_original = create_X_matrix(X_train_original, w2v)
    X_test = create_X_matrix(X_test, w2v)

    # Train model
    print('Training model...')
    model_aug = build_model()
    model_original = build_model()

    callbacks = [EarlyStopping(monitor='val_loss', patience=5)]

    model_aug.fit(X_train_aug, y_train_aug, epochs=1000,
                 callbacks=callbacks, validation_split=0.1,
                  batch_size=1024,shuffle=True, verbose=0)
    model_original.fit(X_train_original, y_train_original, epochs=1000,
                    callbacks=callbacks, validation_split=0.1,
                    batch_size=1024,shuffle=True, verbose=0)

    # Evaluate model
    print('Evaluating model...')

    loss_org, accuracy_org, precision_org, recall_org, auc_org, f1_score_org = evaluate_model(model_original, X_test, y_test)
    loss_aug, accuracy_aug, precision_aug, recall_aug, auc_aug, f1_score_aug = evaluate_model(model_aug, X_test, y_test)
    print(f'original model: \n acc: {accuracy_org:.4f} \n f1: {f1_score_org:.4f} \n precision: {precision_org:.4f} \n recall: {recall_org:.4f}, \n auc: {auc_org:.4f}')
    print(f'augmented model: \n acc: {accuracy_aug:.4f} \n f1: {f1_score_aug:.4f} \n precision: {precision_aug:.4f} \n recall: {recall_aug:.4f}, \n auc: {auc_aug:.4f}')

    print('finished')



run('pc','aeda_augmenter',4)

In [None]:
from functions import *
import numpy as np
import pandas as pd
import pickle
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.manifold import TSNE

from keras import backend as K
from keras.layers import Dense, Dropout, LSTM, Bidirectional
from keras.models import Sequential, load_model, Model
from keras.callbacks import EarlyStopping
from keras.utils.np_utils import to_categorical
from keras.metrics import Precision, Recall, AUC
from keras.optimizers import RMSprop, Adam
import runai.ga.keras


import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  #get rid of warnings



def create_y_matrix(y_data):
  y_matrix = np.zeros((len(y_data),2))
  for count,i in enumerate(y_data):
    if i == 1:
      y_matrix[count][1] = 1.0
    else:
      y_matrix[count][0] = 1.0
  return y_matrix

def create_X_matrix(dataset, w2v,word2vec_len=300, batch_size=25):
    dataset_size = len(dataset)
    x_matrix = np.zeros((dataset_size, batch_size, word2vec_len))    
    for i, line in enumerate(dataset):
        try:
            words = line.split()
            words = words[:batch_size] #cut off if too long
            for j, word in enumerate(words):
                if word in w2v:
                    x_matrix[i, j, :] = w2v[word]
        except:
            pass
    return x_matrix


def build_model(batch_size=25, word2vec_len=300):
    model = None
    model = Sequential()
    model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(batch_size, word2vec_len)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(32, return_sequences=False)))
    model.add(Dropout(0.5))
    model.add(Dense(20, activation='relu'))

    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))  

    STEPS = 10000
    optimizer = runai.ga.keras.optimizers.Optimizer(Adam(), steps=STEPS)            
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['acc',Precision(),Recall(),AUC()] )
    return model

def evaluate_model(model, X_test, y_test):
    eval = model.evaluate(X_test, y_test)
    loss = eval[0]
    accuracy = eval[1]
    precision = eval[2]
    recall = eval[3]
    auc = eval[4]    
    f1_score = (2*precision*recall)/(precision+recall)
    # print("Loss: ", loss)
    # print("Accuracy: ", accuracy)
    # print("Precision: ", precision)
    # print("Recall: ", recall)
    # print("AUC: ", auc)
    # print ('f1 score: ', f1_score)
    return loss, accuracy, precision, recall, auc, f1_score


def run(dataset_name,aug_method,n_sample):
    # Load data
    print('Loading data...')
    path_train_original = f'data/{dataset_name}/train.txt'
    path_train_aug = f'data/{dataset_name}/train_{dataset_name}_{aug_method}_n_sample_{n_sample}.csv'
    path_test = f'data/{dataset_name}/test.txt'
    train_aug = load_data(path_train_aug)
    train_original = load_data(path_train_original)
    test_data = load_data(path_test)
    X_train_aug, y_train_aug = train_aug['text'].values, train_aug['class'].values.astype(float)
    X_train_original, y_train_original = train_original['text'].values, train_original['class'].values.astype(float)
    X_test, y_test = test_data['text'].values, test_data['class'].values.astype(float)

    # y_train_aug = to_categorical(y_train_aug)
    # y_train_original = to_categorical(y_train_original)
    # y_test = to_categorical(y_test)


    # load wor2vec pickle
    path_w2v = f'data/{dataset_name}/word2vec.p'
    w2v = pickle.load(open(path_w2v, 'rb'))


    # create matrices
    print('Creating matrices...')
    X_train_aug = create_X_matrix(X_train_aug, w2v)
    X_train_original = create_X_matrix(X_train_original, w2v)
    X_test = create_X_matrix(X_test, w2v)

    # Train model
    print('Training model...')
    model_aug = build_model()
    model_original = build_model()

    callbacks = [EarlyStopping(monitor='val_loss', patience=5)]

    model_aug.fit(X_train_aug, y_train_aug, epochs=1000,
                 callbacks=callbacks, validation_split=0.1,
                  batch_size=128,shuffle=True, verbose=0)
    model_original.fit(X_train_original, y_train_original, epochs=1000,
                    callbacks=callbacks, validation_split=0.1,
                    batch_size=128,shuffle=True, verbose=0)

    # Evaluate model
    print('Evaluating model...')

    loss_org, accuracy_org, precision_org, recall_org, auc_org, f1_score_org = evaluate_model(model_original, X_test, y_test)
    loss_aug, accuracy_aug, precision_aug, recall_aug, auc_aug, f1_score_aug = evaluate_model(model_aug, X_test, y_test)
    print(f'original model: \n acc: {accuracy_org:.4f} \n f1: {f1_score_org:.4f} \n precision: {precision_org:.4f} \n recall: {recall_org:.4f}, \n auc: {auc_org:.4f}')
    print(f'augmented model: \n acc: {accuracy_aug:.4f} \n f1: {f1_score_aug:.4f} \n precision: {precision_aug:.4f} \n recall: {recall_aug:.4f}, \n auc: {auc_aug:.4f}')

    print('finished')

In [None]:
run('pc','aeda_augmenter',8)

In [None]:
1024//100

## compare results

In [None]:
pc_aeda_large = pd.read_csv('results/pc_aeda_compare_with_large_w2v.csv', index_col=0)
pc_aeda_small = pd.read_csv('results/pc_aeda_compare_with_small_w2v.csv', index_col=0)
pc_eda_large = pd.read_csv('results/pc_eda_compare_with_large_w2v.csv', index_col=0)
pc_eda_small = pd.read_csv('results/pc_eda_compare_with_small_w2v.csv', index_col=0)
pc_wordnet_large = pd.read_csv('results/pc_wordnet_compare_with_large_w2v.csv', index_col=0)
pc_wordnet_small = pd.read_csv('results/pc_wordnet_compare_with_small_w2v.csv', index_col=0)

In [None]:
list_of_results = [pc_aeda_large,pc_aeda_small,pc_eda_large ,pc_eda_small ,pc_wordnet_large,pc_wordnet_small]
list_of_features = ['loss','accuracy','precision','recall','auc''f1_score']


In [None]:
df = pd.concat(list_of_results)
df = df.groupby(df.index,sort=False).mean()
org_series = df['original']

In [None]:
for i in list_of_results:
    i['original'] = org_series

In [None]:
for i in list_of_results:
    i.to_csv(f'results/{i.index[0]}.csv')

In [4]:
pc_aeda_large.to_excel('results/pc_aeda_large_w2v.xlsx')
pc_aeda_small.to_excel('results/pc_aeda_small_w2v.xlsx')
pc_eda_large.to_excel('results/pc_eda_large_w2v.xlsx')
pc_eda_small.to_excel('results/pc_eda_small_w2v.xlsx')
pc_wordnet_large.to_excel('results/pc_wordnet_large_w2v.xlsx')
pc_wordnet_small.to_excel('results/pc_wordnet_small_w2v.xlsx')

In [None]:
pc_eda_small

In [None]:
df = pd.read_csv('results/pc_aeda_large_w2v.csv', index_col=0)

In [None]:
df

In [None]:
df = pd.read_csv('results/pc_aeda_small_w2v.csv', index_col=0)

In [None]:
df

In [3]:
pc_aeda_large = pd.read_csv('results/pc_aeda_large_w2v.csv', index_col=0)
pc_aeda_small = pd.read_csv('results/pc_aeda_small_w2v.csv', index_col=0)
pc_eda_large = pd.read_csv('results/pc_eda_large_w2v.csv', index_col=0)
pc_eda_small = pd.read_csv('results/pc_eda_small_w2v.csv', index_col=0)
pc_wordnet_large = pd.read_csv('results/pc_wordnet_large_w2v.csv', index_col=0)
pc_wordnet_small = pd.read_csv('results/pc_wordnet_small_w2v.csv', index_col=0)

In [None]:
print(pc_aeda_large.to_markdown())

In [None]:
print(pc_aeda_small.to_markdown())

In [None]:
print(pc_eda_large.to_markdown())

In [None]:
print(pc_eda_small.to_markdown())

In [None]:
print(pc_wordnet_large.to_markdown())

In [None]:
print(pc_wordnet_small.to_markdown())

In [7]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go


In [8]:
pc_aeda_large = pd.read_csv('results/pc_aeda_large_w2v.csv', index_col=0)
pc_aeda_small = pd.read_csv('results/pc_aeda_small_w2v.csv', index_col=0)
pc_eda_large = pd.read_csv('results/pc_eda_large_w2v.csv', index_col=0)
pc_eda_small = pd.read_csv('results/pc_eda_small_w2v.csv', index_col=0)
pc_wordnet_large = pd.read_csv('results/pc_wordnet_large_w2v.csv', index_col=0)
pc_wordnet_small = pd.read_csv('results/pc_wordnet_small_w2v.csv', index_col=0)

In [9]:
def vis(df, title):
    df = df.drop('loss',axis=0)
    df['average']= df[['n_sample_1','n_sample_2','n_sample_4','n_sample_8','n_sample_10']].mean(axis=1)
    fig = go.Figure()
    for i in df.columns:
        fig.add_trace(go.scatter(x=df.index, y=df[i], name=i))
    fig.update_layout(title=title)
    fig.show()

In [10]:
vis(pc_aeda_large, 'pc_aeda_large')

TypeError: 'module' object is not callable

In [5]:
vis(pc_eda_large, 'pc_eda_large')

In [6]:
vis(pc_wordnet_large, 'pc_wordnet_large')

In [7]:
vis(pc_wordnet_small, 'pc_wordnet_small')

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [24]:
aeda = pd.read_csv('results/1000sample/compare_1000_pc_aeda_augmenter_org_True.csv', index_col=0)
eda = pd.read_csv('results/1000sample/compare_1000_pc_eda_augmenter_org_True.csv', index_col=0)
wordnet = pd.read_csv('results/1000sample/compare_1000_pc_wordnet_augmenter_org_True.csv', index_col=0)
backtranslation = pd.read_csv('results/1000sample/compare_1000_pc_backtranslation_augmenter_org_True.csv', index_col=0)
clare = pd.read_csv('results/1000sample/compare_1000_pc_clare_augmenter_org_True.csv', index_col=0)

In [25]:
clare

Unnamed: 0,n_sample_1,n_sample_2,n_sample_4,n_sample_8,original
loss,0.5079,0.561711,0.613556,0.635525,0.486834
accuracy,0.846079,0.838558,0.840969,0.842921,0.83741
precision,0.865977,0.840792,0.836515,0.861232,0.82832
recall,0.81688,0.833507,0.8469,0.818035,0.848978
auc,0.916396,0.908464,0.907928,0.904523,0.916809
f1_score,0.840708,0.836981,0.841207,0.838231,0.838522


In [16]:
def vis(df, title):
    df = df.drop('loss',axis=0)
    df['average']= df[['n_sample_1','n_sample_2','n_sample_4','n_sample_8']].mean(axis=1)
    fig = go.Figure()
    for i in df.columns:
        fig.add_trace(go.Scatter(x=df.index, y=df[i], name=i))
    fig.update_layout(title=title)
    fig.show()

In [26]:
def vis(df, title):
    df = df.drop('loss',axis=0)
    #df['average']= df[['n_sample_1','n_sample_2','n_sample_4','n_sample_8']].mean(axis=1)
    fig = go.Figure()
    for i in df.columns:
        fig.add_trace(go.Bar(x=df.index, y=df[i], name=i))
    fig.update_layout(title=title)
    fig.show()

In [16]:

vis(eda, 'aeda 1000 sample')


In [8]:
eda.applymap(lambda x: round(x, 2))

Unnamed: 0,n_sample_1,n_sample_2,n_sample_4,n_sample_8,original
loss,0.61,0.52,0.65,0.63,0.47
accuracy,0.84,0.84,0.82,0.85,0.84
precision,0.82,0.84,0.79,0.87,0.82
recall,0.86,0.84,0.88,0.83,0.88
auc,0.91,0.92,0.91,0.91,0.92
f1_score,0.84,0.84,0.83,0.85,0.85


In [27]:
for i in eda.columns:
    if i != 'original':
        eda[i] = np.absolute(eda[i] - eda['original'])
eda

Unnamed: 0,n_sample_1,n_sample_2,n_sample_4,n_sample_8,original
loss,0.138631,0.051774,0.181311,0.166421,0.467889
accuracy,0.005167,0.002985,0.020209,0.011023,0.840338
precision,0.006777,0.027493,0.029766,0.051907,0.815722
recall,0.020321,0.035446,0.001501,0.049648,0.877035
auc,0.015516,0.00148,0.017306,0.012727,0.922813
f1_score,0.007248,0.002925,0.015861,0.001721,0.845268


In [12]:
eda

Unnamed: 0,n_sample_1,n_sample_2,n_sample_4,n_sample_8,original
loss,0.60652,0.519663,0.6492,0.63431,0.467889
accuracy,0.835171,0.843323,0.820129,0.851361,0.840338
precision,0.822498,0.843214,0.785956,0.867628,0.815722
recall,0.856714,0.841589,0.878536,0.827387,0.877035
auc,0.907297,0.921333,0.905507,0.910086,0.922813
f1_score,0.83802,0.842343,0.829407,0.846988,0.845268


In [13]:
for i in eda.columns:
    print(i)

n_sample_1
n_sample_2
n_sample_4
n_sample_8
original


In [28]:

vis(eda.drop('original',axis=1), 'aeda 1000 sample')