# C  N  N
# ===========================================================

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
import tensorflow.keras.regularizers as regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard
import tensorflow_addons as tfa
from datetime import datetime
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import pickle
import os
import json
os.environ['CUDA_VISIBLE_DEVICES'] = ''
np.random.seed(100)
import random
random.seed(100)
tf.random.set_seed(100)

class CNN:
    def __init__(self,dims,w2v_path,max_seq_len=20,batch_size=128,epochs=20):
        self.dims = dims
        self.max_seq_len = max_seq_len
        self.batch_size = batch_size
        self.epochs = epochs
        with open(w2v_path, 'rb') as f:            
            self.w2v = pickle.load(f)
        self.model = None        
        self.label_mapping = None
        self.n_classes = None
        self.history = None
        self.metrics = None #[tf.keras.metrics.AUC(name='auc'), tfa.metrics.F1Score(self.n_classes, average='weighted', name='f1_score'), 'accuracy']
        log_dir = f"logs/fit/run_only_once" + datetime.now().strftime("%Y%m%d-%H%M%S")
        tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
        decay_rate = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1, mode='auto', min_delta=0.0001 ,min_lr=0.00001)
        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto', restore_best_weights=True)
        self.callbacks = [tensorboard_callback, decay_rate, early_stopping]


    def build_cnn(self):
        if self.n_classes > 2:
            loss = 'categorical_crossentropy'
            activation = 'softmax'
        else:
            loss = 'binary_crossentropy'
            activation = 'sigmoid'

        input_layer = layers.Input(shape=(self.max_seq_len, 300))
        conv1_1 = layers.Conv1D(128, 4, activation='relu', padding='same')(input_layer)
        conv1_2 = layers.Conv1D(128, 5, activation='relu', padding='same')(conv1_1)
        #conv1_3 = layers.Conv1D(128, 5, activation='relu', padding='same')(conv1_2)
        conv_out = layers.Concatenate(axis=1)([conv1_1, conv1_2])

        dropout_rate = 0.5
        dropout_out1 = layers.Dropout(dropout_rate)(conv_out)

        pool_out = layers.MaxPool1D(pool_size=self.max_seq_len, padding='valid')(dropout_out1)
        flatten_out = layers.Flatten()(pool_out)
        dropout_out2 = layers.Dropout(dropout_rate)(flatten_out)
        dense_out = layers.Dense(self.n_classes, activation=activation, kernel_regularizer=regularizers.L2(0.001))(dropout_out2)
        
        self.metrics = [tf.keras.metrics.AUC(name='auc'), tfa.metrics.F1Score(self.n_classes, average='weighted', name='f1_score'), 'accuracy']
        cnn_model = Model(inputs=input_layer, outputs=dense_out)
        cnn_model.compile(optimizer='adam', loss=loss, metrics=self.metrics)
        #cnn_model.summary()
        self.model = cnn_model
        
    def insert_values(self,train_path,test_path):    
        def insert(df):
            
            # initialize x self.and y matrices
            num_lines = len(df)
            self.n_classes = df['class'].nunique()          
            x_matrix = np.zeros((num_lines, self.max_seq_len ,300))
            y_matrix = np.zeros((num_lines, self.n_classes))


            # insert values
            for i, row in df.iterrows():
                label = row[0]
                sentence = row[1]
                if isinstance(sentence, str):
                    words = sentence.split()[:self.max_seq_len]
                    for j, word in enumerate(words):
                        if word in self.w2v:
                            x_matrix[i, j, :] = self.w2v[word]
                else:
                    continue        
                y_matrix[i,label] = 1.0    

            return x_matrix,y_matrix
        
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        self.n_classes = train_df['class'].nunique()        
        unique_classes = train_df['class'].unique()
        labels_map = dict(zip(unique_classes, range(self.n_classes)))

        train_df['class'] = train_df['class'].map(labels_map)
        test_df['class'] = test_df['class'].map(labels_map)

        train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=100)
        print(f'Train size: {len(train_df)}\nValidation size: {len(val_df)}\nTest size: {len(test_df)}')

        train_df = train_df.sample(frac=1).reset_index(drop=True)
        test_df = test_df.sample(frac=1).reset_index(drop=True)
        val_df = val_df.sample(frac=1,random_state=100).reset_index(drop=True)

        train_x, train_y = insert(train_df)
        test_x, test_y = insert(test_df)
        val_x, val_y = insert(val_df)

        return train_x, train_y, test_x, test_y, val_x, val_y, self.n_classes          

    def fit(self,train_x, train_y,  val_x, val_y):
        self.metrics = [tf.keras.metrics.AUC(name='auc'), tfa.metrics.F1Score(self.n_classes, average='weighted', name='f1_score'), 'accuracy']
        self.build_cnn()  
        self.history = self.model.fit(train_x, train_y, batch_size=self.batch_size, epochs=self.epochs, validation_data=(val_x, val_y), callbacks=self.callbacks, verbose=0)
        return self.history
    def evaluate(self,test_x, test_y):
        return self.model.evaluate(test_x, test_y,return_dict=True)



    def run_n_times(self,train_x, train_y, test_x, test_y, val_x, val_y, dataset_name, n=3):
            hist_dict = {}
            res_dict = {}
            best_val_loss = float('inf')
            for i in range(n):
                print(f'Run {i+1} of {n}')
                self.fit(train_x, train_y, val_x, val_y)
                res = self.evaluate(test_x, test_y)
                res_dict[i+1] = res
                if self.history.history['val_loss'][-1] < best_val_loss:
                    best_val_loss = self.history.history['val_loss'][-1]
                    self.model.save(f"models/{dataset_name}_best_model.h5")
                self.model.set_weights([np.zeros(w.shape) for w in self.model.get_weights()])
            
            avg_dict = {metric: round(sum(values[metric] for values in res_dict.values()) / len(res_dict), 4)  for metric in res_dict[1].keys()}
            
            # Save the average results to disk
            os.makedirs("results", exist_ok=True)
            with open(f"results/{dataset_name}_avg_results.txt", "w") as f:
                for key, value in avg_dict.items():
                    f.write(f"{key}: {value}\n")
            
            K.clear_session()
            
            return hist_dict, res_dict, avg_dict

    



In [None]:
class CNN:
    def __init__(self,dims,w2v_path,max_seq_len=20,batch_size=128,epochs=20,batch_size_insert=1000):
        self.dims = dims
        self.max_seq_len = max_seq_len
        self.batch_size = batch_size
        self.batch_size_insert = batch_size_insert
        self.epochs = epochs
        with open(w2v_path, 'rb') as f:            
            self.w2v = pickle.load(f)
        self.model = None        
        self.label_mapping = None
        self.n_classes = None
        self.history = None
        self.metrics = None #[tf.keras.metrics.AUC(name='auc'), tfa.metrics.F1Score(self.n_classes, average='weighted', name='f1_score'), 'accuracy']        
        self.callbacks =None
        


    def build_cnn(self):
        if self.n_classes > 2:
            loss = 'categorical_crossentropy'
            activation = 'softmax'
        else:
            loss = 'binary_crossentropy'
            activation = 'sigmoid'

        input_layer = layers.Input(shape=(self.max_seq_len, 300))
        conv1_1 = layers.Conv1D(128, 4, activation='relu', padding='same')(input_layer)
        conv1_2 = layers.Conv1D(128, 5, activation='relu', padding='same')(conv1_1)
        #conv1_3 = layers.Conv1D(128, 5, activation='relu', padding='same')(conv1_2)
        conv_out = layers.Concatenate(axis=1)([conv1_1, conv1_2])

        dropout_rate = 0.5
        dropout_out1 = layers.Dropout(dropout_rate)(conv_out)

        pool_out = layers.MaxPool1D(pool_size=self.max_seq_len, padding='valid')(dropout_out1)
        flatten_out = layers.Flatten()(pool_out)
        dropout_out2 = layers.Dropout(dropout_rate)(flatten_out)
        dense_out = layers.Dense(self.n_classes, activation=activation, kernel_regularizer=regularizers.L2(0.001))(dropout_out2)
        
        self.metrics = [tf.keras.metrics.AUC(name='auc'), tfa.metrics.F1Score(self.n_classes, average='weighted', name='f1_score'), 'accuracy']
        cnn_model = Model(inputs=input_layer, outputs=dense_out)
        cnn_model.compile(optimizer='adam', loss=loss, metrics=self.metrics)
        #cnn_model.summary()
        self.model = cnn_model
        
    def insert_values(self,train_path,test_path):    
        def insert(df):
            
            # initialize x self.and y matrices
            num_lines = len(df)
            self.n_classes = df['class'].nunique()          
            x_matrix =        np.zeros((num_lines, self.max_seq_len ,300))
            y_matrix = np.zeros((num_lines, self.n_classes))


            # insert values
            for i in range(0, num_lines, self.batch_size_insert):
                df_batch = df.iloc[i:i+self.batch_size_insert]
                batch_size = len(df_batch)
                x_batch = np.zeros((batch_size, self.max_seq_len, 300))
                y_batch = np.zeros((batch_size, self.n_classes))

                for j, row in df_batch.iterrows():
                    label = row[0]
                    sentence = row[1]
                    if isinstance(sentence, str):
                        words = sentence.split()[:self.max_seq_len]
                        for k, word in enumerate(words):
                            if word in self.w2v:
                                x_batch[j-i, k, :] = self.w2v[word]
                    else:
                        continue        
                    y_batch[j-i,label] = 1.0

                x_matrix[i:i+batch_size] = x_batch
                y_matrix[i:i+batch_size] = y_batch

            return x_matrix,y_matrix
    
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        self.n_classes = train_df['class'].nunique()        
        unique_classes = train_df['class'].unique()
        labels_map = dict(zip(unique_classes, range(self.n_classes)))

        train_df['class'] = train_df['class'].map(labels_map)
        test_df['class'] = test_df['class'].map(labels_map)

        train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=100)
        print(f'Train size: {len(train_df)}\nValidation size: {len(val_df)}\nTest size: {len(test_df)}')

        train_df = train_df.sample(frac=1).reset_index(drop=True)
        test_df = test_df.sample(frac=1).reset_index(drop=True)
        val_df = val_df.sample(frac=1,random_state=100).reset_index(drop=True)

        train_x, train_y = insert(train_df)
        test_x, test_y = insert(test_df)
        val_x, val_y = insert(val_df)

        return train_x, train_y, test_x, test_y, val_x, val_y, self.n_classes          

    def fit(self,train_x, train_y,  val_x, val_y):
        self.metrics = [tf.keras.metrics.AUC(name='auc'), tfa.metrics.F1Score(self.n_classes, average='weighted', name='f1_score'), 'accuracy']
        self.build_cnn()  
        self.history = self.model.fit(train_x, train_y, batch_size=self.batch_size, epochs=self.epochs, validation_data=(val_x, val_y), callbacks=self.callbacks, verbose=0)
        return self.history
    def evaluate(self,test_x, test_y):
        return self.model.evaluate(test_x, test_y,return_dict=True)



    def run_n_times(self,train_x, train_y, test_x, test_y, val_x, val_y, dataset_name, n=3):
        
        log_dir = f"logs/fit/{dataset_name}/" + datetime.now().strftime("%Y%m%d-%H%M%S")
        tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
        decay_rate = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1, mode='auto', min_delta=0.0001 ,min_lr=0.00001)
        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto', restore_best_weights=True)
        self.callbacks = [tensorboard_callback, decay_rate, early_stopping]

        hist_dict = {}
        res_dict = {}
        best_val_loss = float('inf')
        for i in range(n):
            print(f'Run {i+1} of {n}')
            try:
                self.fit(train_x, train_y, val_x, val_y)
            except tf.errors.ResourceExhaustedError:
                K.clear_session()
                self.model = None
                self.build_cnn()
                continue
            res = self.evaluate(test_x, test_y)
            res_dict[i+1] = res
            if self.history.history['val_loss'][-1] < best_val_loss:
                best_val_loss = self.history.history['val_loss'][-1]
                self.model.save(f"models/{dataset_name}_best_model.h5")
            self.model.set_weights([np.zeros(w.shape) for w in self.model.get_weights()])
        
        avg_dict = {metric: round(sum(values[metric] for values in res_dict.values()) / len(res_dict), 4)  for metric in res_dict[1].keys()}
        
        # Save the average results to disk
        os.makedirs("results", exist_ok=True)
        with open(f"results/{dataset_name}_avg_results.txt", "w") as f:
            for key, value in avg_dict.items():
                f.write(f"{key}: {value}\n")
        
        K.clear_session()
        
        return hist_dict, res_dict, avg_dict


In [None]:
train_path  = 'data/original/agnews/train.csv'
test_path   = 'data/original/agnews/test.csv'
w2v_path = 'w2v.pkl'
name = 'agnews'
max_seq_len = 150
batch_size = 8
epochs = 30
cnn = CNN(dims=300, w2v_path=w2v_path, max_seq_len=20, batch_size=128, epochs=20)
train_x, train_y, test_x, test_y, val_x, val_y, n_classes = cnn.insert_values(train_path,test_path)
hist_dict, res_dict, avg_dict = cnn.run_n_times(train_x, train_y, test_x, test_y, val_x, val_y, name, n=3)


# model = CNN(dims=300, max_seq_len=max_seq_len, batch_size=batch_size, epochs=epochs, w2v_path=w2v_path)
# train_x, train_y, test_x, test_y, val_x, val_y, n_classes = model.insert_values(train_path,test_path)
# his,res,avg = model.run_n_times(train_x, train_y, test_x, test_y, val_x, val_y, n=3)
# print (avg)

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
import tensorflow.keras.regularizers as regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard
import tensorflow_addons as tfa
from datetime import datetime
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import pickle
import os
import json
os.environ['CUDA_VISIBLE_DEVICES'] = ''
np.random.seed(100)
import random
random.seed(100)
tf.random.set_seed(100)



class LSTM:
    def __init__(self,dims,w2v_path,max_seq_len=20,batch_size=128,epochs=20,chunk_size=1000):
        self.dims = dims
        self.max_seq_len = max_seq_len
        self.batch_size = batch_size
        self.chunk_size = chunk_size
        self.epochs = epochs
        with open(w2v_path, 'rb') as f:            
            self.w2v = pickle.load(f)
        self.model = None                
        self.n_classes = None
        self.history = None
        self.metrics = None #[tf.keras.metrics.AUC(name='auc'), tfa.metrics.F1Score(self.n_classes, average='weighted', name='f1_score'), 'accuracy']        
        self.callbacks =None
        

    def build_lstm(self):
        if self.n_classes > 2:
            loss = 'categorical_crossentropy'
            activation = 'softmax'
        else:
            loss = 'binary_crossentropy'
            activation = 'sigmoid'

        input_layer = layers.Input(shape=(self.max_seq_len, 300))
        lstm_1 = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(input_layer)
        dropout_rate = 0.5
        dropout_out1 = layers.Dropout(dropout_rate)(lstm_1)
        lstm_2 = layers.Bidirectional(layers.LSTM(32, return_sequences=False))(dropout_out1)
        dropout_out2 = layers.Dropout(dropout_rate)(lstm_2)
        dense_1 = layers.Dense(20, activation='relu')(dropout_out2)
        dense_out = layers.Dense(self.n_classes, activation=activation, kernel_regularizer=regularizers.L2(0.001))(dense_1)
        
        self.metrics = [tf.keras.metrics.AUC(name='auc'), tfa.metrics.F1Score(self.n_classes, average='weighted', name='f1_score'), 'accuracy']
        lstm_model = Model(inputs=input_layer, outputs=dense_out)
        lstm_model.compile(optimizer='adam', loss=loss, metrics=self.metrics)
        #lstm_model.summary()
        self.model = lstm_model
        
    def insert_values(self,train_path,test_path):    
        def insert(df):
            
            # initialize x self.and y matrices
            num_lines = len(df)
            self.n_classes = df['class'].nunique()          
            x_matrix = np.zeros((num_lines, self.max_seq_len ,300))
            y_matrix = np.zeros((num_lines, self.n_classes))


            # insert values
            for i in range(0, num_lines, self.chunk_size):
                df_batch = df.iloc[i:i+self.chunk_size]
                batch_size = len(df_batch)
                x_batch = np.zeros((batch_size, self.max_seq_len, 300))
                y_batch = np.zeros((batch_size, self.n_classes))

                for j, row in df_batch.iterrows():
                    label = row[0]
                    sentence = row[1]
                    if isinstance(sentence, str):
                        words = sentence.split()[:self.max_seq_len]
                        for k, word in enumerate(words):
                            if word in self.w2v:
                                x_batch[j-i, k, :] = self.w2v[word]
                    else:
                        continue        
                    y_batch[j-i,label] = 1.0

                x_matrix[i:i+batch_size] = x_batch
                y_matrix[i:i+batch_size] = y_batch

            return x_matrix,y_matrix
    
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        self.n_classes = train_df['class'].nunique()        
        unique_classes = train_df['class'].unique()
        labels_map = dict(zip(unique_classes, range(self.n_classes)))

        train_df['class'] = train_df['class'].map(labels_map)
        test_df['class'] = test_df['class'].map(labels_map)

        train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=100)
        print(f'Train size: {len(train_df)}\nValidation size: {len(val_df)}\nTest size: {len(test_df)}')

        train_df = train_df.sample(frac=1).reset_index(drop=True)
        test_df = test_df.sample(frac=1).reset_index(drop=True)
        val_df = val_df.sample(frac=1,random_state=100).reset_index(drop=True)

        train_x, train_y = insert(train_df)
        test_x, test_y = insert(test_df)
        val_x, val_y = insert(val_df)

        return train_x, train_y, test_x, test_y, val_x, val_y, self.n_classes          

    def fit(self,train_x, train_y,  val_x, val_y):
        self.metrics = [tf.keras.metrics.AUC(name='auc'), tfa.metrics.F1Score(self.n_classes, average='weighted', name='f1_score'), 'accuracy']
        self.build_lstm()  
        self.history = self.model.fit(train_x, train_y, batch_size=self.batch_size, epochs=self.epochs, validation_data=(val_x, val_y), callbacks=self.callbacks, verbose=0)
        return self.history
    def evaluate(self,test_x, test_y):
        return self.model.evaluate(test_x, test_y,return_dict=True)



    def run_n_times(self,train_x, train_y, test_x, test_y, val_x, val_y, dataset_name, n=3):
        
        log_dir = f"logs/fit/{dataset_name}/" + datetime.now().strftime("%Y%m%d-%H%M%S")
        tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
        decay_rate = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1, mode='auto', min_delta=0.0001 ,min_lr=0.00001)
        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto', restore_best_weights=True)
        self.callbacks = [tensorboard_callback, decay_rate, early_stopping]

        hist_dict = {}
        res_dict = {}
        best_val_loss = float('inf')
        for i in range(n):
            print(f'Run {i+1} of {n}')
            try:
                self.fit(train_x, train_y, val_x, val_y)
            except tf.errors.ResourceExhaustedError:
                K.clear_session()
                self.model = None
                self.build_lstm()
                continue
            res = self.evaluate(test_x, test_y)
            res_dict[i+1] = res
            if self.history.history['val_loss'][-1] < best_val_loss:
                best_val_loss = self.history.history['val_loss'][-1]
                self.model.save(f"models/{dataset_name}_best_model.h5")
            self.model.set_weights([np.zeros(w.shape) for w in self.model.get_weights()])
        
        avg_dict = {metric: round(sum(values[metric] for values in res_dict.values()) / len(res_dict), 4)  for metric in res_dict[1].keys()}
        
        # Save the average results to disk
        os.makedirs("results", exist_ok=True)
        with open(f"results/{dataset_name}_avg_results.txt", "w") as f:
            for key, value in avg_dict.items():
                f.write(f"{key}: {value}\n")
        
        K.clear_session()
        
        return hist_dict, res_dict, avg_dict


In [None]:
train_path  = 'data/original/cr/train.csv'
test_path   = 'data/original/cr/test.csv'
w2v_path = 'w2v.pkl'
name = 'cr'
max_seq_len = 64
batch_size = 128
epochs = 30
lstm = LSTM(dims=300, w2v_path=w2v_path, max_seq_len=max_seq_len, batch_size=batch_size, epochs=epochs)
train_x, train_y, test_x, test_y, val_x, val_y, n_classes = lstm.insert_values(train_path,test_path)
hist_dict, res_dict, avg_dict = lstm.run_n_times(train_x, train_y, test_x, test_y, val_x, val_y, name, n=3)

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
import tensorflow.keras.regularizers as regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard
import tensorflow_addons as tfa
from datetime import datetime
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import pickle
import os
import json
import random

os.environ['CUDA_VISIBLE_DEVICES'] = ''
np.random.seed(100)
random.seed(100)
tf.random.set_seed(100)


class CNN:
    def __init__(self, dims, w2v_path, max_seq_len=20, batch_size=128, epochs=20, chunk_size=1000):
        self.dims = dims
        self.max_seq_len = max_seq_len
        self.batch_size = batch_size
        self.chunk_size = chunk_size
        self.epochs = epochs
        with open(w2v_path, 'rb') as f:
            self.w2v = pickle.load(f)
        self.model = None
        self.n_classes = None
        self.history = None
        self.metrics = None
        self.callbacks = None

    def build_cnn(self):
        if self.n_classes > 2:
            loss = 'categorical_crossentropy'
            activation = 'softmax'
        else:
            loss = 'binary_crossentropy'
            activation = 'sigmoid'

        input_layer = layers.Input(shape=(self.max_seq_len, 300))
        conv1_1 = layers.Conv1D(128, 4, activation='relu', padding='same')(input_layer)
        conv1_2 = layers.Conv1D(128, 5, activation='relu', padding='same')(conv1_1)
        #conv1_3 = layers.Conv1D(128, 5, activation='relu', padding='same')(conv1_2)
        conv_out = layers.Concatenate(axis=1)([conv1_1, conv1_2])

        dropout_rate = 0.5
        dropout_out1 = layers.Dropout(dropout_rate)(conv_out)

        pool_out = layers.MaxPool1D(pool_size=self.max_seq_len, padding='valid')(dropout_out1)
        flatten_out = layers.Flatten()(pool_out)
        dropout_out2 = layers.Dropout(dropout_rate)(flatten_out)
        dense_out = layers.Dense(self.n_classes, activation=activation, kernel_regularizer=regularizers.L2(0.001))(dropout_out2)
        
        self.metrics = [tf.keras.metrics.AUC(name='auc'), tfa.metrics.F1Score(self.n_classes, average='weighted', name='f1_score'), 'accuracy']
        cnn_model = Model(inputs=input_layer, outputs=dense_out)
        cnn_model.compile(optimizer='adam', loss=loss, metrics=self.metrics)
        #cnn_model.summary()
        self.model = cnn_model

    def prepare_dataset(self, df):
        def generator():
            for _, row in df.iterrows():
                label = row[0]
                sentence = row[1]
                x = np.zeros((self.max_seq_len, 300))
                y = np.zeros(self.n_classes)

                if isinstance(sentence, str):
                    words = sentence.split()[:self.max_seq_len]
                    for k, word in enumerate(words):
                        if word in self.w2v:
                            x[k, :] = self.w2v[word]
                y[label] = 1.0
                yield x, y

        dataset = tf.data.Dataset.from_generator(
            generator,
            output_signature=(
                tf.TensorSpec(shape=(self.max_seq_len, 300), dtype=tf.float32),
                tf.TensorSpec(shape=(self.n_classes,), dtype=tf.float32)
            )
        )
        return dataset


    def insert_values(self, train_path, test_path):
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        self.n_classes = train_df['class'].nunique()
        unique_classes = train_df['class'].unique()
        labels_map = dict(zip(unique_classes, range(self.n_classes)))

        train_df['class'] = train_df['class'].map(labels_map)
        test_df['class'] = test_df['class'].map(labels_map)

        train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=100)
        print(f'Train size: {len(train_df)}\nValidation size: {len(val_df)}\nTest size: {len(test_df)}')

        train_df = train_df.sample(frac=1).reset_index(drop=True)
        test_df = test_df.sample(frac=1).reset_index(drop=True)
        val_df = val_df.sample(frac=1, random_state=100).reset_index(drop=True)

        train_dataset = self.prepare_dataset(train_df).batch(self.batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
        test_dataset = self.prepare_dataset(test_df).batch(self.batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
        val_dataset = self.prepare_dataset(val_df).batch(self.batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)

        return train_dataset, test_dataset, val_dataset, self.n_classes

    def fit(self, train_dataset, val_dataset):
        self.metrics = [tf.keras.metrics.AUC(name='auc'), tfa.metrics.F1Score(self.n_classes, average='weighted', name='f1_score'), 'accuracy']
        self.build_cnn()
        self.history = self.model.fit(train_dataset, epochs=self.epochs, validation_data=val_dataset, callbacks=self.callbacks, verbose=0)
        return self.history

    def evaluate(self, test_dataset):
        return self.model.evaluate(test_dataset, return_dict=True)

    def run_n_times(self, train_dataset, test_dataset, val_dataset, dataset_name, n=3):

        log_dir = f"logs/fit/{dataset_name}/" + datetime.now().strftime("%Y%m%d-%H%M%S")
        tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
        decay_rate = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1, mode='auto', min_delta=0.0001 ,min_lr=0.00001)
        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto', restore_best_weights=True)
        self.callbacks = [tensorboard_callback, decay_rate, early_stopping]

        hist_dict = {}
        res_dict = {}
        best_val_loss = float('inf')
        for i in range(n):
            print(f'Run {i+1} of {n}')
            try:
                self.fit(train_dataset, val_dataset)  # Updated to use train_dataset and val_dataset
            except tf.errors.ResourceExhaustedError:
                K.clear_session()
                self.model = None
                self.build_cnn()
                continue
            res = self.evaluate(test_dataset)  # Updated to use test_dataset
            res_dict[i+1] = res
            if self.history.history['val_loss'][-1] < best_val_loss:
                best_val_loss = self.history.history['val_loss'][-1]
                self.model.save(f"models/{dataset_name}_best_model.h5")
            self.model.set_weights([np.zeros(w.shape) for w in self.model.get_weights()])

        avg_dict = {metric: round(sum(values[metric] for values in res_dict.values()) / len(res_dict), 4) for metric in res_dict[1].keys()}

        # Save the average results to disk
        os.makedirs("results", exist_ok=True)
        with open(f"results/{dataset_name}_avg_results.txt", "w") as f:
            for key, value in avg_dict.items():
                f.write(f"{key}: {value}\n")

        K.clear_session()

        return hist_dict, res_dict, avg_dict

In [None]:
name = 'cardio'
train_path = f'data/original/{name}/train.csv'
test_path = f'data/original/{name}/test.csv'
w2v_path = 'w2v.pkl'
dataset_name = f'{name}'
max_seq_len = 64
batch_size = 128
epochs = 30

cnn = CNN(dims=300, w2v_path=w2v_path, max_seq_len=max_seq_len, batch_size=batch_size, epochs=epochs, chunk_size=1000)
train_dataset, test_dataset, val_dataset, n_classes = cnn.insert_values(train_path, test_path)  # Updated to return datasets
hist_dict, res_dict, avg_dict = cnn.run_n_times(train_dataset, test_dataset, val_dataset, name, n=3)  # Updated to use datasets

# = = = = =  = = = = = = =
# L  S  T  M
# = = = = = = = = = = = = 

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
import tensorflow.keras.regularizers as regularizers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard
import tensorflow_addons as tfa
from datetime import datetime
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import pickle
import os
import json
import random

os.environ['CUDA_VISIBLE_DEVICES'] = ''
np.random.seed(100)
random.seed(100)
tf.random.set_seed(100)


class LSTM:
    def __init__(self, dims, w2v_path, max_seq_len=20, batch_size=128, epochs=20, chunk_size=1000):
        self.dims = dims
        self.max_seq_len = max_seq_len
        self.batch_size = batch_size
        self.chunk_size = chunk_size
        self.epochs = epochs
        with open(w2v_path, 'rb') as f:
            self.w2v = pickle.load(f)
        self.model = None
        self.n_classes = None
        self.history = None
        self.metrics = None
        self.callbacks = None

    def build_lstm(self):
        if self.n_classes > 2:
            loss = 'categorical_crossentropy'
            activation = 'softmax'
        else:
            loss = 'binary_crossentropy'
            activation = 'sigmoid'

        input_layer = layers.Input(shape=(self.max_seq_len, 300))
        lstm_1 = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(input_layer)
        dropout_rate = 0.5
        dropout_out1 = layers.Dropout(dropout_rate)(lstm_1)
        lstm_2 = layers.Bidirectional(layers.LSTM(32, return_sequences=False))(dropout_out1)
        dropout_out2 = layers.Dropout(dropout_rate)(lstm_2)
        dense_1 = layers.Dense(20, activation='relu')(dropout_out2)
        dense_out = layers.Dense(self.n_classes, activation=activation, kernel_regularizer=regularizers.L2(0.001))(dense_1)
        
        self.metrics = [tf.keras.metrics.AUC(name='auc'), tfa.metrics.F1Score(self.n_classes, average='weighted', name='f1_score'), 'accuracy']
        lstm_model = Model(inputs=input_layer, outputs=dense_out)
        lstm_model.compile(optimizer='adam', loss=loss, metrics=self.metrics)
        #lstm_model.summary()
        self.model = lstm_model

    def prepare_dataset(self, df):
        def generator():
            for _, row in df.iterrows():
                label = row[0]
                sentence = row[1]
                x = np.zeros((self.max_seq_len, 300))
                y = np.zeros(self.n_classes)

                if isinstance(sentence, str):
                    words = sentence.split()[:self.max_seq_len]
                    for k, word in enumerate(words):
                        if word in self.w2v:
                            x[k, :] = self.w2v[word]
                y[label] = 1.0
                yield x, y

        dataset = tf.data.Dataset.from_generator(
            generator,
            output_signature=(
                tf.TensorSpec(shape=(self.max_seq_len, 300), dtype=tf.float32),
                tf.TensorSpec(shape=(self.n_classes,), dtype=tf.float32)
            )
        )
        return dataset


    def insert_values(self, train_path, test_path):
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        self.n_classes = train_df['class'].nunique()
        unique_classes = train_df['class'].unique()
        labels_map = dict(zip(unique_classes, range(self.n_classes)))

        train_df['class'] = train_df['class'].map(labels_map)
        test_df['class'] = test_df['class'].map(labels_map)

        train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=100)
        print(f'Train size: {len(train_df)}\nValidation size: {len(val_df)}\nTest size: {len(test_df)}')

        train_df = train_df.sample(frac=1).reset_index(drop=True)
        test_df = test_df.sample(frac=1).reset_index(drop=True)
        val_df = val_df.sample(frac=1, random_state=100).reset_index(drop=True)

        train_dataset = self.prepare_dataset(train_df).batch(self.batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
        test_dataset = self.prepare_dataset(test_df).batch(self.batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
        val_dataset = self.prepare_dataset(val_df).batch(self.batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)

        return train_dataset, test_dataset, val_dataset, self.n_classes

    def fit(self, train_dataset, val_dataset):
        self.metrics = [tf.keras.metrics.AUC(name='auc'), tfa.metrics.F1Score(self.n_classes, average='weighted', name='f1_score'), 'accuracy']
        self.build_lstm()
        self.history = self.model.fit(train_dataset, epochs=self.epochs, validation_data=val_dataset, callbacks=self.callbacks, verbose=0)
        return self.history

    def evaluate(self, test_dataset):
        return self.model.evaluate(test_dataset, return_dict=True)

    def run_n_times(self, train_dataset, test_dataset, val_dataset, dataset_name, n=3):

        log_dir = f"logs/fit/{dataset_name}/" + datetime.now().strftime("%Y%m%d-%H%M%S")
        tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
        decay_rate = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1, mode='auto', min_delta=0.0001 ,min_lr=0.00001)
        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto', restore_best_weights=True)
        self.callbacks = [tensorboard_callback, decay_rate, early_stopping]

        hist_dict = {}
        res_dict = {}
        best_val_loss = float('inf')
        for i in range(n):
            print(f'Run {i+1} of {n}')
            try:
                self.fit(train_dataset, val_dataset)  # Updated to use train_dataset and val_dataset
            except tf.errors.ResourceExhaustedError:
                K.clear_session()
                self.model = None
                self.build_lstm()
                continue
            res = self.evaluate(test_dataset)  # Updated to use test_dataset
            res_dict[i+1] = res
            if self.history.history['val_loss'][-1] < best_val_loss:
                best_val_loss = self.history.history['val_loss'][-1]
                self.model.save(f"models/lstm/{dataset_name}_best_model.h5")
            self.model.set_weights([np.zeros(w.shape) for w in self.model.get_weights()])

        avg_dict = {metric: round(sum(values[metric] for values in res_dict.values()) / len(res_dict), 4) for metric in res_dict[1].keys()}

        # Save the average results to disk
        os.makedirs("results/lstm", exist_ok=True)
        with open(f"results/lstm/{dataset_name}_avg_results.txt", "w") as f:
            for key, value in avg_dict.items():
                f.write(f"{key}: {value}\n")

        K.clear_session()

        return hist_dict, res_dict, avg_dict


In [None]:
name = 'cr'
train_path = f'data/original/{name}/train.csv'
test_path = f'data/original/{name}/test.csv'
w2v_path = 'w2v.pkl'
dataset_name = f'{name}'
max_seq_len = 128
batch_size = 128
epochs = 30

lstm = LSTM(dims=300, w2v_path=w2v_path, max_seq_len=max_seq_len, batch_size=batch_size, epochs=epochs, chunk_size=1000)
train_dataset, test_dataset, val_dataset, n_classes = lstm.insert_values(train_path, test_path)  # Updated to return datasets
hist_dict, res_dict, avg_dict = lstm.run_n_times(train_dataset, test_dataset, val_dataset, name, n=3)  # Updated to use datasets

# B   E   R   T
# ===========================================================

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score
import os
# disable wandb
os.environ["WANDB_DISABLED"] = "true"

train_path = 'data/original/cr/train.csv'
test_path = 'data/original/cr/test.csv'
# Load data
train_data = pd.read_csv(train_path).sample(frac=1).reset_index(drop=True)
test_data = pd.read_csv(test_path)

encoder = LabelEncoder()
train_data['class'] = encoder.fit_transform(train_data['class'])
test_data['class'] = encoder.transform(test_data['class'])
# Remove rows with missing or invalid 'text' values
train_data = train_data[train_data['text'].apply(lambda x: isinstance(x, str))]
test_data = test_data[test_data['text'].apply(lambda x: isinstance(x, str))]
    
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(train_data['class'].unique()))

# Tokenize data
train_encodings = tokenizer(train_data['text'].tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_data['text'].tolist(), truncation=True, padding=True,max_length=512)

# Create dataset class
class SimpleDataset:
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = SimpleDataset(train_encodings, train_data['class'].tolist())
test_dataset = SimpleDataset(test_encodings, test_data['class'].tolist())

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    probs = softmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    accuracy = accuracy_score(labels, preds)
    
    # Calculate AUC
    if len(np.unique(labels)) > 2:  # Multi-class case
        auc = roc_auc_score(labels, probs, multi_class="ovo", average="weighted")
    else:  # Binary case
        auc = roc_auc_score(labels, probs[:, 1])  # Use the probability of the positive class

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "auc": auc
    }
# Training and evaluation
training_args = TrainingArguments(
    output_dir='./results/bert',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    eval_steps=100,
    logging_steps=10,
    save_steps=0,
    logging_dir='./logs/bert',
    learning_rate=2e-5,
    #fp16=True,
   gradient_accumulation_steps = 8

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
results = trainer.evaluate()
print("Evaluation results:", results)


In [None]:
res = trainer.predict(test_dataset)

In [None]:
import numpy as np
pred = np.argmax(res.predictions, axis=1)

In [None]:
def calculate_accuracy(y_predict, y_ground_truth):
    assert len(y_predict) == len(y_ground_truth), "Both lists must have the same length."

    correct_predictions = 0

    for pred, gt in zip(y_predict, y_ground_truth):
        if pred == gt:
            correct_predictions += 1

    accuracy = correct_predictions / len(y_ground_truth) * 100
    return accuracy

In [None]:
y_ground_truth = test_data['class'].tolist()
y_predict = pred
accuracy = calculate_accuracy(pred, y_ground_truth)
print(f"Accuracy: {accuracy:.2f}%")

In [None]:
res = trainer.evaluate()


In [None]:
[i.split('_')[-1] for idx,i in enumerate(list(res.keys())) if idx < 6]

In [None]:
res

In [None]:
new_dict = {}
order = ['loss', 'auc', 'f1','accuracy']
for i in res:
    if i.split('_')[-1] in order:
        key = i.split('_')[-1]
        new_dict[key] = res[i]#.__format__('0.4f')
df = pd.DataFrame(new_dict, index=[0])

In [None]:
new_dict

In [None]:
df

In [None]:
res_dict = {}
avg_dict = {metric: round(sum(values[metric] for values in res_dict.values()) / len(res_dict), 4) for metric in res_dict[1].keys()}
# Save the average results to disk
import os
os.makedirs("results/bert", exist_ok=True)
with open(f"results/bert/{dataset_name}_avg_results.txt", "w") as f:
    for key, value in avg_dict.items():
        f.write(f"{key}: {value}\n")

In [None]:
train_data = pd.read_csv(train_path).sample(frac=1).reset_index(drop=True)

In [None]:
train_data['class'].nunique(), len(train_data['class'].unique())

## =========Bert V2 =========

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast ,DistilBertForSequenceClassification# AdamW
from torch.optim import AdamW
import torch
from numpy import mean
from torch import nn
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

In [None]:
train_path = 'data/original/cr/train.csv'
test_path = 'data/original/cr/test.csv'
# Load data
train_data = pd.read_csv(train_path).sample(frac=1).reset_index(drop=True)
test_data = pd.read_csv(test_path).sample(frac=1).reset_index(drop=True)
train_texts = train_data['text'].tolist()
train_labels = train_data['class'].tolist()
test_texts = test_data['text'].tolist()
test_labels = test_data['class'].tolist()

print(f"Number of training examples: {len(train_texts)}")
print(f"Number of test examples: {len(test_texts)}")

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [None]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    

In [None]:
# parameters
num_of_epochs = 3
learning_rate = 10e-6

In [None]:
train_dataset = Dataset(train_encodings, train_labels)
test_dataset = Dataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
# model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)

In [None]:
#optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

In [None]:
def train(data_loader, model, optimizer):
    model.train()
    epoch_loss = 0
    size = len(data_loader.dataset)	
    for i,batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        optimizer.zero_grad()
        loss = outputs.loss
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Train loss: {epoch_loss/size:.4f}")
    

In [None]:
def test(data_loader, model):
    model.eval()
    size = len(data_loader.dataset)
    test_loss, accuracy = 0, 0
    with torch.no_grad():
        for batch in data_loader:
            X, y = batch['input_ids'].to(device), batch['labels'].to(device)
            pred = model(X,labels=y)
            test_loss += pred.loss
            accuracy += (pred.logits.softmax(1).argmax(1) == y).type(torch.float).sum().item()
        test_loss /= size
        accuracy /= size
        print(f"Test loss: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {test_loss:>8f} \n")


In [None]:
tqdm.write("Training the model...")
tqdm.pandas()
for i in tqdm(range(num_of_epochs)):
    print(f'Epoch {i+1}')
    train(train_loader, model, optimizer)
    test(test_loader, model)

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
os.environ["WANDB_DISABLED"] = "true"

train_path = 'data/original/cr/train.csv'
test_path = 'data/original/cr/test.csv'
# Load data
train_data = pd.read_csv(train_path).sample(frac=1).reset_index(drop=True)
test_data = pd.read_csv(test_path).sample(frac=1).reset_index(drop=True)
train_texts = train_data['text'].tolist()
train_labels = train_data['class'].tolist()
test_texts = test_data['text'].tolist()
test_labels = test_data['class'].tolist()

print(f"Number of training examples: {len(train_texts)}")
print(f"Number of test examples: {len(test_texts)}")

# Tokenize data
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Create PyTorch Dataset
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Define model and trainer
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(set(train_labels)))
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(-1))}
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)


# ======== BERT Combined ========

In [None]:
import os
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from scipy.special import softmax
import torch

os.environ["WANDB_DISABLED"] = "true"

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

class TextClassifier:
    def __init__(self, model_name, train_path, test_path, training_args):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.train_dataset, self.test_dataset, self.n_classes = self.prepare_dataset(train_path, test_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=self.n_classes)
        self.training_args = training_args
        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.test_dataset,
            compute_metrics=self.compute_metrics
        )

    def prepare_dataset(self, train_path, test_path):
        train_data = pd.read_csv(train_path).sample(frac=1).reset_index(drop=True)
        test_data = pd.read_csv(test_path).sample(frac=1).reset_index(drop=True)
        encoder = LabelEncoder()
        train_data['class'] = encoder.fit_transform(train_data['class'])
        test_data['class'] = encoder.transform(test_data['class'])

        train_encodings = self.tokenizer(train_data['text'].tolist(), truncation=True, padding=True)
        test_encodings = self.tokenizer(test_data['text'].tolist(), truncation=True, padding=True)

        train_dataset = TextDataset(train_encodings, train_data['class'].tolist())
        test_dataset = TextDataset(test_encodings, test_data['class'].tolist())
        n_classes = len(train_data['class'].unique())

        return train_dataset, test_dataset, n_classes

    def compute_metrics(self, pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        probs = softmax(pred.predictions, axis=1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
        accuracy = accuracy_score(labels, preds)
        auc = roc_auc_score(labels, probs, multi_class="ovo", average="weighted") if self.n_classes > 2 else roc_auc_score(labels, probs[:, 1])
        return {
            "accuracy": accuracy,
            "f1": f1,
            "precision": precision,
            "recall": recall,
            "auc": auc
        }

    def train_and_evaluate(self):
        self.trainer.train()
        eval_results = self.trainer.evaluate()
        return eval_results

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

train_path = 'data/original/cr/train.csv'
test_path = 'data/original/cr/test.csv'

classifier = TextClassifier(
    model_name='distilbert-base-uncased',
    train_path=train_path,
    test_path=test_path,
    training_args=training_args
)

eval_results = classifier.train_and_evaluate()
print(eval_results)



In [13]:
import pandas as pd
import numpy as np
import os
import shutil
from sklearn.preprocessing import LabelEncoder
from scipy.special import softmax
from sklearn.model_selection import train_test_split
#from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

#set seed to 100 
np.random.seed(100)



# disable wandb
os.environ["WANDB_DISABLED"] = "true"

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
class MyTrainer(Trainer):
    def get_train_dataloader(self):
        return self.train_dataloader

    def get_eval_dataloader(self, eval_dataset=None):
        return self.eval_dataloader

class BERT:
    def __init__(self, train_path, test_path, trainings_arguments: TrainingArguments, model_name='distilbert-base-uncased'):
        # Define collate_fn
        def collate_fn(batch):
            keys = batch[0].keys()
            output_batch = {}
            for key in keys:
                items = [item[key] for item in batch]
                if isinstance(items[0], torch.Tensor):
                    output_batch[key] = torch.stack(items)
                else:
                    output_batch[key] = torch.tensor(items)
            return output_batch


        
        self.model_name = model_name
        self.n_runs = None
        self.best_val_loss = float('inf')
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.trainings_arguments = trainings_arguments        
        self.train_dataset, self.val_dataset, self.test_dataset, self.n_classes = self.prepare_dataset(train_path, test_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=self.n_classes)
        self.compute_metrics_func = self.compute_metrics
        
        # Create the DataLoaders
        self.train_dataloader = DataLoader(self.train_dataset, batch_size=self.trainings_arguments.per_device_train_batch_size, shuffle=True, collate_fn=collate_fn)
        self.val_dataloader = DataLoader(self.val_dataset, batch_size=self.trainings_arguments.per_device_eval_batch_size, shuffle=False, collate_fn=collate_fn)
        self.test_dataloader = DataLoader(self.test_dataset, batch_size=self.trainings_arguments.per_device_eval_batch_size, shuffle=False, collate_fn=collate_fn)
        
        self.trainer = MyTrainer(
            model=self.model,
            args=self.trainings_arguments,
            compute_metrics=self.compute_metrics_func)
        
        self.trainer.train_dataloader = self.train_dataloader
        self.trainer.eval_dataloader = self.val_dataloader

    
   

    def prepare_dataset(self, train_path, test_path):
        train_data = pd.read_csv(train_path).sample(frac=1).reset_index(drop=True)
        test_data = pd.read_csv(test_path).sample(frac=1).reset_index(drop=True)
        n_classes = len(train_data['class'].unique())        
        # encode the labels
        encoder = LabelEncoder()
        train_data['class'] = encoder.fit_transform(train_data['class'])
        test_data['class'] = encoder.transform(test_data['class'])
        # Remove rows with missing or invalid 'text' values
        train_data = train_data[train_data['text'].apply(lambda x: isinstance(x, str))]
        test_data = test_data[test_data['text'].apply(lambda x: isinstance(x, str))]        
        
        # Split train_data into train and validation sets
        train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42, stratify=train_data['class'])

        # tokenize the text
        train_encodings = self.tokenizer(train_data['text'].tolist(), truncation=True, padding=True)
        val_encodings = self.tokenizer(val_data['text'].tolist(), truncation=True, padding=True)
        test_encodings = self.tokenizer(test_data['text'].tolist(), truncation=True, padding=True)
        # create dataset
        train_dataset = CustomDataset(train_encodings, train_data['class'].tolist())
        val_dataset = CustomDataset(val_encodings, val_data['class'].tolist())
        test_dataset = CustomDataset(test_encodings, test_data['class'].tolist())
        return train_dataset, val_dataset, test_dataset, n_classes


    def compute_metrics(self,pred):
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        probs = softmax(pred.predictions, axis=1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
        accuracy = accuracy_score(labels, preds)
        
        # Calculate AUC
        if len(np.unique(labels)) > 2:  # Multi-class case
            auc = roc_auc_score(labels, probs, multi_class="ovo", average="weighted")
        else:  # Binary case
            auc = roc_auc_score(labels, probs[:, 1])  # Use the probability of the positive class

        return {
            "accuracy": accuracy,
            "f1": f1,
            "precision": precision,
            "recall": recall,
            "auc": auc
        }
    
    def train_and_evaluate(self, run_idx, dataset_name):
        print(f'Run {run_idx} of {self.n_runs}')
        # Load initial model state
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=self.n_classes)
        self.trainer.model = self.model
        self.trainer.train()
        self.trainer.save_model(os.path.join("models", "bert", f"{dataset_name}_run_{run_idx}_model"))
        results = self.trainer.evaluate()
        if results['eval_loss'] < self.best_val_loss:
            self.best_val_loss = results['eval_loss']
            self.trainer.save_model(os.path.join("models", "bert", f"{dataset_name}_best_model"))
            
        return results


    def clean_up_models(self, dataset_name):
        for i in range(self.n_runs):
            shutil.rmtree(f"models/bert/{dataset_name}_run_{i+1}_model")

    def calculate_and_save_averages(self, res_dict, dataset_name):
        avg_dict = {metric: round(sum(values[metric] for values in res_dict.values()) / len(res_dict), 4) for metric in res_dict[1].keys()}
        order = ['loss', 'auc', 'f1', 'accuracy']
        filtered_metrics = {i: avg_dict[f"eval_{i}"] for i in order if f"eval_{i}" in avg_dict}
        os.makedirs("results/bert", exist_ok=True)
        with open(f"results/bert/{dataset_name}_avg_results.txt", "w") as f:
            for key, value in filtered_metrics.items():
                f.write(f"{key}: {value}\n")
        return filtered_metrics


    def run_n_times(self, dataset_name, n=3):   
        self.n_runs = n
        self.best_val_loss = float('inf')
        res_dict = {}

        for i in range(n):
            res_dict[i+1] = self.train_and_evaluate(i+1, dataset_name)
            
        self.clean_up_models(dataset_name)
        avg_metrics = self.calculate_and_save_averages(res_dict, dataset_name)

        return avg_metrics
            





In [None]:
from transformers import TrainingArguments

#dataset_list = ['trec','agnews', 'pc', 'yelp', 'cr', 'kaggle_med', 'cardio', 'bbc', 'sst2','subj']
dataset_list = ['cr']


for name in dataset_list:
    try:
        print(f'Running {name} dataset')
        train_path  = f'data/original/{name}/train.csv'
        test_path   = f'data/original/{name}/test.csv'
        model_name = 'distilbert-base-uncased'
        training_args = TrainingArguments(
            output_dir='./models/bert',
            num_train_epochs=3,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            evaluation_strategy="epoch",
            # eval_steps=100,
            logging_steps=10,
            # save_steps=0,
            logging_dir='./logs/bert',
            metric_for_best_model="f1",
            #learning_rate=2e-5,
            seed=100
        )



        bert = BERT(train_path, test_path, training_args, model_name=model_name)            
        avg_dict = bert.run_n_times(name, n=3)
        print('---------------------------------------------------')
        print(f'Average results for {name} dataset')
        print(avg_dict)
        print('---------------------------------------------------')
    except Exception as e:
        print(f'Error in {name}')
        print(str(e))
        continue


In [16]:
train_path  = f'data/original/cr/train.csv'
test_path   = f'data/original/cr/test.csv'
model_name = 'distilbert-base-uncased'
training_args = TrainingArguments(
    output_dir='./models/bert',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    # eval_steps=100,
    logging_steps=10,
    # save_steps=0,
    logging_dir='./logs/bert',
    metric_for_best_model="f1",
    learning_rate=2e-5,
    seed=100
    )

bert = BERT(train_path, test_path, training_args, model_name=model_name)
# avg_dict = bert.run_n_times(name, n=3)
# print('---------------------------------------------------')
# print(f'Average results for {name} dataset')
# print(avg_dict)
# print('---------------------------------------------------')
res = bert.train_and_evaluate(1, 'cr')
print(res)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
loading configuration file config.json from cache at /home/peyman/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "dis

Run 1 of None


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 407
  Batch size = 8
  Num examples = 407
  Batch size = 8
Saving model checkpoint to ./models/bert/checkpoint-500
Configuration saved in ./models/bert/checkpoint-500/config.json
Model weights saved in ./models/bert/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 407
  Batch size = 8
Saving model checkpoint to ./models/bert/checkpoint-1000
Configuration saved in ./models/bert/checkpoint-1000/config.json
Model weights saved in ./models/bert/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 407
  Batch size = 8
Saving model checkpoint to models/bert/cr_run_1_model
Configuration saved in models/bert/cr_run_1_model/config.json
Model weights saved in models/bert/cr_run_1_model/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 407
  Batch size = 8


Saving model checkpoint to models/bert/cr_best_model
Configuration saved in models/bert/cr_best_model/config.json
Model weights saved in models/bert/cr_best_model/pytorch_model.bin


{'eval_loss': 0.7069481611251831, 'eval_accuracy': 0.36117936117936117, 'eval_f1': 0.20340383026950193, 'eval_precision': 0.7733217088055798, 'eval_recall': 0.36117936117936117, 'eval_auc': 0.5083439287984742, 'eval_runtime': 7.0254, 'eval_samples_per_second': 57.933, 'eval_steps_per_second': 7.259, 'epoch': 3.0}
