# USAirline

**Load data, handle class imbalance (oversampling), and train/validate the proposed model**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.layers import Input, GlobalAveragePooling1D, Dense, Dropout, LayerNormalization, LSTM, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

# Load the pre-processed US Airline reviews and drop rows with missing values.
dataset = pd.read_csv("/kaggle/input/sentiments-processed/USAirline_dataset_processed.csv")
dataset = dataset.dropna()

# Features (processed review text) and targets (sentiment label).
X = dataset['review_P']
y = dataset['sentiment']

# Fixed, sorted class list so that every split is one-hot encoded with the
# same columns in the same order (sorted order is what pd.get_dummies uses).
class_names = sorted(y.unique())

# Split BEFORE oversampling: 70% train, 15% validation, 15% test.
# Oversampling first would put copies of the same row into both the training
# and the evaluation splits, leaking training data into validation/test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=0)

# Oversample minority classes in the TRAINING split only, up to the size of
# the largest training class. Validation and test keep their natural
# class distribution.
train_frame = pd.DataFrame({'review_P': X_train, 'sentiment': y_train})
class_counts = train_frame['sentiment'].value_counts()
max_class_count = class_counts.max()
extra_samples = [
    train_frame[train_frame['sentiment'] == class_name].sample(
        n=int(max_class_count - class_count), replace=True, random_state=0
    )
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for class_name, class_count in class_counts.items()
    if class_count < max_class_count
]
train_frame = pd.concat([train_frame] + extra_samples)

# Shuffle so the appended duplicates are not grouped at the end; seeded for reproducibility.
train_frame = train_frame.sample(frac=1, random_state=0).reset_index(drop=True)
X_train = train_frame['review_P']
y_train = train_frame['sentiment']

# One-hot encode the labels. Using a Categorical with the shared class list
# guarantees identical column count/order even if a split lacks a class.
y_train = pd.get_dummies(pd.Categorical(y_train, categories=class_names), dtype='float32').values
y_valid = pd.get_dummies(pd.Categorical(y_val, categories=class_names), dtype='float32').values
y_test = pd.get_dummies(pd.Categorical(y_test, categories=class_names), dtype='float32').values

# Tokenizer and batching configuration
transformer_model = 'distilroberta-base'
seq_len = 512
batch_size = 16
tokenizer = AutoTokenizer.from_pretrained(transformer_model)


def tokenize_texts(texts):
    """Tokenize a collection of texts into fixed-length TensorFlow tensors.

    Args:
        texts: object exposing ``tolist()`` that yields the raw strings.

    Returns:
        The tokenizer output for all texts, each truncated or padded to
        exactly ``seq_len`` tokens and returned as TensorFlow tensors.
    """
    encoding_options = dict(
        max_length=seq_len,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )
    return tokenizer(texts.tolist(), **encoding_options)


# Tokenize the three splits in order: train, validation, test.
tokenized_inputs_train, tokenized_inputs_valid, tokenized_inputs_test = (
    tokenize_texts(split_texts) for split_texts in (X_train, X_val, X_test)
)
####################################################################################################################

# Pre-trained encoder plus the two symbolic inputs it consumes.
# Relies on transformer_model and seq_len being defined earlier.
encoder = TFAutoModel.from_pretrained(transformer_model)
input_ids = Input(shape=(seq_len,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(seq_len,), dtype=tf.int32, name="attention_mask")
# Element 0 of the encoder output is the sequence fed to the recurrent head.
embeddings = encoder(input_ids, attention_mask=attention_mask)[0]
####################################################################################################################
# Classification head stacked on the encoder output, applied in list order:
# BiLSTM -> dropout -> layer norm -> BiLSTM -> dropout -> average pooling
# -> dense -> dropout -> softmax over the one-hot label width.
# NOTE(review): no mask is passed to the LSTM/pooling layers, so padded
# positions appear to be included in the average — confirm this is intended.
head_layers = [
    Bidirectional(LSTM(256, return_sequences=True)),
    Dropout(0.1),
    LayerNormalization(),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    GlobalAveragePooling1D(),
    Dense(256, activation="relu"),
    Dropout(0.4),
    Dense(y_train.shape[1], activation='softmax'),
]
outputs = embeddings
for head_layer in head_layers:
    outputs = head_layer(outputs)

model = Model(inputs=[input_ids, attention_mask], outputs=outputs)
model.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss=CategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)
model.summary()
####################################################################################################################
# Pair the tokenized tensors with the one-hot labels as tf.data datasets.
feature_keys = ('input_ids', 'attention_mask')
train_dataset = tf.data.Dataset.from_tensor_slices(
    ({key: tokenized_inputs_train[key] for key in feature_keys}, y_train)
)
valid_dataset = tf.data.Dataset.from_tensor_slices(
    ({key: tokenized_inputs_valid[key] for key in feature_keys}, y_valid)
)

# Stop once validation accuracy has not improved for 4 epochs and
# roll the model back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=4, verbose=1, restore_best_weights=True)

# Train for at most 30 epochs; only the training stream is shuffled.
train_batches = train_dataset.shuffle(10000).batch(batch_size)
valid_batches = valid_dataset.batch(batch_size)
history = model.fit(
    train_batches,
    validation_data=valid_batches,
    epochs=30,
    verbose=1,
    callbacks=[early_stopping],
)

**Evaluate performance using Testing dataset**

In [None]:
# Bundle the held-out test tensors with their one-hot labels.
test_features = {
    'input_ids': tokenized_inputs_test['input_ids'],
    'attention_mask': tokenized_inputs_test['attention_mask'],
}
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, y_test))

# Score the model on the batched test set and report both values.
test_loss, test_accuracy = model.evaluate(test_dataset.batch(batch_size), verbose=1)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

**Accuracy and Loss Curve**

The metric values plotted below were recorded from the `model.fit` training log.

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics copied by hand from the model.fit training log.
# NOTE(review): these are static values; re-running training does not
# update them.
train_accuracy = [0.8945, 0.9584, 0.9761, 0.9828, 0.9871, 0.9894, 0.9904, 0.9913, 0.9923, 0.9926, 0.9934, 0.9942]
val_accuracy = [0.9523, 0.9659, 0.9751, 0.9816, 0.9833, 0.9831, 0.9850, 0.9852, 0.9845, 0.9840, 0.9847, 0.9833]
train_loss = [0.2690, 0.1210, 0.0756, 0.0565, 0.0439, 0.0349, 0.0305, 0.0270, 0.0224, 0.0222, 0.0203, 0.0171]
val_loss = [0.1351, 0.1077, 0.0856, 0.0664, 0.0656, 0.0668, 0.0638, 0.0610, 0.0662, 0.0703, 0.0677, 0.0797]

# 0-based list positions of the best validation accuracy / lowest validation loss.
best_acc_epoch = val_accuracy.index(max(val_accuracy))
lowest_loss_epoch = val_loss.index(min(val_loss))

# 1-based epoch numbers for the x-axis, so the tick values agree with the
# "Epoch N" annotations (previously the curves were drawn at 0-based
# positions while the labels were 1-based).
epochs = range(1, len(train_accuracy) + 1)

# Create figure for plotting
plt.figure(figsize=(14, 6))

# Left panel: training & validation accuracy, best validation epoch highlighted
plt.subplot(1, 2, 1)
plt.plot(epochs, train_accuracy, 'bo--', label='Training Accuracy')
plt.plot(epochs, val_accuracy, 'ro--', label='Validation Accuracy')
plt.plot(best_acc_epoch + 1, max(val_accuracy), 'ks', markersize=10, label='Best Validation Accuracy')
plt.text(best_acc_epoch + 1, max(val_accuracy), f'  Epoch {best_acc_epoch+1}\n  {max(val_accuracy):.4f}', verticalalignment='bottom')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(loc='lower right')

# Right panel: training & validation loss, lowest validation loss highlighted
plt.subplot(1, 2, 2)
plt.plot(epochs, train_loss, 'bo--', label='Training Loss')
plt.plot(epochs, val_loss, 'ro--', label='Validation Loss')
plt.plot(lowest_loss_epoch + 1, min(val_loss), 'ks', markersize=10, label='Lowest Validation Loss')
plt.text(lowest_loss_epoch + 1, min(val_loss), f'  Epoch {lowest_loss_epoch+1}\n  {min(val_loss):.4f}', verticalalignment='top')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper right')

# Adjust layout and save the figure
plt.tight_layout()
plt.savefig('/kaggle/working/Final_USAirline_curve_98.5.png', dpi=100)

# Show the plot
plt.show()

In [None]:
# Model outputs for the test set, reduced to predicted class indices;
# true class indices are recovered from the one-hot test labels.
preds = model.predict(test_dataset.batch(batch_size))
y_pred = preds.argmax(axis=1)
y_true = y_test.argmax(axis=1)

**confusion matrix**

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of true vs. predicted class indices, drawn as an
# integer-annotated heatmap.
cm = confusion_matrix(y_true, y_pred)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')

# Tighten the layout, write the figure to disk, then display it.
plt.tight_layout()
plt.savefig('/kaggle/working/Final_USAirline_Confusion_Matrix.png', dpi=100)
plt.show()

**Classification report**

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Build the classification report (4 decimal places) for the test
# predictions, then print it.
report_text = classification_report(y_true, y_pred, digits=4)
print(report_text)

**Compute ROC curve and ROC area for each class**

In [None]:
# from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import numpy as np

# Number of classes is the width of the model output, so it stays correct
# even if some class never occurs in y_true.
n_classes = preds.shape[1]

# One-vs-rest indicator matrix with one column per class. Built by direct
# comparison so it always has n_classes columns; label_binarize collapses
# the two-class case to a single column, which would break the per-class
# indexing below.
y_true_binarized = (np.asarray(y_true)[:, None] == np.arange(n_classes)).astype(int)

# Compute and draw one ROC curve (with its AUC) per class
plt.figure()
lw = 2

for i in range(n_classes):
    # Column i of preds is used as the score for "class i vs. rest".
    fpr, tpr, _ = roc_curve(y_true_binarized[:, i], preds[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc))

# Diagonal reference line (TPR == FPR)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for Multi-Class')
plt.legend(loc="lower right")
plt.tight_layout()
plt.savefig('/kaggle/working/USAirline_ROC_CURVE.png', dpi=100)
plt.show()

# IMDB

**Load data and train/validate the proposed model**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.layers import Input, GlobalAveragePooling1D, Dense, Dropout, LayerNormalization, LSTM, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

# Load the pre-processed IMDB reviews and drop rows with missing values.
dataset = pd.read_csv("/kaggle/input/sentiments-processed/imdb_dataset_processed.csv")
dataset = dataset.dropna()

# Fixed, sorted class list so that every split is one-hot encoded with the
# same columns in the same order (sorted order is what pd.get_dummies uses).
class_names = sorted(dataset['sentiment'].unique())

# Split into 70% train, 15% validation, 15% test.
X_train, X_temp, y_train, y_temp = train_test_split(dataset['review_P'], dataset['sentiment'], test_size=0.3, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# One-hot encode the labels (no balancing is applied to this dataset).
# Encoding each split independently would derive the columns from whatever
# labels happen to be present in that split; the shared Categorical keeps
# the column count and order identical across train/validation/test.
y_train = pd.get_dummies(pd.Categorical(y_train, categories=class_names), dtype='float32').values
y_valid = pd.get_dummies(pd.Categorical(y_val, categories=class_names), dtype='float32').values
y_test = pd.get_dummies(pd.Categorical(y_test, categories=class_names), dtype='float32').values

# Tokenizer and batching configuration
transformer_model = 'distilroberta-base'
seq_len = 512
batch_size = 16
tokenizer = AutoTokenizer.from_pretrained(transformer_model)


def tokenize_texts(texts):
    """Tokenize a collection of texts into fixed-length TensorFlow tensors.

    Args:
        texts: object exposing ``tolist()`` that yields the raw strings.

    Returns:
        The tokenizer output for all texts, each truncated or padded to
        exactly ``seq_len`` tokens and returned as TensorFlow tensors.
    """
    encoding_options = dict(
        max_length=seq_len,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )
    return tokenizer(texts.tolist(), **encoding_options)


# Tokenize the three splits in order: train, validation, test.
tokenized_inputs_train, tokenized_inputs_valid, tokenized_inputs_test = (
    tokenize_texts(split_texts) for split_texts in (X_train, X_val, X_test)
)

####################################################################################################################

# Pre-trained encoder plus the two symbolic inputs it consumes.
# Relies on transformer_model and seq_len being defined earlier.
encoder = TFAutoModel.from_pretrained(transformer_model)
input_ids = Input(shape=(seq_len,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(seq_len,), dtype=tf.int32, name="attention_mask")
# Element 0 of the encoder output is the sequence fed to the recurrent head.
embeddings = encoder(input_ids, attention_mask=attention_mask)[0]

####################################################################################################################

# Classification head stacked on the encoder output, applied in list order:
# BiLSTM -> dropout -> layer norm -> BiLSTM -> dropout -> average pooling
# -> dense -> dropout -> softmax over the one-hot label width.
# NOTE(review): no mask is passed to the LSTM/pooling layers, so padded
# positions appear to be included in the average — confirm this is intended.
head_layers = [
    Bidirectional(LSTM(256, return_sequences=True)),
    Dropout(0.1),
    LayerNormalization(),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    GlobalAveragePooling1D(),
    Dense(256, activation="relu"),
    Dropout(0.4),
    Dense(y_train.shape[1], activation='softmax'),
]
outputs = embeddings
for head_layer in head_layers:
    outputs = head_layer(outputs)

model = Model(inputs=[input_ids, attention_mask], outputs=outputs)
model.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss=CategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)
model.summary()
####################################################################################################################
# Pair the tokenized tensors with the one-hot labels as tf.data datasets.
feature_keys = ('input_ids', 'attention_mask')
train_dataset = tf.data.Dataset.from_tensor_slices(
    ({key: tokenized_inputs_train[key] for key in feature_keys}, y_train)
)
valid_dataset = tf.data.Dataset.from_tensor_slices(
    ({key: tokenized_inputs_valid[key] for key in feature_keys}, y_valid)
)

# Stop once validation accuracy has not improved for 4 epochs and
# roll the model back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=4, verbose=1, restore_best_weights=True)

# Train for at most 30 epochs; only the training stream is shuffled.
train_batches = train_dataset.shuffle(10000).batch(batch_size)
valid_batches = valid_dataset.batch(batch_size)
history = model.fit(
    train_batches,
    validation_data=valid_batches,
    epochs=30,
    verbose=1,
    callbacks=[early_stopping],
)

**Evaluate performance using Testing dataset**

In [None]:
# Bundle the held-out test tensors with their one-hot labels.
test_features = {
    'input_ids': tokenized_inputs_test['input_ids'],
    'attention_mask': tokenized_inputs_test['attention_mask'],
}
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, y_test))

# Score the model on the batched test set and report both values.
test_loss, test_accuracy = model.evaluate(test_dataset.batch(batch_size), verbose=1)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

**Accuracy and Loss Curve**

The metric values plotted below were recorded from the `model.fit` training log.

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics copied by hand from the model.fit training log.
# NOTE(review): these are static values; re-running training does not
# update them.
train_accuracy_updated = [0.9044, 0.9416, 0.9577, 0.9717, 0.9802, 0.9850, 0.9891]
val_accuracy_updated = [0.9336, 0.9355, 0.9416, 0.9353, 0.9376, 0.9363, 0.9364]
train_loss_updated = [0.2363, 0.1584, 0.1187, 0.0837, 0.0600, 0.0460, 0.0348]
val_loss_updated = [0.1779, 0.1709, 0.1796, 0.2104, 0.2209, 0.2361, 0.2665]

# 0-based list positions of the best validation accuracy / lowest validation loss.
best_acc_epoch_updated = val_accuracy_updated.index(max(val_accuracy_updated))
lowest_loss_epoch_updated = val_loss_updated.index(min(val_loss_updated))

# 1-based epoch numbers for the x-axis, so the tick values agree with the
# "Epoch N" annotations (previously the curves were drawn at 0-based
# positions while the labels were 1-based).
epochs_updated = range(1, len(train_accuracy_updated) + 1)

# Create figure for plotting
plt.figure(figsize=(14, 6))

# Left panel: training & validation accuracy, best validation epoch highlighted
plt.subplot(1, 2, 1)
plt.plot(epochs_updated, train_accuracy_updated, 'bo--', label='Training Accuracy')
plt.plot(epochs_updated, val_accuracy_updated, 'ro--', label='Validation Accuracy')
plt.plot(best_acc_epoch_updated + 1, max(val_accuracy_updated), 'ks', markersize=10, label='Best Validation Accuracy')
plt.text(best_acc_epoch_updated + 1, max(val_accuracy_updated), f'  Epoch {best_acc_epoch_updated+1}\n  {max(val_accuracy_updated):.4f}', verticalalignment='bottom')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(loc='lower right')

# Right panel: training & validation loss, lowest validation loss highlighted
plt.subplot(1, 2, 2)
plt.plot(epochs_updated, train_loss_updated, 'bo--', label='Training Loss')
plt.plot(epochs_updated, val_loss_updated, 'ro--', label='Validation Loss')
plt.plot(lowest_loss_epoch_updated + 1, min(val_loss_updated), 'ks', markersize=10, label='Lowest Validation Loss')
plt.text(lowest_loss_epoch_updated + 1, min(val_loss_updated), f'  Epoch {lowest_loss_epoch_updated+1}\n  {min(val_loss_updated):.4f}', verticalalignment='top')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper right')

# Adjust layout and save the figure
plt.tight_layout()
plt.savefig('/kaggle/working/imdb_acc_loss_curve_final.png', dpi=100)

plt.show()


In [None]:
# Model outputs for the test set, reduced to predicted class indices;
# true class indices are recovered from the one-hot test labels.
preds = model.predict(test_dataset.batch(batch_size))
y_pred = preds.argmax(axis=1)
y_true = y_test.argmax(axis=1)

**Confusion matrix**

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


# Confusion matrix of true vs. predicted class indices, drawn as an
# integer-annotated heatmap.
cm = confusion_matrix(y_true, y_pred)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')

# Tighten the layout, write the figure to disk, then display it.
plt.tight_layout()
plt.savefig('/kaggle/working/imdb_Confusion_Matrix.png', dpi=100)
plt.show()

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Build the classification report (4 decimal places) for the test
# predictions, then print it.
report_text = classification_report(y_true, y_pred, digits=4)
print(report_text)