In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Load data
data = pd.read_csv('updated_dataset.csv')

data['text'] = data['text'].astype(str).apply(lambda x: word_tokenize(x.lower()))

# Encoding labels
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Split data
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# Vocabulary building
vocab = set(word for sentence in data['text'] for word in sentence)
word_to_ix = {word: i+1 for i, word in enumerate(vocab)}
word_to_ix['<unk>'] = len(word_to_ix) + 1
word_to_ix['<pad>'] = 0

def prepare_sequence(seq, to_ix):
    return torch.tensor([to_ix.get(word, to_ix['<unk>']) for word in seq], dtype=torch.long)

# Dataset
class TextDataset(Dataset):
    def __init__(self, dataframe, word_to_ix):
        self.dataframe = dataframe
        self.word_to_ix = word_to_ix

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = prepare_sequence(self.dataframe.iloc[idx]['text'], self.word_to_ix)
        label = self.dataframe.iloc[idx]['label']
        return text, label

def collate_fn(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=word_to_ix['<pad>'])
    return texts, torch.tensor(labels)

train_dataset = TextDataset(train_data, word_to_ix)
test_dataset = TextDataset(test_data, word_to_ix)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\98993\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import torch.nn as nn
import torch.nn.functional as F

class EnhancedLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=4, bidirectional=True, dropout_rate=0.3):
        super(EnhancedLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=word_to_ix['<pad>'])
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, 
                            dropout=dropout_rate, bidirectional=bidirectional, batch_first=True)
        
        # Enhanced multi-layer attention mechanism
        self.attention1 = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, hidden_dim)
        self.attention2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.attention3 = nn.Linear(hidden_dim // 2, 1)
        
        # More dense layers for deeper feature extraction
        self.fc1 = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, hidden_dim * 4)
        self.fc2 = nn.Linear(hidden_dim * 4, hidden_dim * 2)
        self.fc3 = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout_rate)
        self.activation = nn.ReLU()

    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, _ = self.lstm(embedded)
        
        # Multi-step attention
        attention = F.relu(self.attention1(lstm_out))
        attention = F.relu(self.attention2(attention))
        attention_weights = F.softmax(self.attention3(attention), dim=1)
        attended = torch.sum(lstm_out * attention_weights.expand_as(lstm_out), dim=1)
        
        # Deep fully connected network
        out = self.dropout(self.activation(self.fc1(attended)))
        out = self.dropout(self.activation(self.fc2(out)))
        out = self.fc3(out)
        return out
# Model instantiation
vocab_size = len(word_to_ix)  # Example vocabulary size
# embedding_dim = 128
# hidden_dim = 256
output_dim = len(label_encoder.classes_)  # Number of classes
# num_layers = 2
bidirectional = True
# dropout_rate = 0.3

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate and move to device
# model = EnhancedLSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)


In [3]:
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from optuna.pruners import SuccessiveHalvingPruner
from optuna.samplers import TPESampler

def objective(trial):
    # Hyperparameters to tune
    num_layers = trial.suggest_int('num_layers', 1, 5)
    hidden_dim = trial.suggest_categorical('hidden_dim', [128, 256, 312])
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.3)
    lr = trial.suggest_loguniform('lr', 1e-3, 1e-1)
    embedding_dim = trial.suggest_int('embedding_dim', 128, 238)
    epochs = trial.suggest_int('epochs',1,7)

    # Define the model
    model = EnhancedLSTMModel(vocab_size=len(word_to_ix), embedding_dim=embedding_dim, hidden_dim=hidden_dim,
                                  output_dim=len(label_encoder.classes_), num_layers=num_layers, bidirectional=True,
                                  dropout_rate=dropout_rate).to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Train the model
    for epoch in range(epochs):  # Reduced number of epochs for each trial
        model.train()
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # Evaluate the model
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for texts, labels in test_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)

    return accuracy


In [4]:
study = optuna.create_study(sampler = TPESampler(), 
                            pruner = SuccessiveHalvingPruner(),
                            direction= "maximize")
study.optimize(objective, n_trials=150)

trial = study.best_trial
print("Best Score: ", trial.value)
print("Best Params: ")
for key, value in trial.params.items():
    print("  {}: {}".format(key, value))

# Fetch the best model
best_params = study.best_trial.params
optuna.visualization.plot_param_importances(study)

[I 2024-05-12 02:33:05,053] A new study created in memory with name: no-name-67610bc2-9b58-4fea-9237-c16704092f2b
  lr = trial.suggest_loguniform('lr', 1e-3, 1e-1)
[I 2024-05-12 02:34:23,847] Trial 0 finished with value: 0.7784421460892049 and parameters: {'num_layers': 3, 'hidden_dim': 256, 'dropout_rate': 0.24457604795740787, 'lr': 0.0013430949361229742, 'embedding_dim': 170, 'epochs': 6}. Best is trial 0 with value: 0.7784421460892049.
  lr = trial.suggest_loguniform('lr', 1e-3, 1e-1)
[I 2024-05-12 02:34:38,345] Trial 1 finished with value: 0.7947640594699418 and parameters: {'num_layers': 2, 'hidden_dim': 312, 'dropout_rate': 0.2560116986142408, 'lr': 0.0012866516605245146, 'embedding_dim': 198, 'epochs': 1}. Best is trial 1 with value: 0.7947640594699418.
  lr = trial.suggest_loguniform('lr', 1e-3, 1e-1)
[I 2024-05-12 02:35:18,487] Trial 2 finished with value: 0.8165804783451842 and parameters: {'num_layers': 2, 'hidden_dim': 256, 'dropout_rate': 0.1899015805280937, 'lr': 0.001334

Best Score:  0.8267614738202973
Best Params: 
  num_layers: 1
  hidden_dim: 128
  dropout_rate: 0.27589754228613206
  lr: 0.003929971261903612
  embedding_dim: 178
  epochs: 2


ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [16]:
best_model = EnhancedLSTMModel(vocab_size=len(word_to_ix), embedding_dim=best_params['embedding_dim'],
                                   hidden_dim=best_params['hidden_dim'], output_dim=len(label_encoder.classes_),
                                   num_layers=best_params['num_layers'], bidirectional=True,
                                   dropout_rate=best_params['dropout_rate']).to(device)
print(best_params)


{'num_layers': 1, 'hidden_dim': 128, 'dropout_rate': 0.27589754228613206, 'lr': 0.003929971261903612, 'embedding_dim': 178, 'epochs': 2}




In [17]:
optimizer = optim.Adam(best_model.parameters(), lr=best_params['lr'])
criterion = nn.CrossEntropyLoss()
for epoch in range( best_params['epochs']):  # Reduced number of epochs for each trial
        best_model.train()
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = best_model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # Evaluate the model
        best_model.eval()
        all_preds = list()
        all_labels = list()
        with torch.no_grad():
            for texts, labels in test_loader:
                texts, labels = texts.to(device), labels.to(device)
                outputs = best_model(texts)
                _, preds = torch.max(outputs, 1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        accuracy = accuracy_score(all_labels, all_preds)

        print(f'The Epoch {epoch}  ','The accuracy is  ' ,accuracy)


The Epoch 0   The accuracy is   0.811732385261797
The Epoch 1   The accuracy is   0.8211053652230123
