<a href="https://colab.research.google.com/github/megha-uv/Syntactic-and-Semantic-based-COVID-19-Symptom-Extraction-and-Classification-from-Social-Media/blob/main/Code/Copy_of_FINALISED_MLP_BERT_NER_for_BIOE_labelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%pip install seqeval



# 1. BERT NER





In [None]:
import numpy as np
from tqdm import tqdm, trange
from transformers import BertTokenizer, BertConfig
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertForTokenClassification, AdamW
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler, DataLoader
from transformers import get_linear_schedule_with_warmup
from seqeval.metrics import accuracy_score, classification_report

In [None]:
path = '/content/drive/MyDrive/data/final__train(testing).tsv'
test = '/content/drive/MyDrive/data/final_test(testing).tsv'

Reading of Input and Labels

In [None]:
def dataToLabels(filename):
  datafile = open(filename)
  datanLabels = []
  sentence = []
  tags = []
  for line in datafile:
    if len(line)==0 or line[0]=="\n":
       if len(sentence) > 0:
         datanLabels.append(sentence)
         sentence = []
       continue
    splits = line.split('\t')
    sentence.append([splits[0],splits[-1].rstrip("\n")])

  if len(sentence) > 0:
     datanLabels.append(sentence)
     sentence = []
  return datanLabels

In [None]:
data = dataToLabels(path)

In [None]:
testData = dataToLabels(path)

Splitting of Words and Tags

In [None]:
def extractWordsNTags(data):
  sentences = [[word[0] for word in sentence] for sentence in data]
  labels = [[word[1] for word in sentence] for sentence in data]
  return sentences, labels

In [None]:
sentences, labels = extractWordsNTags(data)
Tags = list(set(labels[0]))
Tags.append('PAD')
tag2idx = {t: i for i, t in enumerate(Tags)}

In [None]:
Tags

['E', 'B', 'I', 'O', 'PAD']

In [None]:
model_path = '/content/drive/MyDrive/models/TrainedModels'

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
#tokenizer = AutoTokenizer.from_pretrained("fidukm34/biobert_v1.1_pubmed-finetuned-ner-finetuned-ner")
model = AutoModelForTokenClassification.from_pretrained(
    model_path,
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False,
    ignore_mismatched_sizes=True
)
MAX_LEN = 75
batch_size = 32

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [None]:
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences, labels)
]

In [None]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]
print(tokenized_texts)
print(labels)

[['cough', 'chest', 'pain', 'fever', 'd', '##ys', '##p', '##nea', '43', 'years', 'old', 'male', 'develop', 's', '##put', '##um', 'product', 'and', 'N', '##P', 's', '##wab', 'N', '##eg', '##ative', 'at', 'first', 'patient', 'then', 'de', '##ter', '##ior', '##ation', 'in', 'symptoms', 'with', 'my', '##al', '##ger', 'dry', 'CT', 'mild', 's', '##po', '##29', '##9', 'Con', '##s', '##cious', 'oriented', 'low', 'grade', 'no', 'di', '##ar', '##r', '##hea', 'good', 'urine', 'output', 'te', '##mp', '##38', '##7', 'BP', '130', '##80', 'CR', '##P', '##VE', 'S', '##U', '##rea', '20', 'DD', '##ime', '##r', '80', 'S', '##fer', '##rit', '##in', '24', '##low', '##1', 'P', '##ne', '##um', '##onia', 'due', 'to', 'co', '##vid', '##19', 'D', '##M', '##H', '##X', 'short', '##ness', 'of', 'breath'], ['short', '##ness', 'of', 'breath', 's', '##po', '##29', '##8', 'on', 'o', '##2', 'PR', '##10', '##5', 'chest', 'pain', 'Con', '##s', '##cious', 'oriented', 'a', '##fe', '##bri', '##le', 'soft', 'abdominal', 'cle

In [None]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

tags = pad_sequences([[tag2idx.get(l) for l in lab if tag2idx.get(l) is not None] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")



In [None]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [None]:
train_inputs, valid_inputs, train_tags, valid_tags = train_test_split(input_ids, tags,
                                                            random_state=2021, test_size=0.1)

train_masks, valid_masks, _, _ = train_test_split (attention_masks, input_ids,
                                             random_state=2021, test_size=0.1)

train_inputs = torch.tensor(train_inputs)
valid_inputs = torch.tensor(valid_inputs)
train_tags = torch.tensor(train_tags)
valid_tags = torch.tensor(valid_tags)
train_masks = torch.tensor(train_masks)
valid_masks = torch.tensor(valid_masks)

train_data = TensorDataset(train_inputs, train_masks, train_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

valid_data = TensorDataset(valid_inputs, valid_masks, valid_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
model.to(device)
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)




In [None]:
epochs = 2
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


In [None]:
epochs = 100

BERT VALIDATION

In [None]:
# Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

for epoch in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        batch_input_ids, batch_input_mask, batch_labels = batch
        # clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`
        outputs = model(batch_input_ids, token_type_ids=None,
                        attention_mask=batch_input_mask, labels=batch_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)



    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss = 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        batch_input_ids, batch_input_mask, batch_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(batch_input_ids, token_type_ids=None,
                            attention_mask = batch_input_mask, labels=batch_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = batch_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    pred_tags = [Tags[p_i] for predicted, actual in zip(predictions, true_labels)
                                 for p_i, l_i in zip(predicted, actual) if Tags[l_i] != "PAD"]
    valid_tags = [Tags[l_i] for l in true_labels
                                  for l_i in l if Tags[l_i] != "PAD"]
    print("Validation Accuracy: {}\n".format(accuracy_score(pred_tags, valid_tags)))



Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Average train loss: 0.12194202114829643


Epoch:   1%|          | 1/100 [00:28<47:29, 28.79s/it]

Validation loss: 0.008169398111931514
Validation Accuracy: 0.9978010606648557

Average train loss: 0.0028724360377322335


Epoch:   2%|▏         | 2/100 [00:57<47:10, 28.88s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002209248589197694


Epoch:   3%|▎         | 3/100 [01:25<45:41, 28.26s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0022372022016069796


Epoch:   4%|▍         | 4/100 [01:53<45:03, 28.16s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002360811694576542


Epoch:   5%|▌         | 5/100 [02:21<44:25, 28.06s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0025526331031441632


Epoch:   6%|▌         | 6/100 [02:48<43:45, 27.93s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0020416128313894767


Epoch:   7%|▋         | 7/100 [03:16<43:18, 27.94s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002424846365208984


Epoch:   8%|▊         | 8/100 [03:44<42:47, 27.90s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002139594378925668


Epoch:   9%|▉         | 9/100 [04:12<42:22, 27.94s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002295059035104179


Epoch:  10%|█         | 10/100 [04:40<41:46, 27.85s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0021200554722728315


Epoch:  11%|█         | 11/100 [05:07<41:12, 27.78s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002227193189302103


Epoch:  12%|█▏        | 12/100 [05:35<40:41, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002007483608287978


Epoch:  13%|█▎        | 13/100 [06:03<40:13, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0019977341537870574


Epoch:  14%|█▍        | 14/100 [06:31<39:45, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002050913355025627


Epoch:  15%|█▌        | 15/100 [06:58<39:18, 27.75s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002338503411224605


Epoch:  16%|█▌        | 16/100 [07:26<38:51, 27.75s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002213026482529719


Epoch:  17%|█▋        | 17/100 [07:54<38:22, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0024641274912402663


Epoch:  18%|█▊        | 18/100 [08:22<37:54, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0024459510566564


Epoch:  19%|█▉        | 19/100 [08:49<37:26, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002293506727523088


Epoch:  20%|██        | 20/100 [09:17<36:58, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0021413432661895916


Epoch:  21%|██        | 21/100 [09:45<36:31, 27.73s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0023368606583543997


Epoch:  22%|██▏       | 22/100 [10:12<36:03, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002312631087774849


Epoch:  23%|██▎       | 23/100 [10:40<35:40, 27.80s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0021638590487247497


Epoch:  24%|██▍       | 24/100 [11:09<35:20, 27.90s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002242211790443809


Epoch:  25%|██▌       | 25/100 [11:36<34:49, 27.86s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.001922485114311498


Epoch:  26%|██▌       | 26/100 [12:04<34:18, 27.82s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0022063892709625445


Epoch:  27%|██▋       | 27/100 [12:32<33:49, 27.80s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0024799140883260407


Epoch:  28%|██▊       | 28/100 [13:00<33:20, 27.78s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002569700558738345


Epoch:  29%|██▉       | 29/100 [13:27<32:51, 27.77s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0024556315573350087


Epoch:  30%|███       | 30/100 [13:55<32:24, 27.78s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.00212653377050575


Epoch:  31%|███       | 31/100 [14:23<31:56, 27.78s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0022356677590698168


Epoch:  32%|███▏      | 32/100 [14:51<31:28, 27.78s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0021463919448947042


Epoch:  33%|███▎      | 33/100 [15:18<31:00, 27.77s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002534769948825295


Epoch:  34%|███▍      | 34/100 [15:46<30:31, 27.75s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0021074754301533927


Epoch:  35%|███▌      | 35/100 [16:14<30:03, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0024924558263806186


Epoch:  36%|███▌      | 36/100 [16:42<29:35, 27.75s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002094737601617132


Epoch:  37%|███▋      | 37/100 [17:09<29:08, 27.76s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002322065108541317


Epoch:  38%|███▊      | 38/100 [17:37<28:41, 27.76s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0021329761199440863


Epoch:  39%|███▉      | 39/100 [18:05<28:13, 27.76s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0021336241501130108


Epoch:  40%|████      | 40/100 [18:33<27:45, 27.76s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0023184873652473256


Epoch:  41%|████      | 41/100 [19:00<27:17, 27.76s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0021566342449445717


Epoch:  42%|████▏     | 42/100 [19:28<26:49, 27.75s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0024042975671952314


Epoch:  43%|████▎     | 43/100 [19:56<26:21, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002449037517448339


Epoch:  44%|████▍     | 44/100 [20:24<25:53, 27.75s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002117212508250411


Epoch:  45%|████▌     | 45/100 [20:51<25:26, 27.76s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.00239800368923598


Epoch:  46%|████▌     | 46/100 [21:19<24:58, 27.75s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0022953418558566176


Epoch:  47%|████▋     | 47/100 [21:47<24:30, 27.75s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0022686417937888216


Epoch:  48%|████▊     | 48/100 [22:15<24:02, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0020796032053219866


Epoch:  49%|████▉     | 49/100 [22:42<23:34, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0022240634429823223


Epoch:  50%|█████     | 50/100 [23:10<23:07, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002099274085006019


Epoch:  51%|█████     | 51/100 [23:38<22:38, 27.72s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0023161380899289403


Epoch:  52%|█████▏    | 52/100 [24:06<22:11, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0022315321732415162


Epoch:  53%|█████▎    | 53/100 [24:33<21:45, 27.77s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0019404136615134228


Epoch:  54%|█████▍    | 54/100 [25:01<21:17, 27.77s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0021586618514608264


Epoch:  55%|█████▌    | 55/100 [25:29<20:48, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002056826316487893


Epoch:  56%|█████▌    | 56/100 [25:56<20:19, 27.72s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002417334432453465


Epoch:  57%|█████▋    | 57/100 [26:24<19:51, 27.72s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0023073744296733126


Epoch:  58%|█████▊    | 58/100 [26:52<19:24, 27.73s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002102317565432339


Epoch:  59%|█████▉    | 59/100 [27:20<18:57, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002093896709351718


Epoch:  60%|██████    | 60/100 [27:48<18:30, 27.76s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002294027835031396


Epoch:  61%|██████    | 61/100 [28:15<18:02, 27.77s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002162001590049305


Epoch:  62%|██████▏   | 62/100 [28:43<17:35, 27.77s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0022443934944663744


Epoch:  63%|██████▎   | 63/100 [29:11<17:07, 27.77s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002139708480064976


Epoch:  64%|██████▍   | 64/100 [29:39<16:39, 27.77s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0022090618615894686


Epoch:  65%|██████▌   | 65/100 [30:06<16:12, 27.78s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002629576293282512


Epoch:  66%|██████▌   | 66/100 [30:34<15:44, 27.77s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.00201899391686311


Epoch:  67%|██████▋   | 67/100 [31:02<15:16, 27.76s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0023563663532567103


Epoch:  68%|██████▊   | 68/100 [31:30<14:48, 27.75s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002207877523580785


Epoch:  69%|██████▉   | 69/100 [31:57<14:19, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002093125089766759


Epoch:  70%|███████   | 70/100 [32:25<13:52, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002070302792965188


Epoch:  71%|███████   | 71/100 [32:53<13:24, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0023723289542591803


Epoch:  72%|███████▏  | 72/100 [33:21<12:56, 27.75s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0021442425790797535


Epoch:  73%|███████▎  | 73/100 [33:48<12:29, 27.75s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0021567920531892657


Epoch:  74%|███████▍  | 74/100 [34:16<12:01, 27.77s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0022512369041043498


Epoch:  75%|███████▌  | 75/100 [34:44<11:34, 27.78s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002262957397975055


Epoch:  76%|███████▌  | 76/100 [35:12<11:06, 27.77s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0029365649144678816


Epoch:  77%|███████▋  | 77/100 [35:39<10:38, 27.77s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0025336133942747596


Epoch:  78%|███████▊  | 78/100 [36:07<10:10, 27.76s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.00224205686519032


Epoch:  79%|███████▉  | 79/100 [36:35<09:42, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0023372329729574952


Epoch:  80%|████████  | 80/100 [37:03<09:14, 27.72s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0022428821423946988


Epoch:  81%|████████  | 81/100 [37:30<08:46, 27.71s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0021091990184846938


Epoch:  82%|████████▏ | 82/100 [37:58<08:19, 27.72s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002674535018741153


Epoch:  83%|████████▎ | 83/100 [38:26<07:51, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002295009201642482


Epoch:  84%|████████▍ | 84/100 [38:54<07:23, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0022544003727756824


Epoch:  85%|████████▌ | 85/100 [39:21<06:56, 27.74s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0022769198699012015


Epoch:  86%|████████▌ | 86/100 [39:49<06:28, 27.72s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002510056717095294


Epoch:  87%|████████▋ | 87/100 [40:17<06:00, 27.72s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0024276847851818755


Epoch:  88%|████████▊ | 88/100 [40:44<05:32, 27.72s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0024909204807687167


Epoch:  89%|████████▉ | 89/100 [41:12<05:04, 27.72s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002024984553024885


Epoch:  90%|█████████ | 90/100 [41:40<04:37, 27.73s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0020425278635229915


Epoch:  91%|█████████ | 91/100 [42:08<04:09, 27.75s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002086645342334283


Epoch:  92%|█████████▏| 92/100 [42:35<03:42, 27.76s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.001984264755532529


Epoch:  93%|█████████▎| 93/100 [43:03<03:14, 27.76s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002300292136222946


Epoch:  94%|█████████▍| 94/100 [43:31<02:46, 27.75s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0024020430201370105


Epoch:  95%|█████████▌| 95/100 [43:59<02:18, 27.73s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002231572769661023


Epoch:  96%|█████████▌| 96/100 [44:26<01:50, 27.72s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.002241825838556157


Epoch:  97%|█████████▋| 97/100 [44:54<01:23, 27.72s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0020927787109565757


Epoch:  98%|█████████▊| 98/100 [45:22<00:55, 27.71s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0020920014709083583


Epoch:  99%|█████████▉| 99/100 [45:49<00:27, 27.72s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336

Average train loss: 0.0023858255710151486


Epoch: 100%|██████████| 100/100 [46:17<00:00, 27.78s/it]

Validation loss: 0.010114502374563017
Validation Accuracy: 0.9972836631742336






In [None]:
model.save_pretrained('/content/drive/MyDrive/models/TrainedModels')

In [None]:
test = "/content/drive/MyDrive/data/final_test(testing).tsv"

READING OF DATA

In [None]:
def testDatatoSentences(dataPath):
    fileContents = open(dataPath)
    sentenceCollection = []
    sentenceTags = []
    tagCollection = []
    flag = 1
    for line in fileContents:
      if flag == 1:
        firstSplit = line.split('\t')
        Word = firstSplit[0]
        tag = firstSplit[-1].rstrip('\n')
        sentenceTags.append(tag)
        flag = 0
        continue
      if line == '\n':
        sentenceCollection.append(Word)
        tagCollection.append(sentenceTags)
        sentenceTags = []
        flag = 1
        continue
      splitted = line.split('\t')
      subsqWord = splitted[0]
      tag = splitted[-1].rstrip('\n')
      Word = Word + " " + subsqWord
      sentenceTags.append(tag)

    return sentenceCollection, tagCollection


In [None]:
testSentences, testLabels = testDatatoSentences(test)
print(testSentences)

['cough fever dyspnea 52 years female spo273 CT 70 pr 83 conscious oriented sever1 shortness of breath', '\n years female spo294 o2 pr 86 BP 12070 conscious oriented mod1', '\n conscious oriented spo298 O2 PR 79 mod1', '\n dyspnea conscious oriented no cough spo288 pr 103 o2 RBS 560 mod1', '\n conscious oriented no cough afebrile dyspnea spo294 low1 shortness of breath', '\n', 'dyspnea 75 years male spo283 o2 CT positive conscious oriented mild tachypnea mod1', '\n conscious oriented spo285 o2 no dyspnea mild tachypnea mod1', '\n chest pain dyspnea conscious oriented spo287 o2 good urine output mod1 shortness of breath', '\n', 'sleep spo295 o2 pr 88 low1', '\n conscious oriented spo290 low1', '\n cough dyspnea 45 years female spo294 o2 CT positive DLOC afebrile mod1 shortness of breath', '\n cough Headache dyspnea 52 years male spo280 CT positive conscious oriented tachypnea pallor afebrile headache abdominal distension BP 13090 sever1', '\n dyspnea unconscious spo295 o2 pr 106 RBS 44 

TOKENIZATION HANDLING

In [None]:
tokenCollection = []
labelCollection = []
test_loss = 0
model.to(device)
for testSentence in testSentences:
  tokenized_sentence = tokenizer.encode(testSentence)
  input_ids = torch.tensor([tokenized_sentence]).to(device)
  with torch.no_grad():
     output = model(input_ids)
  label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

  # join bpe split tokens
  tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
  test_loss += output[0].mean().item()
  new_tokens, new_labels = [], []
  for token, label_idx in zip(tokens, label_indices[0]):
    if token == '[CLS]' or token == '[SEP]':
      continue
    if token.startswith("##"):
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(Tags[label_idx])
        new_tokens.append(token)
  tokenCollection.append(new_tokens)
  labelCollection.append(new_labels)


In [None]:
print("{}\n{}\n{}".format(testLabels[0], labelCollection[0], tokenCollection[0]))

['B', 'B', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'E']
['B', 'B', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'E']
['cough', 'fever', 'dyspnea', '52', 'years', 'female', 'spo273', 'CT', '70', 'pr', '83', 'conscious', 'oriented', 'sever1', 'shortness', 'of', 'breath']


In [None]:
for predictionTag, token in zip(labelCollection[0],tokenCollection[0]):
  print("{}\t{}\n".format(predictionTag,token))

B	cough

B	fever

B	dyspnea

O	52

O	years

O	female

O	spo273

O	CT

O	70

O	pr

O	83

O	conscious

O	oriented

O	sever1

B	shortness

I	of

E	breath



WRITING OF BERT'S CLINICAL TEXT OUTPUT INTO TXT FILE

In [None]:

with open("output.txt", "w") as file:
    # Loop through each sentence
    for predictionTag, token in zip(labelCollection,tokenCollection):
        # Loop through the predictions and words for each sentence
        for predictionTag1, word in zip(predictionTag,token):
            # Write the formatted output to the file
            file.write("{}\t{}\n".format(predictionTag1, word))
        # Add a newline to separate sentences
        file.write("\n")


BERT'S SOCIAL MEDIA PART

In [None]:
def testDatatoSentences1(dataPath):
    fileContents = open(dataPath)
    sentenceCollection = []
    sentenceTags = []
    tagCollection = []
    flag = 1
    for line in fileContents:
      if flag == 1:
        firstSplit = line.split('\t')
        Word = firstSplit[0]
        tag = firstSplit[-1].rstrip('\n')
        sentenceTags.append(tag)
        flag = 0
        continue
      if line == '\n':
        sentenceCollection.append(Word)
        tagCollection.append(sentenceTags)
        sentenceTags = []
        flag = 1
        continue
      splitted = line.split('\t')
      subsqWord = splitted[0]
      tag = splitted[-1].rstrip('\n')
      Word = Word + " " + subsqWord
      sentenceTags.append(tag)

    return sentenceCollection, tagCollection


In [None]:
import pandas as pd
# Example usage:
test = '/content/output_1.tsv'
testsentences1, testlabels1 = testDatatoSentences1(test)
print(testsentences1)


['cough chest pain developed a dry that was pretty aggressive and uncontrollable at times It subsided for bit then came back later in January with the same fits couple days transformed into more mucusy oneI have had other random symptoms come go past month too stiff neck drowsiness wheezing also still persists headaches primarily\n My cough shortness of breath been present little over 3 weeks now which has taken toll on going to gym being able do any physical activity matter', '\n feverish On Sunday night I felt and my body aches', '\n fever headache she never had any severe symptoms just a and felt weird By evening i was feeling like achey cold numbness in my face mouthi woke up with scratchy hot throat', '\n cough feverish headache I am in so much pain worst of my life sweaty heartburn exhausted but cant sleep', '\n fever sore throat headache Day 1 scratchy 2 Massive mild Low grade Positive COVID test 3 no tingling and prickling in legs that kept me awake kind of like restless leg sy

TOKENIZATION HANDLING

In [None]:
tokenCollection1 = []
labelCollection1 = []
test_loss = 0
model.to(device)
for testSentence in testsentences1:
  tokenized_sentence = tokenizer.encode(testSentence)
  input_ids = torch.tensor([tokenized_sentence]).to(device)
  with torch.no_grad():
     output = model(input_ids)
  label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

  # join bpe split tokens
  tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
  test_loss += output[0].mean().item()
  new_tokens, new_labels = [], []
  for token, label_idx in zip(tokens, label_indices[0]):
    if token == '[CLS]' or token == '[SEP]':
      continue
    if token.startswith("##"):
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(Tags[label_idx])
        new_tokens.append(token)
  tokenCollection1.append(new_tokens)
  labelCollection1.append(new_labels)



In [None]:
for predictionTag, token in zip(labelCollection1[0],tokenCollection1[0]):
  print("{}\t{}\n".format(predictionTag,token))

B	cough

B	chest

E	pain

O	developed

O	a

O	dry

O	that

O	was

O	pretty

O	aggressive

O	and

O	uncontrollable

O	at

O	times

O	It

O	subsided

O	for

O	bit

O	then

O	came

O	back

O	later

O	in

O	January

O	with

O	the

O	same

O	fits

O	couple

O	days

O	transformed

O	into

O	more

O	mucusy

O	oneI

O	have

O	had

O	other

O	random

O	symptoms

O	come

O	go

O	past

O	month

O	too

O	stiff

O	neck

O	drowsiness

O	wheezing

O	also

O	still

O	persists

B	headaches

E	primarily

E	My

E	cough

B	shortness

B	of

E	breath

E	been

E	present

E	little

E	over

O	3

O	weeks

O	now

O	which

O	has

O	taken

O	toll

O	on

O	going

O	to

O	gym

O	being

O	able

O	do

O	any

O	physical

O	activity

O	matter



In [None]:

with open("output_BERT.txt", "w") as file:
    # Loop through each sentence
    for predictionTag, token in zip(labelCollection1,tokenCollection1):
        # Loop through the predictions and words for each sentence
        for predictionTag1, word in zip(predictionTag,token):
            # Write the formatted output to the file
            file.write("{}\t{}\n".format(predictionTag1, word))
        # Add a newline to separate sentences
        file.write("\n")


PERFORMANCE EVALUVATION OF BERT

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd

# Load data from TSV files
ground_truth_df = pd.read_csv("/content/final_test(testing).tsv", sep="\t", names=['Text', 'NER_Tag'])
predicted_df = pd.read_csv("/content/output.txt", sep="\t", names=['NER_Tag_BERT','Text_BERT'])

# Assuming both files have the same number of rows and same order of data points

# Extract labels from dataframes
y_true = ground_truth_df["NER_Tag"]
y_pred = predicted_df["NER_Tag_BERT"]

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)

# Calculate precision
precision = precision_score(y_true, y_pred, average='weighted')

# Calculate recall
recall = recall_score(y_true, y_pred, average='weighted')

# Calculate F1 score
f1 = f1_score(y_true, y_pred, average='weighted')

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.9879217408664404
Precision: 0.9863349389706604
Recall: 0.9879217408664404
F1 Score: 0.9868135253301564


MULTI LAYER PERCEPTRON FOR CLINICAL TEXT

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Load the TSV training dataset
train_file_path = '/content/final__train(testing).tsv'
train_data = pd.read_csv(train_file_path, sep='\t', header=None, names=['Text', 'NER_Tag'])

# Remove NaN values and clean up labels in training data
train_data = train_data.dropna()
train_data['NER_Tag'] = train_data['NER_Tag'].str.strip()

# Extract features and labels
train_texts = train_data['Text'].tolist()
train_ner_tags = train_data['NER_Tag'].tolist()

# Convert NER tags to numerical labels using LabelEncoder
label_encoder = LabelEncoder()
encoded_train_labels = label_encoder.fit_transform(train_ner_tags)

# Create the CountVectorizer for training data
vectorizer = CountVectorizer()
train_texts_vectorized = vectorizer.fit_transform(train_texts).toarray()

# Convert data to PyTorch tensors
train_texts_tensor = torch.tensor(train_texts_vectorized, dtype=torch.float32)
train_labels_tensor = torch.tensor(encoded_train_labels, dtype=torch.long)

# Create DataLoader for training data
train_dataset = TensorDataset(train_texts_tensor, train_labels_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define the MLP model
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, hidden_size3)
        self.fc4 = nn.Linear(hidden_size3, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        x = self.softmax(x)
        return x

# Adjust hidden layer sizes based on your choice
model = MLP(input_size=train_texts_tensor.shape[1], hidden_size1=200, hidden_size2=100, hidden_size3=40, output_size=len(label_encoder.classes_))

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10
train_losses = []  # Store training loss values across epochs
for epoch in range(epochs):
    epoch_loss = 0.0  # Initialize loss for each epoch
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

# Load the TSV test dataset
test_file_path = '/content/final_test(testing).tsv'
test_data = pd.read_csv(test_file_path, sep='\t', header=None, names=['Text', 'NER_Tag'])

# Remove NaN values and clean up labels in test data
test_data = test_data.dropna()
test_data['NER_Tag'] = test_data['NER_Tag'].str.strip()

# Extract features and labels for test data
test_texts = test_data['Text'].tolist()
test_ner_tags = test_data['NER_Tag'].tolist()

# Convert NER tags to numerical labels using LabelEncoder
encoded_test_labels = label_encoder.transform(test_ner_tags)

# Vectorize the test data
test_texts_vectorized = vectorizer.transform(test_texts).toarray()
test_texts_tensor = torch.tensor(test_texts_vectorized, dtype=torch.float32)
test_labels_tensor = torch.tensor(encoded_test_labels, dtype=torch.long)

# Create DataLoader for the test set
test_dataset = TensorDataset(test_texts_tensor, test_labels_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Sample input sentence
input_sentence = "cough fever dyspnea 52 years female spo273 CT 70 pr 83 conscious oriented sever1 chest pain shortness of breath"

# Preprocess the sentence
input_words = input_sentence.split()
input_words_vectorized = vectorizer.transform(input_words).toarray()
input_words_tensor = torch.tensor(input_words_vectorized, dtype=torch.float32)

# Ensure the model is in evaluation mode
model.eval()

# Make predictions for each word in the input sentence
with torch.no_grad():
    output = model(input_words_tensor)
    _, predicted_labels = torch.max(output, 1)

# Convert the predicted labels back to the original NER tags
predicted_tags = label_encoder.inverse_transform(predicted_labels.numpy())

# Display results
for predictionTag1, word in zip(predicted_tags, input_words):
    print("{}\t{}\n".format(predictionTag1, word))

# Evaluate model on the test set and print classification report
model.eval()

# Evaluate model on the test set
predictions = []
true_labels = []

# Make predictions for each batch in the test dataset
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        output = model(batch_X)
        _, predicted_labels = torch.max(output, 1)
        predictions.extend(predicted_labels.tolist())
        true_labels.extend(batch_y.tolist())

# Convert the predicted labels back to the original NER tags
predicted_tags = label_encoder.inverse_transform(predictions)

# Convert true labels to the original NER tags
true_tags = label_encoder.inverse_transform(true_labels)

# Calculate precision, recall, F1-score, and accuracy
precision = precision_score(true_tags, predicted_tags, average='weighted')
recall = recall_score(true_tags, predicted_tags, average='weighted')
f1 = f1_score(true_tags, predicted_tags, average='weighted')
accuracy = accuracy_score(true_tags, predicted_tags)

# Display the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


B	cough

B	fever

B	dyspnea

O	52

O	years

O	female

O	spo273

O	CT

O	70

O	pr

O	83

O	conscious

O	oriented

O	sever1

B	chest

E	pain

B	shortness

O	of

E	breath

Accuracy: 0.9697544420043921
Precision: 0.9717649204383573
Recall: 0.9697544420043921
F1-score: 0.9698604536587999


WRITING MLP'S OUTPUT INTO TEXT FILE

In [None]:
def mlp_output(test_text):
    model.eval()
    output_string = ""
    words = test_text.split()
    # Make predictions for each word in the sentence
    for word in words:
        # Preprocess the word
        input_word_vectorized = vectorizer.transform([word]).toarray()
        input_word_tensor = torch.tensor(input_word_vectorized, dtype=torch.float32)
        # Make prediction for the word
        with torch.no_grad():
            output = model(input_word_tensor)
            _, predicted_label = torch.max(output, 1)
        # Convert the predicted label back to the original NER tag
        predicted_tag = label_encoder.inverse_transform(predicted_label.numpy()).tolist()
        # Append the output to the string
        output_string += "{}\t{}\n".format(predicted_tag[0], word)
    return output_string

# Open a text file for writing the output
with open("output_file.txt", "w") as file:
    # Iterate through test data
    for test_sentence in test_texts:
        # Get the output string for the current sentence
        output_string = mlp_output(test_sentence)
        # Write the output to the file
        file.write(output_string)
        # Add a newline after each test sentence
        file.write("\n")

MLP'S SOCIAL MEDIA PART

In [None]:
import pandas as pd

# Load your new dataset from an Excel file
new_dataset_file_path_xlsx = '/content/drive/MyDrive/data/socialmedia.xlsx'
new_dataset_xlsx = pd.read_excel(new_dataset_file_path_xlsx, sheet_name='Sheet1')

# Ensure the model is in evaluation mode
model.eval()

for index, row in new_dataset_xlsx.iterrows():
    sentence = row['socialmedia']

    # Process each word individually
    for word in sentence.split():
        # Remove punctuation from the word
        word = ''.join(char for char in word if char.isalnum())
        # Vectorize the input word using the same CountVectorizer
        input_word_vectorized = vectorizer.transform([word]).toarray()
        input_word_tensor = torch.tensor(input_word_vectorized, dtype=torch.float32)

        # Make predictions for the input word
        with torch.no_grad():
            output = model(input_word_tensor)
            _, predicted_label = torch.max(output, 1)

        # Convert the predicted label back to the original NER tag
        predicted_tag = label_encoder.inverse_transform(predicted_label.numpy())

        # Display result for the current word
        print("{}\t{}".format(predicted_tag[0], word))

    print()  # Add a newline to separate outputs for different sentences




O	developed
O	a
O	dry
B	cough
O	that
O	was
O	pretty
O	aggressive
O	and
O	uncontrollable
O	at
O	times
O	It
O	subsided
O	for
O	a
O	bit
O	then
O	came
O	back
O	later
O	in
O	January
O	with
O	the
O	same
O	aggressive
O	fits
O	for
O	a
O	couple
O	days
O	then
O	transformed
O	into
O	a
O	more
O	mucusy
O	oneI
O	have
O	had
O	other
O	random
O	symptoms
O	come
O	and
O	go
O	in
O	the
O	past
O	month
O	too
O	stiff
O	neck
O	drowsiness
O	wheezing
O	also
O	still
O	persists
B	chest
E	pain
O	and
O	headaches
O	primarilyMy
O	coughshortness
O	of
E	breath
O	have
O	been
O	present
O	for
O	a
O	little
O	over
O	3
O	weeks
O	now
O	which
O	has
O	taken
O	a
O	toll
O	on
O	going
O	to
O	the
O	gym
O	and
O	being
O	able
O	to
O	do
O	any
O	physical
O	activity
O	for
O	that
O	matter

O	On
O	Sunday
O	night
O	I
O	felt
O	feverish
O	and
O	my
O	body
O	ached

O	she
O	never
O	had
O	any
O	severe
O	symptoms
O	just
O	a
O	headache
O	and
O	felt
O	weirdBy
O	evening
O	i
O	was
O	feeling
O	like
O	i
O	had
O	a
B	fever
O	achey
O	cold
O	headache
O	numbne

In [None]:
import re

def extract_words(sentence):
    # Use regular expression to find alphanumeric words
    words = re.findall(r'\b\w+\b', sentence)
    return words

# Define the file path for the output text file
output_file_path = "output_file1.txt"

# Open the text file for writing
with open(output_file_path, "w") as output_file:
    # Iterate through the dataset
    for index, row in new_dataset_xlsx.iterrows():
        sentence = row['socialmedia']
        symptom_output = ""  # String to hold symptom words and their predictions
        rest_output = ""     # String to hold non-symptom words and their predictions

        # Process each word individually
        for word in extract_words(sentence):  # Use the extract_words function here
            # Vectorize the input word using the same CountVectorizer
            input_word_vectorized = vectorizer.transform([word]).toarray()
            input_word_tensor = torch.tensor(input_word_vectorized, dtype=torch.float32)

            # Make predictions for the input word
            with torch.no_grad():
                output = model(input_word_tensor)
                _, predicted_label = torch.max(output, 1)

            # Convert the predicted label back to the original NER tag
            predicted_tag = label_encoder.inverse_transform(predicted_label.numpy())

            # Append the output to the appropriate string based on whether the word is a symptom word or not
            if predicted_tag[0].startswith(("B", "I", "E")):  # Check if the predicted tag starts with "B", "I", or "E"
                symptom_output += "{}\t{}\n".format(predicted_tag[0], word)
            else:
                rest_output += "{}\t{}\n".format(predicted_tag[0], word)

        # Write symptom words first, then rest of the words into the file
        output_file.write(symptom_output + rest_output)
        output_file.write("\n")  # Add a newline to separate outputs for different sentences

# Print message after writing the output
print("Output written to:", output_file_path)


Output written to: output_file1.txt


INTGERATION OF CLINICAL TEXT'S OUTPUT

In [None]:
from tabulate import tabulate

# Function to read words with B, I, or E tags from a file
def read_words_with_bie_tags(file_path):
    words = []
    tags = []
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 2:  # Ensure the line has the expected format
                tag, word = parts
                words.append(word)
                tags.append(tag)
    return words, tags

# Function to determine the final tag based on SYM1 and SYM2 priority values
# Function to determine the final tag based on SYM1 and SYM2 priority values
def determine_final_tag(sym1, sym2, priority_values):
    priority_sym1 = priority_values.get(sym1,0.0)   # Get priority value or default to 0 if not found
    priority_sym2 = priority_values.get(sym2,0.0)   # Get priority value or default to 0 if not found
    if sym1 != 'O' and sym2 != 'O' :
      if sym1 == sym2:
            final_tag = sym2
      else:
            final_tag = sym1 if priority_sym1 > priority_sym2 else sym2

      return final_tag


# Define priority values
priority_values = {'B': 0.7, 'I': 0.5, 'E': 0.3, 'O': 0.0}

# Read words and their tags from BERT's output
bert_words, bert_tags = read_words_with_bie_tags('/content/output.txt')

# Read words and their tags from MLP's output
mlp_words, mlp_tags = read_words_with_bie_tags('/content/output_file.txt')

# Find words with B, I, or E tags for the second column
bie_words = [word for word, tag in zip(mlp_words, mlp_tags) if tag in ['B', 'I', 'E']]

# Determine the maximum width of each column
max_sentence_width = max(len(word) for word in bert_words)
max_bie_width = max(len(word) for word in bie_words)
max_bert_tag_width = max(len(tag) for tag in bert_tags)
max_mlp_tag_width = max(len(tag) for tag in mlp_tags)

# Determine final tags for each word
final_tags = []
for sym1, sym2 in zip(bert_tags, mlp_tags):
    final_tag = determine_final_tag(sym1, sym2, priority_values)
    final_tags.append(final_tag)
# Create data for table
data = []
for word, bert_tag, mlp_tag, final_tag in zip(bert_words, bert_tags, mlp_tags, final_tags):
    bie_word = bie_words.pop(0) if bert_tag in ['B', 'I', 'E'] else ''
    data.append([word, bert_tag, mlp_tag, final_tag])

# Define headers
headers = ["Sentence", "BERT Tags", "MLP Tags", "Final Tags"]

# Print the header
print(tabulate(data, headers=headers, tablefmt="grid"))


+-------------------+-------------+------------+--------------+
| Sentence          | BERT Tags   | MLP Tags   | Final Tags   |
| cough             | B           | B          | B            |
+-------------------+-------------+------------+--------------+
| fever             | B           | B          | B            |
+-------------------+-------------+------------+--------------+
| dyspnea           | B           | B          | B            |
+-------------------+-------------+------------+--------------+
| 52                | O           | O          | O            |
+-------------------+-------------+------------+--------------+
| years             | O           | O          | O            |
+-------------------+-------------+------------+--------------+
| female            | O           | O          | O            |
+-------------------+-------------+------------+--------------+
| spo273            | O           | O          | O            |
+-------------------+-------------+-----

PERFORMANCE EVALUVATION OF INTEGRATED MODEL

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd

# Load data from TSV files
ground_truth_df = pd.read_csv("/content/final_test(testing).tsv", sep="\t", names=['Text', 'NER_Tag'])

# Assuming both files have the same number of rows and same order of data points

# Extract labels from dataframes
y_true = ground_truth_df["NER_Tag"] # Assuming the labels are in the first column
y_pred = final_tags      # Assuming the predicted labels are in the first column

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)

# Calculate precision
precision = precision_score(y_true, y_pred, average='weighted')

# Calculate recall
recall = recall_score(y_true, y_pred, average='weighted')

# Calculate F1 score
f1 = f1_score(y_true, y_pred, average='weighted')

# Generate classification report
report = classification_report(y_true, y_pred)

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.8993811139948094
Precision: 0.9425052661942601
Recall: 0.8993811139948094
F1 Score: 0.9114632922099504


INTEGERATION OF SOCIAL MEDIA'S TEXTS

In [None]:
from tabulate import tabulate

# Function to read sentences with their tags from a file
def read_sentences_with_tags(file_path):
    sentences = []
    with open(file_path, 'r') as f:
        sentence = []
        for line in f:
            line = line.strip()
            if line:  # Non-empty line
                parts = line.split('\t')
                if len(parts) == 2:
                    tag, word = parts
                    sentence.append((word, tag))
            else:  # Empty line indicates end of sentence
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                    sentences.append([])  # Add a blank line between sentences
        if sentence:  # Add the last sentence if not yet added
            sentences.append(sentence)
    return sentences

def determine_final_tag(sym1, sym2, priority_values):
    priority_sym1 = priority_values.get(sym1,0.0)   # Get priority value or default to 0 if not found
    priority_sym2 = priority_values.get(sym2,0.0)   # Get priority value or default to 0 if not found
    if sym1 != 'O' and sym2 != 'O':
      if sym1 == sym2:
          final_tag = sym2
      else:
          final_tag = sym1 if priority_sym1 > priority_sym2 else sym2

    return final_tag

# Define priority values
priority_values = {'B': 0.7, 'I': 0.5, 'E': 0.3, 'O': 0.0}

# Read sentences and their tags from BERT's output
bert_sentences = read_sentences_with_tags('/content/output_BERT.txt')

# Read sentences and their tags from MLP's output
mlp_sentences = read_sentences_with_tags('/content/output_file1.txt')

# Initialize data for table
data = []

# Process each sentence separately
for bert_sentence, mlp_sentence in zip(bert_sentences, mlp_sentences):
    # Initialize sentence data
    sentence_data = []

    # Process each word-tag pair in the sentence
    for (bert_word, bert_tag), (mlp_word, mlp_tag) in zip(bert_sentence, mlp_sentence):
        final_tag = determine_final_tag(bert_tag, mlp_tag, priority_values)
        sentence_data.append((bert_word, bert_tag, mlp_tag, final_tag))

    # Append sentence data to the overall data
    data.extend(sentence_data)
    data.append([])  # Add a blank line between sentences

# Define headers
headers = ["Word", "BERT Tags", "MLP Tags", "Final Tags"]


In [None]:
# Write sentences and final tags into a text file
def write_sentences_with_final_tags(file_path, data):
    with open(file_path, 'a') as f:  # Open file in append mode
        for word, final_tag in data:
            f.write(f"{word}\t{final_tag}\n")
        f.write('\n')  # Add a blank line between sentences

# Process each sentence separately
for bert_sentence, mlp_sentence in zip(bert_sentences, mlp_sentences):
    # Initialize sentence data
    sentence_data = []

    # Process each word-tag pair in the sentence
    for (bert_word, bert_tag), (mlp_word, mlp_tag) in zip(bert_sentence, mlp_sentence):
        final_tag = determine_final_tag(bert_tag, mlp_tag, priority_values)
        sentence_data.append((bert_word, final_tag))

    # Write sentence data to file
    write_sentences_with_final_tags('/content/final_tags.txt', sentence_data)



PERFORMANCE EVALUVATION OF CLASSFIERS TRAINED USING CLINCIAL TEXT DATA LABELLED WITH SEVERITY

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score,classification_report
import numpy as np

# Load the TSV dataset
df = pd.read_excel("/content/Book1.xlsx")

# Preprocess the data
def preprocess_data(data):
    # Drop any NaN values
    data.dropna(inplace=True)
    X = data["Clinical Text"].tolist()  # Assuming the sentences are in the first column
    y= data["Label"].tolist()

    return X, y


# Preprocess the data
X, y = preprocess_data(df)

# Use CountVectorizer to convert text data into numerical features
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)
dt_predictions = dt_classifier.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_predictions)
dt_report = classification_report(y_test, dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
print("Decision Tree Classification Report:")
print(dt_report)

# AdaBoost Classifier
adaboost_classifier = AdaBoostClassifier()
adaboost_classifier.fit(X_train, y_train)
adaboost_predictions = adaboost_classifier.predict(X_test)
adaboost_accuracy = accuracy_score(y_test, adaboost_predictions)
adaboost_report = classification_report(y_test, adaboost_predictions)
print("AdaBoost Accuracy:", adaboost_accuracy)
print("AdaBoost Classification Report:")
print(adaboost_report)

# Support Vector Machine Classifier
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)
svm_predictions = svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_report = classification_report(y_test, svm_predictions)
print("SVM Accuracy:", svm_accuracy)
print("SVM Classification Report:")
print(svm_report)

# Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier()
gb_classifier.fit(X_train, y_train)
gb_predictions = gb_classifier.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_predictions)
gb_report = classification_report(y_test, gb_predictions)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Gradient Boosting Classification Report:")
print(gb_report)

Decision Tree Accuracy: 0.8831858407079646
Decision Tree Classification Report:
              precision    recall  f1-score   support

  non-severe       0.86      0.86      0.86       239
      severe       0.90      0.90      0.90       326

    accuracy                           0.88       565
   macro avg       0.88      0.88      0.88       565
weighted avg       0.88      0.88      0.88       565

AdaBoost Accuracy: 0.9185840707964602
AdaBoost Classification Report:
              precision    recall  f1-score   support

  non-severe       0.90      0.90      0.90       239
      severe       0.93      0.93      0.93       326

    accuracy                           0.92       565
   macro avg       0.92      0.92      0.92       565
weighted avg       0.92      0.92      0.92       565

SVM Accuracy: 0.9026548672566371
SVM Classification Report:
              precision    recall  f1-score   support

  non-severe       0.86      0.92      0.89       239
      severe       0.94    

SEVERITY CLASSIFICATION FOR SOCIAL MEDIA DATASET using CLINICAL TEXT TRAINED MODEL

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Load the Excel file with each sentence in a separate row
new_df = pd.read_excel("/content/socialmedia.xlsx")

# Preprocess the new data
def preprocess_new_data(data):
    # Drop any NaN values
    data.dropna(inplace=True)
    sentences = data["socialmedia"].tolist()  # Assuming the sentences are in the first column
    return sentences

# Preprocess the new data
sentences = preprocess_new_data(new_df)

# Use the same CountVectorizer used for training to transform the new data
X_new_transformed = vectorizer.transform(sentences)

# Use the trained models to predict severity labels for the new data
dt_predictions_new = dt_classifier.predict(X_new_transformed)
rf_predictions_new = rf_classifier.predict(X_new_transformed)
svm_predictions_new = svm_classifier.predict(X_new_transformed)
gb_predictions_new = gb_classifier.predict(X_new_transformed)

for i in range(min(10, len(sentences))):
    print("Sentence:", sentences[i])
    print("Decision Tree Prediction:", dt_predictions_new[i])
    print("Random Forest Prediction:", rf_predictions_new[i])
    print("SVM Prediction:", svm_predictions_new[i])
    print("Gradient Boosting Prediction:", gb_predictions_new[i])
    print()

# Combine all predictions into one list of lists
all_predictions = [dt_predictions_new, rf_predictions_new, svm_predictions_new, gb_predictions_new]

# List to store final predictions
final_predictions = []

# Iterate over each sentence's predictions
for predictions in zip(*all_predictions):
    # Count occurrences of each label
    label_counts = {}
    for label in predictions:
        label_counts[label] = label_counts.get(label, 0) + 1
    # Find the label with maximum count
    final_label = max(label_counts, key=label_counts.get)
    final_predictions.append(final_label)

# Create a DataFrame to store sentences and final predictions
result_df = pd.DataFrame({'Sentence': sentences, 'Final Prediction': final_predictions})

# Write the DataFrame to an Excel file
result_df.to_excel("/content/final_predictions.xlsx", index=False)



Sentence:  developed a dry cough that was pretty aggressive and uncontrollable at times. It subsided for a bit, then came back later in January with the same aggressive fits for a couple days, then transformed into a more mucus-y one.I have had other random symptoms come and go in the past month too: stiff neck, drowsiness, wheezing (also still persists), chest pain and headaches primarily.My cough,shortness of breath have been present for a little over 3 weeks now which has taken a toll on going to the gym and being able to do any physical activity for that matter.
Decision Tree Prediction: severe
Random Forest Prediction: non-severe
SVM Prediction: severe
Gradient Boosting Prediction: non-severe

Sentence: On Sunday night I felt feverish and my body ached,cough,congestion.
Decision Tree Prediction: non-severe
Random Forest Prediction: non-severe
SVM Prediction: non-severe
Gradient Boosting Prediction: non-severe

Sentence:  she never had any severe symptoms, just a headache and felt 

ACCURACY, CLASSIFICAITON REPORT (SEVERITY LABELS USING MAX(PREDICTIONS)

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the TSV dataset
df = pd.read_excel("/content/final_predictions.xlsx")

# Preprocess the data
def preprocess_data(data):
    # Drop any NaN values
    data.dropna(inplace=True)
    X = data["Sentence"].tolist()  # Assuming the sentences are in the first column
    y = data["Final Prediction"].tolist()
    return X, y

# Preprocess the data
X, y = preprocess_data(df)


# Use CountVectorizer to convert text data into numerical features
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Shuffle the data before splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Train and evaluate classifiers...

# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)
dt_predictions = dt_classifier.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_predictions)
dt_precision = precision_score(y_test, dt_predictions, average='weighted')
dt_recall = recall_score(y_test, dt_predictions, average='weighted')
dt_f1_score = f1_score(y_test, dt_predictions, average='weighted')

print("Decision Tree Accuracy:", dt_accuracy)
print("Decision Tree Precision:", dt_precision)
print("Decision Tree Recall:", dt_recall)
print("Decision Tree F1-score:", dt_f1_score)

# AdaBoost Classifier
ada_classifier = AdaBoostClassifier(n_estimators=50, learning_rate=0.1)
ada_classifier.fit(X_train, y_train)
ada_predictions = ada_classifier.predict(X_test)
ada_accuracy = accuracy_score(y_test, ada_predictions)
ada_precision = precision_score(y_test, ada_predictions,average='weighted')
ada_recall = recall_score(y_test, ada_predictions,average='weighted')
ada_f1_score = f1_score(y_test, ada_predictions,average='weighted')

print("AdaBoost Accuracy:", ada_accuracy)
print("AdaBoost Precision:", ada_precision)
print("AdaBoost Recall:", ada_recall)
print("AdaBoost F1-score:", ada_f1_score)

# Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier()
gb_classifier.fit(X_train, y_train)
gb_predictions = gb_classifier.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_predictions)
gb_precision = precision_score(y_test, gb_predictions,average='weighted')
gb_recall = recall_score(y_test, gb_predictions,average='weighted')
gb_f1_score = f1_score(y_test, gb_predictions,average='weighted')

print("Gradient Boosting Accuracy:", gb_accuracy)
print("Gradient Boosting Precision:", gb_precision)
print("Gradient Boosting Recall:", gb_recall)
print("Gradient Boosting F1-score:", gb_f1_score)

# Support Vector Machine Classifier
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)
svm_predictions = svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision = precision_score(y_test, svm_predictions,average='weighted')
svm_recall = recall_score(y_test, svm_predictions,average='weighted')
svm_f1_score = f1_score(y_test, svm_predictions,average='weighted')

print("SVM Accuracy:", svm_accuracy)
print("SVM Precision:", svm_precision)
print("SVM Recall:", svm_recall)
print("SVM F1-score:", svm_f1_score)


Decision Tree Accuracy: 0.6153846153846154
Decision Tree Precision: 0.5871794871794871
Decision Tree Recall: 0.6153846153846154
Decision Tree F1-score: 0.5980335454019664
AdaBoost Accuracy: 0.7692307692307693
AdaBoost Precision: 0.758974358974359
AdaBoost Recall: 0.7692307692307693
AdaBoost F1-score: 0.75882012724118
Gradient Boosting Accuracy: 0.7692307692307693
Gradient Boosting Precision: 0.758974358974359
Gradient Boosting Recall: 0.7692307692307693
Gradient Boosting F1-score: 0.75882012724118
SVM Accuracy: 0.6923076923076923
SVM Precision: 0.47928994082840237
SVM Recall: 0.6923076923076923
SVM F1-score: 0.5664335664335665


  _warn_prf(average, modifier, msg_start, len(result))


SEVERITY ASSIGNING USING NER TAGS BASED ON PRIORITY TESTING

In [None]:
import pandas as pd

# Function to read sentences with their final tags from a file
def read_sentences_with_final_tags(file_path):
    sentences = []
    with open(file_path, 'r') as f:
        sentence = []
        for line in f:
            line = line.strip()
            if line:  # Non-empty line
                parts = line.split('\t')
                if len(parts) == 2:  # Ensure the line has the expected format
                    word, final_tag = parts
                    sentence.append((word, final_tag))
            else:  # Empty line indicates end of sentence
                if sentence:
                    sentences.append(sentence)
                    sentence = []
        if sentence:  # Add the last sentence if not yet added
            sentences.append(sentence)
    return sentences

# Function to predict severity based on the count of symptom indicators (B tags)
def predict_severity(sentences_with_final_tags):
    severity_predictions = []

    for sentence_with_final_tags in sentences_with_final_tags:
        symptom_count = sum(1 for _, tag in sentence_with_final_tags if tag == 'B')
        severity = 'Severe' if symptom_count >= 1 else 'Not Severe'  # Adjust the threshold as needed
        severity_predictions.append((sentence_with_final_tags, severity))

    return severity_predictions

# Read sentences and final tags from the file
sentences_with_final_tags = read_sentences_with_final_tags('/content/final_tags.txt')

# Predict severity based on the count of symptom indicators (B tags)
severity_predictions = predict_severity(sentences_with_final_tags)

# Store the results in a DataFrame
results_df = pd.DataFrame(columns=['Sentence', 'Severity'])
for i, (sentence_with_final_tags, severity) in enumerate(severity_predictions):
    sentence = ' '.join(word for word, _ in sentence_with_final_tags)
    results_df.loc[i] = [sentence, severity]

# Write the DataFrame to an Excel file
results_df.to_excel('/content/severity_predictions.xlsx', index=False)


SEVERITY ASSIGNING USING NER TAGS BASED ON HIGH PRIORITY SYMPTOMS

In [None]:
import pandas as pd
import string

# Function to read sentences with their final tags from a file
def read_sentences_with_final_tags(file_path):
    sentences = []
    with open(file_path, 'r') as f:
        sentence = []
        for line in f:
            line = line.strip()
            if line:  # Non-empty line
                parts = line.split('\t')
                if len(parts) == 2:  # Ensure the line has the expected format
                    word, final_tag = parts
                    # Remove punctuation and extract only words
                    word = ''.join(char for char in word if char not in string.punctuation)
                    if word:  # Non-empty word after removing punctuation
                        sentence.append((word.lower(), final_tag))  # Convert to lowercase
            else:  # Empty line indicates end of sentence
                if sentence:
                    sentences.append(sentence)
                    sentence = []
    return sentences

# Function to predict severity based on the presence of high-priority symptoms
def predict_severity(sentences_with_final_tags):
    severity_predictions = []

    high_priority_symptoms = {'shortness of breath', 'difficulty in breathing', 'chest pain','congestion'}

    for sentence_with_final_tags in sentences_with_final_tags:
        words = [word for word, tag in sentence_with_final_tags]
        sentence = ' '.join(words)
        symptom_count = sum(1 for word, tag in sentence_with_final_tags if tag == 'B')
        if any(symptom in sentence for symptom in high_priority_symptoms):
            severity = 'Severe'
        elif symptom_count > 3:
            severity = 'Severe'
        else:
            severity = 'Not Severe'
        severity_predictions.append((sentence_with_final_tags, severity))

    return severity_predictions

# Read sentences and final tags from the file
sentences_with_final_tags = read_sentences_with_final_tags('/content/final_tags.txt')

# Predict severity based on the presence of high-priority symptoms
severity_predictions = predict_severity(sentences_with_final_tags)

# Print sentences with predicted severity for debugging
for sentence_with_final_tags, severity in severity_predictions:
    sentence = ' '.join(word for word, _ in sentence_with_final_tags)
    print(f"Sentence: {sentence}, Severity: {severity}")

# Store the results in a DataFrame
results_df = pd.DataFrame(columns=['Sentence', 'Severity'])
for i, (sentence_with_final_tags, severity) in enumerate(severity_predictions):
    sentence = ' '.join(word for word, _ in sentence_with_final_tags)
    results_df.loc[i] = [sentence, severity]

# Write the DataFrame to an Excel file
results_df.to_excel('/content/severity_predictions.xlsx', index=False)


Sentence: shortness of breath cough chest pain developed a dry that was pretty aggressive and uncontrollable at times it subsided for bit then came back later in january with the same fits couple days transformed into more mucus one i have had other random symptoms come go past month too stiff neck drowsiness wheezing also still persists headaches primarily my have been present little over 3 weeks now which has taken toll on going to gym being able do any physical activity matter, Severity: Severe
Sentence: on sunday night i felt feverish and my body ached, Severity: Not Severe
Sentence: fever headache she never had any severe symptoms just a headache and felt weird by evening i was feeling like achey cold numbness in my face mouth woke up with scratchy throat, Severity: Severe
Sentence: cough headache i in so much pain worst headache of my life feverish sweaty heartburn exhausted but, Severity: Severe
Sentence: fever sore throat headache day 1 scratchy 2 massive headache mild low grad

SAMPLE OUTPUT FOR SOCIAL MEDIA POSTS FROM MODEL TRAINED USING NER TAGS

In [None]:
import random

# Fit the Decision Tree Classifier
dt_classifier.fit(X_train_tfidf, y_train)
gb_classifier.fit(X_train_tfidf, y_train)
ada_classifier.fit(X_train_tfidf, y_train)
svm_classifier_tuned.fit(X_train_tfidf, y_train)
# Decision Tree Classifier
print("Decision Tree Predictions:")
for i in range(10):
    sentence = X_test[i]
    prediction = dt_classifier.predict(tfidf_vectorizer.transform([sentence]))[0]
    print(f"Sentence {i+1}: {sentence} - Predicted Severity: {prediction}")

# Gradient Boosting Classifier
print("\nGradient Boosting Predictions:")
for i in range(10):
    sentence = X_test[i]
    prediction = gb_classifier.predict(tfidf_vectorizer.transform([sentence]))[0]
    print(f"Sentence {i+1}: {sentence} - Predicted Severity: {prediction}")

# AdaBoost Classifier
print("\nAdaBoost Predictions:")
for i in range(10):
    sentence = X_test[i]
    prediction = ada_classifier.predict(tfidf_vectorizer.transform([sentence]))[0]
    print(f"Sentence {i+1}: {sentence} - Predicted Severity: {prediction}")

# Support Vector Machine Classifier
print("\nSupport Vector Machine Predictions:")
for i in range(10):
    sentence = X_test[i]
    prediction = svm_classifier_tuned.predict(tfidf_vectorizer.transform([sentence]))[0]
    print(f"Sentence {i+1}: {sentence} - Predicted Severity: {prediction}")



Decision Tree Predictions:
Sentence 1: chills sore throat headache I was Covid two weeks ago had a - Predicted Severity: Not Severe
Sentence 2: shortness of breath chest pain So I have the and - Predicted Severity: Severe
Sentence 3: chest pain Ive had other random symptoms come and go in the past month too stiff neck drowsiness wheezing also still persists headaches primarily - Predicted Severity: Not Severe
Sentence 4: cough fatigue sore throat I am on day 18 fully vaccinated and boosted still having difficulty with sinusitis a wicked nonproductive croupy - Predicted Severity: Not Severe
Sentence 5: shortness of breath fever chills sore throat nausea Last week I fell really ill with a bad and body ache My lasted from Tuesday to - Predicted Severity: Severe
Sentence 6: cough fever sore throat Congestion So far symptoms have been pretty mild in - Predicted Severity: Not Severe
Sentence 7: fatigue Super body hurts all over congested and coughing up junk from my throatOh taste smell are 

ACCURACY , CLASSIFICATION REPORT FOR SOCIAL MEDIA DATASET (NER TAG BASED SEVERITY)

In [None]:
import warnings
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the TSV dataset
df = pd.read_excel("/content/severity_predictions (4).xlsx")

# Preprocess the data
def preprocess_data(data):
    # Drop any NaN values
    data.dropna(inplace=True)
    X = data["Sentence"].tolist()
    y = data["Severity"].tolist()
    return X, y

# Preprocess the data
X, y = preprocess_data(df)

# Limiting the amount of training data to reduce overfitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train[:int(len(X_train)*0.5)]
y_train = y_train[:int(len(y_train)*0.5)]

# Convert y_train to a NumPy array
y_train = np.array(y_train)

# Use TF-IDF to convert text data into numerical features
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Define custom StratifiedKFold with minimum one instance of each class in each fold
class CustomStratifiedKFold(StratifiedKFold):
    def __init__(self, n_splits=10, shuffle=False, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def split(self, X, y, groups=None):
        for train_index, test_index in super().split(X, y, groups):
            unique_classes, counts = np.unique(y[train_index], return_counts=True)
            if np.all(counts >= 1):
                yield train_index, test_index
            else:
                # In case a fold doesn't have at least one instance of each class,
                # yield a split with all instances in the training set
                yield np.concatenate((train_index, test_index)), []

# Define cross-validation strategy
cv = CustomStratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(max_depth=6)
dt_predictions = cross_val_predict(dt_classifier, X_train_tfidf, y_train, cv=cv)
dt_precision = precision_score(y_train, dt_predictions, average='weighted')
dt_recall = recall_score(y_train, dt_predictions, average='weighted')
dt_f1_score = f1_score(y_train, dt_predictions, average='weighted')
dt_accuracy = accuracy_score(y_train, dt_predictions)
dt_report = classification_report(y_train, dt_predictions)
print("Decision Tree Precision:", dt_precision)
print("Decision Tree Recall:", dt_recall)
print("Decision Tree F1-score:", dt_f1_score)
print("Decision Tree Accuracy:", dt_accuracy)


# Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(max_depth=4, n_estimators=120)
gb_predictions = cross_val_predict(gb_classifier, X_train_tfidf, y_train, cv=cv)
gb_precision = precision_score(y_train, gb_predictions, average='weighted')
gb_recall = recall_score(y_train, gb_predictions, average='weighted')
gb_f1_score = f1_score(y_train, gb_predictions, average='weighted')
gb_accuracy = accuracy_score(y_train, gb_predictions)
gb_report = classification_report(y_train, gb_predictions)
print("Gradient Boosting Precision:", gb_precision)
print("Gradient Boosting Recall:", gb_recall)
print("Gradient Boosting F1-score:", gb_f1_score)
print("Gradient Boosting Accuracy:", gb_accuracy)


# AdaBoost Classifier
ada_classifier = AdaBoostClassifier(n_estimators=60, learning_rate=0.05)
ada_predictions = cross_val_predict(ada_classifier, X_train_tfidf, y_train, cv=cv)
ada_precision = precision_score(y_train, ada_predictions, average='weighted')
ada_recall = recall_score(y_train, ada_predictions, average='weighted')
ada_f1_score = f1_score(y_train, ada_predictions, average='weighted')
ada_accuracy = accuracy_score(y_train, ada_predictions)
ada_report = classification_report(y_train, ada_predictions)
print("AdaBoost Precision:", ada_precision)
print("AdaBoost Recall:", ada_recall)
print("AdaBoost F1-score:", ada_f1_score)
print("AdaBoost Accuracy:", ada_accuracy)


# Support Vector Machine Classifier
svm_classifier_tuned = SVC(C=10, kernel='linear', gamma='scale')
svm_predictions = cross_val_predict(svm_classifier_tuned, X_train_tfidf, y_train, cv=cv)
svm_precision = precision_score(y_train, svm_predictions, average='weighted')
svm_recall = recall_score(y_train, svm_predictions, average='weighted')
svm_f1_score = f1_score(y_train, svm_predictions, average='weighted')
svm_accuracy = accuracy_score(y_train, svm_predictions)
svm_report = classification_report(y_train, svm_predictions)
print("Support Vector Machine Precision:", svm_precision)
print("Support Vector Machine Recall:", svm_recall)
print("Support Vector Machine F1-score:", svm_f1_score)
print("Support Vector Machine Accuracy:", svm_accuracy)


Decision Tree Precision: 0.9387799564270153
Decision Tree Recall: 0.9411764705882353
Decision Tree F1-score: 0.9392444964156796
Decision Tree Accuracy: 0.9411764705882353
Gradient Boosting Precision: 0.9624893435635125
Gradient Boosting Recall: 0.9607843137254902
Gradient Boosting F1-score: 0.9579520697167756
Gradient Boosting Accuracy: 0.9607843137254902
AdaBoost Precision: 0.9624893435635125
AdaBoost Recall: 0.9607843137254902
AdaBoost F1-score: 0.9579520697167756
AdaBoost Accuracy: 0.9607843137254902
Support Vector Machine Precision: 0.9281045751633986
Support Vector Machine Recall: 0.9215686274509803
Support Vector Machine F1-score: 0.9075873827791987
Support Vector Machine Accuracy: 0.9215686274509803


Labelling of Severity based on N gram

In [None]:
import pandas as pd

# Function to read sentences with their final tags from a file
def read_sentences_with_final_tags(file_path):
    sentences = []
    with open(file_path, 'r') as f:
        sentence = []
        for line in f:
            line = line.strip().
            if line:  # Non-empty line
                parts = line.split('\t')
                if len(parts) == 2:  # Ensure the line has the expected format
                    word, final_tag = parts
                    sentence.append((word, final_tag))
            else:  # Empty line indicates end of sentence
                if sentence:
                    sentences.append(sentence)
                    sentence = []
    return sentences

# Function to count the number of symptom words based on tags for n-grams
def count_symptoms_ngram(ngram_with_tags, n):
    if n == 1:
        return sum(1 for word, tag in ngram_with_tags if tag == 'B')
    elif n == 2:
        return sum(1 for word, tag in ngram_with_tags if tag == 'B' or tag == 'E')
    elif n == 3:
        return sum(1 for word, tag in ngram_with_tags if tag == 'B' or tag == 'I' or tag == 'E')

# Function to create n-grams from a list of words
def generate_ngrams(words, n):
    ngrams = []
    for i in range(len(words) - n + 1):
        ngram = words[i:i+n]
        ngrams.append(ngram)
    return ngrams

# Function to extract n-grams with their final tags
def extract_ngrams_with_tags(sentences_with_final_tags, n):
    ngrams_with_tags = []
    for sentence_with_final_tags in sentences_with_final_tags:
        words, tags = zip(*sentence_with_final_tags)
        ngrams = generate_ngrams(words, n)
        tags_ngrams = generate_ngrams(tags, n)
        for ngram, tag_ngram in zip(ngrams, tags_ngrams):
            ngram_with_tags = [(word, tag) for word, tag in zip(ngram, tag_ngram)]
            ngrams_with_tags.append(ngram_with_tags)
    return ngrams_with_tags

# Function to predict severity based on the count of symptom words in n-grams
def predict_severity_ngram(sentences_with_final_tags, n):
    severity_predictions = []

    for sentence_with_final_tags in sentences_with_final_tags:
        ngrams_with_tags = extract_ngrams_with_tags([sentence_with_final_tags], n)
        symptom_count = count_symptoms_ngram(ngrams_with_tags[0], n)
        if symptom_count >= 3:
            severity = 'Severe'
        else:
            severity = 'Not Severe'
        severity_predictions.append((sentence_with_final_tags, severity))

    return severity_predictions

# Read sentences and final tags from the file
sentences_with_final_tags = read_sentences_with_final_tags('/content/final_tags.txt')

# Predict severity based on the count of symptom words for each n-gram
for n in range(1, 4):
    severity_predictions = predict_severity_ngram(sentences_with_final_tags, n)

    # Store the results in a DataFrame
    results_df = pd.DataFrame(columns=['Sentence', 'Severity'])
    for i, (sentence_with_final_tags, severity) in enumerate(severity_predictions):
        sentence = ' '.join(word for word, _ in sentence_with_final_tags)
        results_df.loc[i] = [sentence, severity]

    # Write the DataFrame to an Excel file
    results_df.to_excel(f'/content/severity_predictions_{n}gram.xlsx', index=False)



PERFORMANCE EVALUVATION OF 2 GRAM

In [None]:
!python --version

Python 3.10.12


In [None]:
3import warnings
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict,train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Load the TSV dataset
df = pd.read_excel("/content/severity_predictions_2gram.xlsx")

# Preprocess the data
def preprocess_data(data):
    # Drop any NaN values
    data.dropna(inplace=True)
    X = data["Sentence"].tolist()
    y = data["Severity"].tolist()
    return X, y

# Preprocess the data
X, y = preprocess_data(df)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train[:int(len(X_train)*0.5)]
y_train = y_train[:int(len(y_train)*0.5)]

# Convert y_train to a NumPy array
y_train = np.array(y_train)

# Use TF-IDF to convert text data into numerical features
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Define custom StratifiedKFold with minimum one instance of each class in each fold
class CustomStratifiedKFold(StratifiedKFold):
    def __init__(self, n_splits=10, shuffle=False, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def split(self, X, y, groups=None):
        for train_index, test_index in super().split(X, y, groups):
            unique_classes, counts = np.unique(y[train_index], return_counts=True)
            if np.all(counts >= 1):
                yield train_index, test_index
            else:
                # In case a fold doesn't have at least one instance of each class,
                # yield a split with all instances in the training set
                yield np.concatenate((train_index, test_index)), []

# Define cross-validation strategy
cv = CustomStratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(max_depth=6)
dt_predictions = cross_val_predict(dt_classifier, X_train_tfidf, y_train, cv=cv)
dt_precision = precision_score(y_train, dt_predictions, average='weighted')
dt_recall = recall_score(y_train, dt_predictions, average='weighted')
dt_f1_score = f1_score(y_train, dt_predictions, average='weighted')
dt_accuracy = accuracy_score(y_train, dt_predictions)
dt_report = classification_report(y_train, dt_predictions)
print("Decision Tree Precision:", dt_precision)
print("Decision Tree Recall:", dt_recall)
print("Decision Tree F1-score:", dt_f1_score)
print("Decision Tree Accuracy:", dt_accuracy)


# Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(max_depth=4, n_estimators=120)
gb_predictions = cross_val_predict(gb_classifier, X_train_tfidf, y_train, cv=cv)
gb_precision = precision_score(y_train, gb_predictions, average='weighted')
gb_recall = recall_score(y_train, gb_predictions, average='weighted')
gb_f1_score = f1_score(y_train, gb_predictions, average='weighted')
gb_accuracy = accuracy_score(y_train, gb_predictions)
gb_report = classification_report(y_train, gb_predictions)
print("Gradient Boosting Precision:", gb_precision)
print("Gradient Boosting Recall:", gb_recall)
print("Gradient Boosting F1-score:", gb_f1_score)
print("Gradient Boosting Accuracy:", gb_accuracy)


# AdaBoost Classifier
ada_classifier = AdaBoostClassifier(n_estimators=60, learning_rate=0.05)
ada_predictions = cross_val_predict(ada_classifier, X_train_tfidf, y_train, cv=cv)
ada_precision = precision_score(y_train, ada_predictions, average='weighted')
ada_recall = recall_score(y_train, ada_predictions, average='weighted')
ada_f1_score = f1_score(y_train, ada_predictions, average='weighted')
ada_accuracy = accuracy_score(y_train, ada_predictions)
ada_report = classification_report(y_train, ada_predictions)
print("AdaBoost Precision:", ada_precision)
print("AdaBoost Recall:", ada_recall)
print("AdaBoost F1-score:", ada_f1_score)
print("AdaBoost Accuracy:", ada_accuracy)


# Support Vector Machine Classifier
svm_classifier_tuned = SVC(C=10, kernel='linear', gamma='scale')
svm_predictions = cross_val_predict(svm_classifier_tuned, X_train_tfidf, y_train, cv=cv)
svm_precision = precision_score(y_train, svm_predictions, average='weighted')
svm_recall = recall_score(y_train, svm_predictions, average='weighted')
svm_f1_score = f1_score(y_train, svm_predictions, average='weighted')
svm_accuracy = accuracy_score(y_train, svm_predictions)
svm_report = classification_report(y_train, svm_predictions)
print("Support Vector Machine Precision:", svm_precision)
print("Support Vector Machine Recall:", svm_recall)
print("Support Vector Machine F1-score:", svm_f1_score)
print("Support Vector Machine Accuracy:", svm_accuracy)

Decision Tree Precision: 0.6169696969696971
Decision Tree Recall: 0.68
Decision Tree F1-score: 0.632
Decision Tree Accuracy: 0.68
Gradient Boosting Precision: 0.51
Gradient Boosting Recall: 0.68
Gradient Boosting F1-score: 0.582857142857143
Gradient Boosting Accuracy: 0.68
AdaBoost Precision: 0.49090909090909085
AdaBoost Recall: 0.6
AdaBoost F1-score: 0.5399999999999999
AdaBoost Accuracy: 0.6
Support Vector Machine Precision: 0.5184
Support Vector Machine Recall: 0.72
Support Vector Machine F1-score: 0.6027906976744185
Support Vector Machine Accuracy: 0.72


PERFORMANCE EVALUVATION OF 3 GRAM

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import warnings
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import numpy as np

# Load the TSV dataset
df = pd.read_excel("/content/severity_predictions_3gram.xlsx")

# Preprocess the data
def preprocess_data(data):
    # Drop any NaN values
    data.dropna(inplace=True)
    X = data["Sentence"].tolist()
    y = data["Severity"].tolist()
    return X, y

# Preprocess the data
X, y = preprocess_data(df)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train[:int(len(X_train)*0.5)]
y_train = y_train[:int(len(y_train)*0.5)]  # Correspondingly reduce labels

# Convert y_train to a NumPy array
y_train = np.array(y_train)

# Use TF-IDF to convert text data into numerical features
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Define custom StratifiedKFold with minimum one instance of each class in each fold
class CustomStratifiedKFold(StratifiedKFold):
    def __init__(self, n_splits=10, shuffle=False, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def split(self, X, y, groups=None):
        for train_index, test_index in super().split(X, y, groups):
            unique_classes, counts = np.unique(y[train_index], return_counts=True)
            if np.all(counts >= 1):
                yield train_index, test_index
            else:
                # In case a fold doesn't have at least one instance of each class,
                # yield a split with all instances in the training set
                yield np.concatenate((train_index, test_index)), []

# Define cross-validation strategy
cv = CustomStratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(max_depth=6)
dt_predictions = cross_val_predict(dt_classifier, X_train_tfidf, y_train, cv=cv)
dt_precision = precision_score(y_train, dt_predictions, average='weighted')
dt_recall = recall_score(y_train, dt_predictions, average='weighted')
dt_f1_score = f1_score(y_train, dt_predictions, average='weighted')
dt_accuracy = accuracy_score(y_train, dt_predictions)
dt_report = classification_report(y_train, dt_predictions)
print("Decision Tree Precision:", dt_precision)
print("Decision Tree Recall:", dt_recall)
print("Decision Tree F1-score:", dt_f1_score)
print("Decision Tree Accuracy:", dt_accuracy)


# Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(max_depth=4, n_estimators=120)
gb_predictions = cross_val_predict(gb_classifier, X_train_tfidf, y_train, cv=cv)
gb_precision = precision_score(y_train, gb_predictions, average='weighted')
gb_recall = recall_score(y_train, gb_predictions, average='weighted')
gb_f1_score = f1_score(y_train, gb_predictions, average='weighted')
gb_accuracy = accuracy_score(y_train, gb_predictions)
gb_report = classification_report(y_train, gb_predictions)
print("Gradient Boosting Precision:", gb_precision)
print("Gradient Boosting Recall:", gb_recall)
print("Gradient Boosting F1-score:", gb_f1_score)
print("Gradient Boosting Accuracy:", gb_accuracy)


# AdaBoost Classifier
ada_classifier = AdaBoostClassifier(n_estimators=60, learning_rate=0.05)
ada_predictions = cross_val_predict(ada_classifier, X_train_tfidf, y_train, cv=cv)
ada_precision = precision_score(y_train, ada_predictions, average='weighted')
ada_recall = recall_score(y_train, ada_predictions, average='weighted')
ada_f1_score = f1_score(y_train, ada_predictions, average='weighted')
ada_accuracy = accuracy_score(y_train, ada_predictions)
ada_report = classification_report(y_train, ada_predictions)
print("AdaBoost Precision:", ada_precision)
print("AdaBoost Recall:", ada_recall)
print("AdaBoost F1-score:", ada_f1_score)
print("AdaBoost Accuracy:", ada_accuracy)


# Support Vector Machine Classifier
svm_classifier_tuned = SVC(C=10, kernel='linear', gamma='scale')
svm_predictions = cross_val_predict(svm_classifier_tuned, X_train_tfidf, y_train, cv=cv)
svm_precision = precision_score(y_train, svm_predictions, average='weighted')
svm_recall = recall_score(y_train, svm_predictions, average='weighted')
svm_f1_score = f1_score(y_train, svm_predictions, average='weighted')
svm_accuracy = accuracy_score(y_train, svm_predictions)
svm_report = classification_report(y_train, svm_predictions)
print("Support Vector Machine Precision:", svm_precision)
print("Support Vector Machine Recall:", svm_recall)
print("Support Vector Machine F1-score:", svm_f1_score)
print("Support Vector Machine Accuracy:", svm_accuracy)



Decision Tree Precision: 0.8678260869565217
Decision Tree Recall: 0.84
Decision Tree F1-score: 0.8076190476190476
Decision Tree Accuracy: 0.84
Gradient Boosting Precision: 0.8416666666666666
Gradient Boosting Recall: 0.8
Gradient Boosting F1-score: 0.7401993355481729
Gradient Boosting Accuracy: 0.8
AdaBoost Precision: 0.5617391304347826
AdaBoost Recall: 0.68
AdaBoost F1-score: 0.6152380952380953
AdaBoost Accuracy: 0.68
Support Vector Machine Precision: 0.57
Support Vector Machine Recall: 0.72
Support Vector Machine F1-score: 0.636279069767442
Support Vector Machine Accuracy: 0.72
