<a href="https://colab.research.google.com/github/nanditha-varma/nlphackathon/blob/main/nlphackathonf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing libraries**

In [1]:
import pandas as pd
import numpy as np
import re
import string
import torch
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.tree import DecisionTreeClassifier

**Necessary NLTK resources**

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


**Loading Dataset**

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [4]:
from datasets import load_dataset

ds = load_dataset("ade-benchmark-corpus/ade_corpus_v2", "Ade_corpus_v2_classification")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/23516 [00:00<?, ? examples/s]

In [5]:
df = pd.DataFrame(ds['train'])
df.head()

Unnamed: 0,text,label
0,Intravenous azithromycin-induced ototoxicity.,1
1,"Immobilization, while Paget's bone disease was...",1
2,Unaccountable severe hypercalcemia in a patien...,1
3,METHODS: We report two cases of pseudoporphyri...,1
4,METHODS: We report two cases of pseudoporphyri...,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23516 entries, 0 to 23515
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    23516 non-null  object
 1   label   23516 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 367.6+ KB


**Data Preprocessing**

In [7]:
# Handling missing values
df['text'].fillna("No Text Provided", inplace=True)  # Replace missing text with "No Text Provided"
df['label'].fillna(df['label'].mode()[0], inplace=True)  # Replace missing labels with most frequent label

# Initialize NLP tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# 1. Lowercasing
df['lowercased_text'] = df['text'].str.lower()
print("After Lowercasing:\n", df[['text', 'lowercased_text']].head())

# 2. Removing special characters and punctuation
df['cleaned_text'] = df['lowercased_text'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))
print("\nAfter Removing Special Characters & Punctuation:\n", df[['lowercased_text', 'cleaned_text']].head())

# 3. Tokenization
df['tokenized_text'] = df['cleaned_text'].apply(word_tokenize)
print("\nAfter Tokenization:\n", df[['cleaned_text', 'tokenized_text']].head())

# 4. Stopword Removal
df['stopword_removed_text'] = df['tokenized_text'].apply(lambda tokens: [word for word in tokens if word not in stop_words])
print("\nAfter Stopword Removal:\n", df[['tokenized_text', 'stopword_removed_text']].head())

# 5. Lemmatization
df['lemmatized_text'] = df['stopword_removed_text'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])
df['final_clean_text'] = df['lemmatized_text'].apply(lambda tokens: ' '.join(tokens))
print("\nAfter Lemmatization:\n", df[['stopword_removed_text', 'lemmatized_text']].head())

# Display the preprocessed dataset
# Instead of ace_tools, use IPython's display
print("Preprocessed ADE Corpus V2 Dataset:")  # Added a descriptive print statement
display(df) # Use display(df) to show the dataframe

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text'].fillna("No Text Provided", inplace=True)  # Replace missing text with "No Text Provided"
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['label'].fillna(df['label'].mode()[0], inplace=True)  # Replace missing labels with most frequent label


After Lowercasing:
                                                 text  \
0      Intravenous azithromycin-induced ototoxicity.   
1  Immobilization, while Paget's bone disease was...   
2  Unaccountable severe hypercalcemia in a patien...   
3  METHODS: We report two cases of pseudoporphyri...   
4  METHODS: We report two cases of pseudoporphyri...   

                                     lowercased_text  
0      intravenous azithromycin-induced ototoxicity.  
1  immobilization, while paget's bone disease was...  
2  unaccountable severe hypercalcemia in a patien...  
3  methods: we report two cases of pseudoporphyri...  
4  methods: we report two cases of pseudoporphyri...  

After Removing Special Characters & Punctuation:
                                      lowercased_text  \
0      intravenous azithromycin-induced ototoxicity.   
1  immobilization, while paget's bone disease was...   
2  unaccountable severe hypercalcemia in a patien...   
3  methods: we report two cases of pse

Unnamed: 0,text,label,lowercased_text,cleaned_text,tokenized_text,stopword_removed_text,lemmatized_text,final_clean_text
0,Intravenous azithromycin-induced ototoxicity.,1,intravenous azithromycin-induced ototoxicity.,intravenous azithromycininduced ototoxicity,"[intravenous, azithromycininduced, ototoxicity]","[intravenous, azithromycininduced, ototoxicity]","[intravenous, azithromycininduced, ototoxicity]",intravenous azithromycininduced ototoxicity
1,"Immobilization, while Paget's bone disease was...",1,"immobilization, while paget's bone disease was...",immobilization while pagets bone disease was p...,"[immobilization, while, pagets, bone, disease,...","[immobilization, pagets, bone, disease, presen...","[immobilization, paget, bone, disease, present...",immobilization paget bone disease present perh...
2,Unaccountable severe hypercalcemia in a patien...,1,unaccountable severe hypercalcemia in a patien...,unaccountable severe hypercalcemia in a patien...,"[unaccountable, severe, hypercalcemia, in, a, ...","[unaccountable, severe, hypercalcemia, patient...","[unaccountable, severe, hypercalcemia, patient...",unaccountable severe hypercalcemia patient tre...
3,METHODS: We report two cases of pseudoporphyri...,1,methods: we report two cases of pseudoporphyri...,methods we report two cases of pseudoporphyria...,"[methods, we, report, two, cases, of, pseudopo...","[methods, report, two, cases, pseudoporphyria,...","[method, report, two, case, pseudoporphyria, c...",method report two case pseudoporphyria caused ...
4,METHODS: We report two cases of pseudoporphyri...,1,methods: we report two cases of pseudoporphyri...,methods we report two cases of pseudoporphyria...,"[methods, we, report, two, cases, of, pseudopo...","[methods, report, two, cases, pseudoporphyria,...","[method, report, two, case, pseudoporphyria, c...",method report two case pseudoporphyria caused ...
...,...,...,...,...,...,...,...,...
23511,"At autopsy, the liver was found to be small, s...",0,"at autopsy, the liver was found to be small, s...",at autopsy the liver was found to be small shr...,"[at, autopsy, the, liver, was, found, to, be, ...","[autopsy, liver, found, small, shrunken, scarr...","[autopsy, liver, found, small, shrunken, scarr...",autopsy liver found small shrunken scarred his...
23512,"Physical exam revealed a patient with aphasia,...",0,"physical exam revealed a patient with aphasia,...",physical exam revealed a patient with aphasia ...,"[physical, exam, revealed, a, patient, with, a...","[physical, exam, revealed, patient, aphasia, t...","[physical, exam, revealed, patient, aphasia, t...",physical exam revealed patient aphasia tremor ...
23513,At the time when the leukemia appeared seven o...,0,at the time when the leukemia appeared seven o...,at the time when the leukemia appeared seven o...,"[at, the, time, when, the, leukemia, appeared,...","[time, leukemia, appeared, seven, patients, co...","[time, leukemia, appeared, seven, patient, com...",time leukemia appeared seven patient complete ...
23514,The American Society for Regional Anesthesia a...,0,the american society for regional anesthesia a...,the american society for regional anesthesia a...,"[the, american, society, for, regional, anesth...","[american, society, regional, anesthesia, pain...","[american, society, regional, anesthesia, pain...",american society regional anesthesia pain medi...


**BOW AND** **TDIDF**

In [11]:
from sklearn.feature_extraction.text import CountVectorizer # Import CountVectorizer

vectorizer = CountVectorizer(max_features=5000, min_df=1, binary=True)
X_bow = vectorizer.fit_transform(df["final_clean_text"])
bow_df = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())

print("\nBinary BoW Representation:")
display(bow_df.head())



Binary BoW Representation:


Unnamed: 0,aa,abated,abdomen,abdominal,ability,ablation,able,abnormal,abnormality,abnormally,...,zafirlukast,zidovudine,ziprasidone,zoledronic,zolpidem,zomepirac,zone,zonisamide,zoster,zygomycosis
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, min_df=2)  # Using the cleaned vocabulary
X_tfidf = tfidf_vectorizer.fit_transform(df["final_clean_text"])
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print("\nTF-IDF Representation After Fixing:")
display(tfidf_df.head())



TF-IDF Representation After Fixing:


Unnamed: 0,aa,abated,abdomen,abdominal,ability,able,abnormal,abnormality,abnormally,abortion,...,zafirlukast,zidovudine,ziprasidone,zoledronic,zolpidem,zomepirac,zone,zonisamide,zoster,zygomycosis
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Word embedding** -Glove

In [13]:
pip install gensim


Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully 

In [14]:
import gensim.downloader as api

# Load pre-trained GloVe embeddings (100-dimensional vectors)
glove_model = api.load("glove-wiki-gigaword-100")  # You can change to "glove-wiki-gigaword-300" for 300D




In [16]:
import numpy as np

# Function to convert words into GloVe vectors
def get_embedding(text):
    words = text.split()
    word_vectors = [glove_model[word] for word in words if word in glove_model]  # Get vectors for each word
    if len(word_vectors) == 0:
        return np.zeros(100)  # If no word has a vector, return zero vector
    return np.mean(word_vectors, axis=0)  # Compute the average vector for the sentence

# Apply embedding function to dataset
df["glove_embedding"] = df["final_clean_text"].apply(get_embedding)


In [17]:
# Convert list of arrays into a NumPy matrix
glove_matrix = np.vstack(df["glove_embedding"].values)

# Print shape of the embeddings matrix
print("Shape of Word Embeddings Matrix:", glove_matrix.shape)


Shape of Word Embeddings Matrix: (23516, 100)


In [18]:
# Convert list of arrays into a NumPy matrix
glove_matrix = np.vstack(df["glove_embedding"].values)

# Print shape of the embeddings matrix
print("Shape of Word Embeddings Matrix:", glove_matrix.shape)


Shape of Word Embeddings Matrix: (23516, 100)


In [19]:
np.save("glove_word_embeddings.npy", glove_matrix)  # Save embeddings as NumPy file
df.to_csv("dataset_with_glove.csv", index=False)  # Save dataset with embeddings
print("Word Embeddings Saved Successfully!")
df.head(10)


Word Embeddings Saved Successfully!


Unnamed: 0,text,label,lowercased_text,cleaned_text,tokenized_text,stopword_removed_text,lemmatized_text,final_clean_text,glove_embedding
0,Intravenous azithromycin-induced ototoxicity.,1,intravenous azithromycin-induced ototoxicity.,intravenous azithromycininduced ototoxicity,"[intravenous, azithromycininduced, ototoxicity]","[intravenous, azithromycininduced, ototoxicity]","[intravenous, azithromycininduced, ototoxicity]",intravenous azithromycininduced ototoxicity,"[0.62649, 0.59024, -0.46752, -0.12197, 0.02518..."
1,"Immobilization, while Paget's bone disease was...",1,"immobilization, while paget's bone disease was...",immobilization while pagets bone disease was p...,"[immobilization, while, pagets, bone, disease,...","[immobilization, pagets, bone, disease, presen...","[immobilization, paget, bone, disease, present...",immobilization paget bone disease present perh...,"[-0.10438116, 0.15415515, -0.034131918, -0.013..."
2,Unaccountable severe hypercalcemia in a patien...,1,unaccountable severe hypercalcemia in a patien...,unaccountable severe hypercalcemia in a patien...,"[unaccountable, severe, hypercalcemia, in, a, ...","[unaccountable, severe, hypercalcemia, patient...","[unaccountable, severe, hypercalcemia, patient...",unaccountable severe hypercalcemia patient tre...,"[-0.13617, 0.11925101, 0.21940498, 0.16925499,..."
3,METHODS: We report two cases of pseudoporphyri...,1,methods: we report two cases of pseudoporphyri...,methods we report two cases of pseudoporphyria...,"[methods, we, report, two, cases, of, pseudopo...","[methods, report, two, cases, pseudoporphyria,...","[method, report, two, case, pseudoporphyria, c...",method report two case pseudoporphyria caused ...,"[-0.099353336, 0.1599615, 0.008972635, -0.1410..."
4,METHODS: We report two cases of pseudoporphyri...,1,methods: we report two cases of pseudoporphyri...,methods we report two cases of pseudoporphyria...,"[methods, we, report, two, cases, of, pseudopo...","[methods, report, two, cases, pseudoporphyria,...","[method, report, two, case, pseudoporphyria, c...",method report two case pseudoporphyria caused ...,"[-0.099353336, 0.1599615, 0.008972635, -0.1410..."
5,"Naproxen, the most common offender, has been a...",1,"naproxen, the most common offender, has been a...",naproxen the most common offender has been ass...,"[naproxen, the, most, common, offender, has, b...","[naproxen, common, offender, associated, dimor...","[naproxen, common, offender, associated, dimor...",naproxen common offender associated dimorphic ...,"[-0.020224743, 0.41874897, -0.01997984, 0.1261..."
6,RESULTS: A 44-year-old man taking naproxen for...,1,results: a 44-year-old man taking naproxen for...,results a yearold man taking naproxen for chro...,"[results, a, yearold, man, taking, naproxen, f...","[results, yearold, man, taking, naproxen, chro...","[result, yearold, man, taking, naproxen, chron...",result yearold man taking naproxen chronic low...,"[0.01188874, 0.2260408, 0.16673952, -0.0494708..."
7,RESULTS: A 44-year-old man taking naproxen for...,1,results: a 44-year-old man taking naproxen for...,results a yearold man taking naproxen for chro...,"[results, a, yearold, man, taking, naproxen, f...","[results, yearold, man, taking, naproxen, chro...","[result, yearold, man, taking, naproxen, chron...",result yearold man taking naproxen chronic low...,"[0.01188874, 0.2260408, 0.16673952, -0.0494708..."
8,RESULTS: A 44-year-old man taking naproxen for...,1,results: a 44-year-old man taking naproxen for...,results a yearold man taking naproxen for chro...,"[results, a, yearold, man, taking, naproxen, f...","[results, yearold, man, taking, naproxen, chro...","[result, yearold, man, taking, naproxen, chron...",result yearold man taking naproxen chronic low...,"[0.01188874, 0.2260408, 0.16673952, -0.0494708..."
9,RESULTS: A 44-year-old man taking naproxen for...,1,results: a 44-year-old man taking naproxen for...,results a yearold man taking naproxen for chro...,"[results, a, yearold, man, taking, naproxen, f...","[results, yearold, man, taking, naproxen, chro...","[result, yearold, man, taking, naproxen, chron...",result yearold man taking naproxen chronic low...,"[0.01188874, 0.2260408, 0.16673952, -0.0494708..."


**Word2vec**

In [21]:
import gensim
import nltk
nltk.download("punkt")

# Tokenize each sentence into words
df["tokenized_text"] = df["final_clean_text"].apply(nltk.word_tokenize)

# Print first few tokenized sentences
print(df["tokenized_text"].head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0      [intravenous, azithromycininduced, ototoxicity]
1    [immobilization, paget, bone, disease, present...
2    [unaccountable, severe, hypercalcemia, patient...
3    [method, report, two, case, pseudoporphyria, c...
4    [method, report, two, case, pseudoporphyria, c...
Name: tokenized_text, dtype: object


In [22]:
from gensim.models import Word2Vec

# Train Word2Vec model (CBOW architecture)
w2v_model = Word2Vec(sentences=df["tokenized_text"], vector_size=100, window=5, min_count=2, workers=4)

# Save the model for later use
w2v_model.save("word2vec_model.bin")
print("Word2Vec Model Trained and Saved Successfully!")


Word2Vec Model Trained and Saved Successfully!


In [23]:
import numpy as np

# Function to get Word2Vec vectors
def get_w2v_embedding(text):
    words = text.split()
    word_vectors = [w2v_model.wv[word] for word in words if word in w2v_model.wv]  # Get vectors for each word
    if len(word_vectors) == 0:
        return np.zeros(100)  # Return a zero vector if no word matches
    return np.mean(word_vectors, axis=0)  # Compute average vector for the sentence

# Apply function to dataset
df["word2vec_embedding"] = df["final_clean_text"].apply(get_w2v_embedding)


In [24]:
# Convert list of arrays into a NumPy matrix
w2v_matrix = np.vstack(df["word2vec_embedding"].values)

# Print shape of the embeddings matrix
print("Shape of Word2Vec Embeddings Matrix:", w2v_matrix.shape)
np.save("word2vec_embeddings.npy", w2v_matrix)  # Save embeddings as NumPy file
df.to_csv("dataset_with_word2vec.csv", index=False)  # Save dataset with embeddings
print("Word2Vec Embeddings Saved Successfully!")
print(df.head())


Shape of Word2Vec Embeddings Matrix: (23516, 100)
Word2Vec Embeddings Saved Successfully!
                                                text  label  \
0      Intravenous azithromycin-induced ototoxicity.      1   
1  Immobilization, while Paget's bone disease was...      1   
2  Unaccountable severe hypercalcemia in a patien...      1   
3  METHODS: We report two cases of pseudoporphyri...      1   
4  METHODS: We report two cases of pseudoporphyri...      1   

                                     lowercased_text  \
0      intravenous azithromycin-induced ototoxicity.   
1  immobilization, while paget's bone disease was...   
2  unaccountable severe hypercalcemia in a patien...   
3  methods: we report two cases of pseudoporphyri...   
4  methods: we report two cases of pseudoporphyri...   

                                        cleaned_text  \
0        intravenous azithromycininduced ototoxicity   
1  immobilization while pagets bone disease was p...   
2  unaccountable severe hy

**Machine Learning Models **

In [25]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [32]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["final_clean_text"], df["label"], test_size=0.3, random_state=42)

# Create and fit the TF-IDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Unigram + Bigram
X_train_tfidf = vectorizer.fit_transform(X_train) # Fit on the text data
X_test_tfidf = vectorizer.transform(X_test)       # Transform the test data

**Using TF-IDF**

In [35]:
ml_results_tfidf= []
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    ml_results_tfidf.append([name, accuracy, precision, recall, f1])



Parameters: { "use_label_encoder" } are not used.



In [37]:
df_results_tfidf = pd.DataFrame(ml_results_tfidf, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Printing the DataFrame
print(df_results_tfidf)


                 Model  Accuracy  Precision    Recall  F1 Score
0  Logistic Regression  0.838412   0.811888  0.571358  0.670711
1          Naive Bayes  0.826931   0.964490  0.414370  0.579690
2                  SVM  0.884763   0.897586  0.677165  0.771950
3        Decision Tree  0.819277   0.674023  0.721457  0.696934
4        Random Forest  0.870588   0.933385  0.593012  0.725248
5              XGBoost  0.834444   0.835404  0.529528  0.648193


**Using bag of words**

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 2)) # Unigram and Bigram
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [40]:
ml_results_bow = []
for name, model in models.items():
    model.fit(X_train_bow, y_train)
    y_pred = model.predict(X_test_bow)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    ml_results_bow.append([name, accuracy, precision, recall, f1])


Parameters: { "use_label_encoder" } are not used.



In [41]:
df_results_bow = pd.DataFrame(ml_results_bow, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Printing the DataFrame
print(df_results_bow)

                 Model  Accuracy  Precision    Recall  F1 Score
0  Logistic Regression  0.888448   0.859619  0.732283  0.790858
1          Naive Bayes  0.867612   0.765988  0.778051  0.771973
2                  SVM  0.877108   0.909923  0.636319  0.748914
3        Decision Tree  0.841531   0.736054  0.701280  0.718246
4        Random Forest  0.867186   0.955870  0.564961  0.710176
5              XGBoost  0.827782   0.811594  0.523622  0.636554


**DL Models**

In [42]:
pip install transformers datasets torch scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [43]:
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [50]:
df["label"] = df["label"].astype(int)

# Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["final_clean_text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize text data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

In [51]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Convert to PyTorch dataset
train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)


In [52]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(set(df["label"])))


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
training_args = TrainingArguments(
    output_dir="./bert_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=500,
    load_best_model_at_end=True,
    report_to="none"  # ✅ This disables wandb tracking
)





In [54]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=lambda p: {
        "accuracy": accuracy_score(p.label_ids, np.argmax(p.predictions, axis=1)),
        "precision": precision_score(p.label_ids, np.argmax(p.predictions, axis=1), average="weighted"),
        "recall": recall_score(p.label_ids, np.argmax(p.predictions, axis=1), average="weighted"),
        "f1": f1_score(p.label_ids, np.argmax(p.predictions, axis=1), average="weighted"),
    },
)

In [55]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3385,0.276479,0.915391,0.916289,0.915391,0.915757
2,0.1988,0.249439,0.935587,0.938768,0.935587,0.936389
3,0.1185,0.268155,0.94324,0.944005,0.94324,0.943513


TrainOutput(global_step=7056, training_loss=0.2446178362753386, metrics={'train_runtime': 1493.0613, 'train_samples_per_second': 37.799, 'train_steps_per_second': 4.726, 'total_flos': 3219202817876880.0, 'train_loss': 0.2446178362753386, 'epoch': 3.0})

In [56]:
preds_output = trainer.predict(test_dataset)
y_pred = np.argmax(preds_output.predictions, axis=1)

# Compute metrics
accuracy = accuracy_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred, average="weighted")
recall = recall_score(test_labels, y_pred, average="weighted")
f1 = f1_score(test_labels, y_pred, average="weighted")

# Store and display results
results_df = pd.DataFrame([["BERT", accuracy, precision, recall, f1]],
                          columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score"])

from IPython.display import display
display(results_df)

# Save results
results_df.to_csv("bert_model_results.csv", index=False)
print("Results saved to bert_model_results.csv")

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,BERT,0.935587,0.938768,0.935587,0.936389


Results saved to bert_model_results.csv
