<a href="https://colab.research.google.com/github/mos2025uk-svg/Intelligent-Analysis-of-Pharmaceutical-Product-Descriptions-Using-NLP-and-Deep-Learning/blob/main/Untitled16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Core libraries
import pandas as pd
import numpy as np

# NLP & ML
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Deep Learning
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Transformers
!pip install -q transformers datasets torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Text processing
import re


In [None]:
#Load Dataset (Upload to Colab)
from google.colab import files
uploaded = files.upload()

# files.upload() returns a dict keyed by the name of each uploaded file.
# Read the file that was actually uploaded in this run instead of a hard-coded
# name: a re-upload may be stored under a different name (e.g. "MID (1).xlsx"),
# in which case "MID.xlsx" would silently point at a stale copy.
# Falls back to "MID.xlsx" when nothing was uploaded (file already on disk).
data_path = next(iter(uploaded), "MID.xlsx")
df = pd.read_excel(data_path)

# Keep only the text and target columns; rows missing either one are unusable.
df = df[['description', 'therapeutic_class']].dropna()

print(df.shape)
df.head()


In [None]:
#Text Preprocessing
def clean_text(text):
    """Normalise a free-text description into lowercase, letters-only tokens.

    Steps: lowercase, drop apostrophes (so "crohn's" stays one token), turn
    every other non-letter character into a space, collapse whitespace runs
    and strip the ends.

    Non-letters are replaced by a space rather than deleted so that words
    separated only by punctuation or digits ("pain,fever", "tablet/capsule")
    are not fused into a single bogus token.

    Args:
        text: The raw description. Non-string values (e.g. numeric cells)
            are converted with ``str``; ``None`` and NaN yield ``''``.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values carry no text; do not turn them into "none"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r"['’]", '', text)       # keep possessives/contractions together
    text = re.sub(r'[^a-z\s]', ' ', text)  # separators become spaces, not nothing
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Cleaned copy of every description, stored alongside the raw text.
df['clean_text'] = df['description'].apply(clean_text)

# Map each therapeutic class name to an integer id; `le` is kept so the
# ids can be related back to class names via `le.classes_`.
le = LabelEncoder()
le.fit(df['therapeutic_class'])
df['label'] = le.transform(df['therapeutic_class'])

# 80/20 split, stratified on the label and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)


In [None]:
#Traditional ML Models (TF-IDF)
# Unigram + bigram TF-IDF features, English stop words removed,
# vocabulary capped at 30000 terms.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=30000,
)

# The vocabulary is fitted on the training texts only; the test texts are
# merely transformed with it.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [None]:
# Logistic regression on the TF-IDF features.
lr = LogisticRegression(max_iter=200)
lr.fit(X_train_tfidf, y_train)

# Score on the held-out TF-IDF matrix: overall accuracy, then per-class metrics.
pred_lr = lr.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)
print(classification_report(y_test, pred_lr))


In [None]:
# Multinomial naive Bayes on the same TF-IDF features.
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)

# Held-out accuracy.
pred_nb = nb.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, pred_nb)
print("Naive Bayes Accuracy:", nb_accuracy)


In [None]:
# Linear SVM on the same TF-IDF features.
# LinearSVC's solver is randomised; pin the seed (same value as the data
# split) so the reported accuracy is reproducible across runs.
svm = LinearSVC(random_state=42)
svm.fit(X_train_tfidf, y_train)

# Held-out accuracy.
pred_svm = svm.predict(X_test_tfidf)
print("SVM Accuracy:", accuracy_score(y_test, pred_svm))


In [None]:
#Deep Learning Model (LSTM)
# Single source of truth: the tokenizer's word cap and the embedding's input
# dimension must agree, otherwise token ids can fall outside the embedding table.
vocab_size = 40000
max_len = 200

# Seed Python, NumPy and TensorFlow so weight initialisation and shuffling are
# repeatable (GPU kernels may still introduce small run-to-run differences).
tf.keras.utils.set_random_seed(42)

# Vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad/truncate every sequence to exactly `max_len` token ids.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# An explicit Input replaces Embedding's deprecated `input_length` argument
# and lets the model build immediately, so model.summary() reports real
# output shapes and parameter counts.
model = Sequential([
    tf.keras.Input(shape=(max_len,)),
    Embedding(vocab_size, 128),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax')  # one output per encoded class
])

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()


In [None]:
# Train for 5 epochs in batches of 256, with 10% of the training data
# held out for validation.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=256,
    epochs=5,
    validation_split=0.1,
)

# Evaluate on the held-out test split.
model.evaluate(X_test_pad, y_test)


In [None]:
#Transformer Model (BERT)
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(batch):
    """Tokenise a batch of examples for BERT.

    Args:
        batch: Mapping with a ``'text'`` entry holding the strings to encode
            (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the batch, every row truncated/padded to
        exactly 128 tokens.
    """
    # padding='max_length' rather than padding=True: with a batched map,
    # padding=True pads only to the longest row of each *map* batch, so rows
    # from different map batches can have different lengths and the Trainer's
    # default collator cannot stack them into one tensor.
    return tokenizer_bert(
        batch['text'],
        padding='max_length',
        truncation=True,
        max_length=128,
    )

from datasets import Dataset, DatasetDict

# Build the BERT splits from the same seeded, stratified train/test partition
# used by the TF-IDF and LSTM models. A fresh unseeded random split here would
# evaluate BERT on different rows than every other model and change on each run.
dataset = DatasetDict({
    'train': Dataset.from_dict({'text': X_train.tolist(), 'label': y_train.tolist()}),
    'test': Dataset.from_dict({'text': X_test.tolist(), 'label': y_test.tolist()}),
})

# Tokenise both splits; the result keeps the 'train' / 'test' keys.
dataset = dataset.map(tokenize, batched=True)


In [None]:
import inspect

# BERT with a classification head sized to the number of encoded classes.
model_bert = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(le.classes_)
)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old spelling was later dropped. The install here is
# unpinned, so use whichever name this version accepts.
_eval_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_steps=500,
    save_total_limit=1,
    report_to="none",  # no external experiment trackers / login prompts
    **{_eval_key: "epoch"},  # evaluate at the end of every epoch
)


def compute_metrics(eval_pred):
    """Report accuracy so BERT is comparable with the other models.

    Args:
        eval_pred: The Trainer's evaluation output, unpacked as
            ``(logits, labels)``.

    Returns:
        ``{"accuracy": float}``.
    """
    logits, labels = eval_pred
    return {"accuracy": accuracy_score(labels, np.argmax(logits, axis=-1))}


trainer = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
)

trainer.train()


In [None]:
#Extractive (Simple)
def structured_summary(text, max_sentences=3):
    """Return the first ``max_sentences`` sentences of ``text``.

    Sentences are delimited by the literal ``'. '``. When the text is
    actually truncated, the period consumed by the split is restored so the
    summary ends as a complete sentence rather than mid-punctuation.

    Args:
        text: The description to summarise. Non-string values are converted
            with ``str``; ``None`` and NaN yield ``''``.
        max_sentences: Maximum number of sentences to keep. Values ``<= 0``
            yield ``''``.

    Returns:
        The leading sentences joined by ``'. '``, or the whole text when it
        has no more than ``max_sentences`` sentences.
    """
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    if max_sentences <= 0:
        return ''

    sentences = text.split('. ')
    if len(sentences) <= max_sentences:
        # Nothing to cut: joining the pieces back would reproduce the input.
        return text
    # Every kept piece was followed by '. ' in the source, so the last one
    # lost its period in the split; put it back.
    return '. '.join(sentences[:max_sentences]) + '.'

# Extractive summary of every raw description, stored in a new column.
extractive_summaries = df['description'].apply(structured_summary)
df['summary'] = extractive_summaries

# Show each description next to its summary for the first few rows.
df[['description', 'summary']].head()


In [None]:
#Abstractive (Transformer)
from transformers import pipeline

summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

# Demo on the first description only, cut to its first 1024 characters.
first_description = df['description'].iloc[0]
sample_text = first_description[:1024]

# Sampling is disabled; the output length is bounded by min_length/max_length.
summary = summarizer(
    sample_text,
    min_length=30,
    max_length=80,
    do_sample=False,
)
summary


In [None]:
#Model Comparison
# Index 1 of evaluate()'s result is taken as the LSTM's accuracy.
lstm_accuracy = model.evaluate(X_test_pad, y_test, verbose=0)[1]

# Accuracy of the three TF-IDF models, in the same order as the names below.
classical_accuracies = [
    accuracy_score(y_test, predictions)
    for predictions in (pred_lr, pred_nb, pred_svm)
]

results = {
    "Model": ["Logistic Regression", "Naive Bayes", "SVM", "LSTM"],
    "Accuracy": classical_accuracies + [lstm_accuracy],
}

pd.DataFrame(results)
