# 📊 NLP | Medical Abstract Segmentation

## 🏥 Project Overview
This notebook aims to **reproduce the Kaggle NLP Medical Abstract Segmentation project** using the **PubMed 20k RCT dataset**.

### **🔹 Key Steps**
- Load and inspect the dataset
- Apply **text preprocessing** (lowercasing, link removal, stripping of punctuation and digits, stopword removal)
- Perform **Exploratory Data Analysis (EDA)**
- Train a model for **classifying abstract segments**

## 🚀 Step 1: Load & Inspect the Data

In [None]:
import pandas as pd
import os
import nltk
import string
import re
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Download the NLTK resources this notebook relies on (packages that are
# already up to date are skipped by the downloader).
nltk.download("stopwords")
nltk.download("punkt")
# NLTK >= 3.9 loads the "punkt_tab" resource inside nltk.word_tokenize instead
# of the pickled "punkt" models; without it tokenization raises LookupError.
# Both are requested so the notebook runs on old and new NLTK releases.
nltk.download("punkt_tab")

# Define dataset paths (relative to the directory the notebook is run from)
data_folder = "NLP-Medical-Abstract-Segmentation/data"
train_path = os.path.join(data_folder, "train.csv")
dev_path = os.path.join(data_folder, "dev.csv")
test_path = os.path.join(data_folder, "test.csv")

# Load the three splits into separate DataFrames
df_train = pd.read_csv(train_path)
df_dev = pd.read_csv(dev_path)
df_test = pd.read_csv(test_path)

# Display dataset info
print(f"Train Data Shape: {df_train.shape}")
print(f"Dev Data Shape: {df_dev.shape}")
print(f"Test Data Shape: {df_test.shape}")

# Last expression of the cell: rendered as a table preview of the training split
df_train.head()


In [None]:
# English stopwords as a set so the membership test in clean_text is a hash lookup
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalize one sentence for word-level analysis.

    The text is lowercased, links are removed, every character that is not an
    ASCII letter or whitespace is dropped, the remainder is tokenized with
    NLTK, and English stopwords are filtered out.

    Args:
        text: Raw sentence. Any non-string value (for example the float NaN
            that pandas produces for an empty CSV cell) is treated as empty.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives or the input was not a string.
    """
    # Guard against missing values: calling .lower() on NaN would raise.
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove links BEFORE stripping punctuation. Done the other way round, a
    # URL such as "http://127.0.0.1/" is first reduced to the bare token
    # "http", which the link pattern (it needs at least one character after
    # the prefix) can no longer match, so the residue leaks into the output.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove digits, punctuation and any other non-letter characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize words and remove stopwords
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stop_words]

    return " ".join(words)

# Store the clean_text result for every training sentence in a new column
df_train["cleaned_sentence"] = df_train["sentence"].map(clean_text)

# Preview the raw and cleaned text side by side (first five rows)
df_train.loc[:, ["sentence", "cleaned_sentence"]].head(5)


In [None]:
# Bar chart of training-sentence counts per label, ordered by frequency
# (value_counts sorts descending, so the most common label comes first)
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(
    data=df_train,
    x="target",
    order=df_train["target"].value_counts().index,
    palette="coolwarm",
    ax=ax,
)
ax.set_title("Distribution of Sentence Labels in Train Data")
ax.set_xlabel("Sentence Type")
ax.set_ylabel("Count")
# Tilt the category labels so long names do not overlap
ax.tick_params(axis="x", labelrotation=45)
plt.show()


In [None]:
# Generate word cloud for most common words
# All cleaned training sentences are concatenated into one corpus string.
text = " ".join(df_train["cleaned_sentence"])

# WordCloud positions and colours words using a random generator; pinning
# random_state makes the figure identical on every Restart & Run All.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color="white",
    colormap="coolwarm",
    max_words=200,
    random_state=42,
).generate(text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")  # the axes carry no meaning for an image of words
plt.title("Most Common Words in Medical Abstracts")
plt.show()
