**2 Naive Bayes Classifier for Email Classification.**

In [49]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Step -1- Load the Data:**

In [50]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Download stopwords
nltk.download('stopwords')

# Load dataset (FIX FOR ParserError)
df = pd.read_csv(
    "/content/spam_ham_dataset.csv",
    engine="python",          # ← REQUIRED
    on_bad_lines="skip",      # ← REQUIRED
    index_col=0
)

# Create label_num if missing
if 'label_num' not in df.columns:
    if 'label' in df.columns:
        df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})
    elif 'Category' in df.columns:
        df['label_num'] = df['Category'].map({'ham': 0, 'spam': 1})
    elif 'v1' in df.columns:
        df['label_num'] = df['v1'].map({'ham': 0, 'spam': 1})
    else:
        raise ValueError("No valid label column found")

# Text Cleaning
length = len(df['text'])

corpus = []
ps = PorterStemmer()
all_stopwords = stopwords.words('english')

for i in range(length):
    text = str(df['text'].iloc[i])        # bytes-safe
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if word not in set(all_stopwords)]
    text = ' '.join(text)
    corpus.append(text)

# Verify & remove "subject"
data_check = df.copy()
data_check['cleanText'] = corpus
data_check['cleanText'] = data_check['cleanText'].str.replace(
    'subject', '', regex=False
)

# Feature & label
x = data_check['cleanText'].values
y = data_check['label_num'].values

# Count Vectorization
cv = CountVectorizer()
x = cv.fit_transform(x).toarray()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Naive Bayes Model
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.9755154639175257
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      1121
           1       0.95      0.96      0.96       431

    accuracy                           0.98      1552
   macro avg       0.97      0.97      0.97      1552
weighted avg       0.98      0.98      0.98      1552



**PART A — Naive Bayes (IMDB Sentiment Analysis)**

**1. Load & Preprocess IMDB Dataset**

In [51]:
# 2.2 Sentiment Analysis – IMDB Movie Review Dataset
# Import Libraries
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Download NLTK Stopwords
nltk.download('stopwords')

# Load Dataset (ParserError FIXED)
df = pd.read_csv(
    "IMDB Dataset.csv",
    engine="python",        # required for broken quotes
    on_bad_lines="skip"     # skip malformed rows
)

# Encode Target Labels
df['sentiment'] = df['sentiment'].map({
    'positive': 1,
    'negative': 0
})

# Text Preprocessing
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = str(text).lower()                  # lowercase
    text = re.sub('[^a-zA-Z]', ' ', text)     # remove punctuation
    words = text.split()                      # tokenize
    words = [ps.stem(w) for w in words if w not in stop_words]  # stopwords + stemming
    return ' '.join(words)

df['clean_review'] = df['review'].apply(preprocess_text)

# Train-Test Split (80 / 20)
X = df['clean_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Bag of Words (CountVectorizer)
cv = CountVectorizer(max_features=5000)

X_train_vec = cv.fit_transform(X_train)
X_test_vec = cv.transform(X_test)

# Naive Bayes Model
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# Prediction & Evaluation
y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.8494

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.85      0.85      4961
           1       0.85      0.84      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000


Confusion Matrix:
 [[4239  722]
 [ 784 4255]]


**PART B — Feature Selection (Wrapper Method: RFE)**

**1. Load Breast Cancer Dataset**

In [58]:
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names


**2. Train-Test Split**

In [59]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


**3. Recursive Feature Elimination (Top 5 Features)**

In [60]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=model, n_features_to_select=5)

rfe.fit(X_train_scaled, y_train)

selected_features = feature_names[rfe.support_]
print("Selected Features:", selected_features)


Selected Features: ['radius error' 'worst radius' 'worst texture' 'worst area'
 'worst concave points']


**4. Train Model with Selected Features**

In [61]:
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

model.fit(X_train_rfe, y_train)
y_pred_rfe = model.predict(X_test_rfe)


**5. Evaluation (Selected vs All Features)**

In [62]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

print("Accuracy:", accuracy_score(y_test, y_pred_rfe))
print("Precision:", precision_score(y_test, y_pred_rfe))
print("Recall:", recall_score(y_test, y_pred_rfe))
print("F1:", f1_score(y_test, y_pred_rfe))
print("ROC-AUC:", roc_auc_score(y_test, model.predict_proba(X_test_rfe)[:,1]))


Accuracy: 0.956140350877193
Precision: 0.9583333333333334
Recall: 0.971830985915493
F1: 0.965034965034965
ROC-AUC: 0.990501146413364


**6. Experiment: Different Feature Counts**

In [63]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

for k in [3, 7]:
    rfe = RFE(
        estimator=LogisticRegression(max_iter=2000),
        n_features_to_select=k
    )

    rfe.fit(X_train_scaled, y_train)

    X_train_k = rfe.transform(X_train_scaled)
    X_test_k = rfe.transform(X_test_scaled)

    model = LogisticRegression(max_iter=2000)
    model.fit(X_train_k, y_train)

    y_pred_k = model.predict(X_test_k)

    print(f"Top {k} Features Accuracy:", accuracy_score(y_test, y_pred_k))


Top 3 Features Accuracy: 0.9649122807017544
Top 7 Features Accuracy: 0.9736842105263158
