# Assignment 2

### Naive Bayes Classifier

In [24]:
# Imports for Assignment 2
import pandas as pd
import numpy as np
import re
import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [25]:
# Fetch the NLTK tokenizer resources needed before word_tokenize (used in clean_text) is called.
# NOTE(review): 'punkt_tab' is presumably what newer NLTK releases look up instead of
# 'punkt' — confirm against the installed NLTK version; downloading both is harmless.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rony2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rony2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [26]:
# Read the training set and the test set from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [27]:
# Data Preprocessing
def clean_text(text):
    """Normalise one description into space-separated lowercase word tokens.

    Args:
        text: Raw description (a scalar). ``None``/``NaN`` -- what pandas
            yields for an empty CSV cell -- is treated as an empty document;
            any other non-string value is converted with ``str()`` first.

    Returns:
        str: The lowercased text with every character other than ``a-z`` and
        whitespace removed, tokenised with ``word_tokenize`` and re-joined
        with single spaces. Missing input yields ``""``.
    """
    if not isinstance(text, str):
        # A missing cell reaches Series.apply() as a float NaN, which has no
        # .lower(); map it to an empty document instead of crashing.
        if pd.isna(text):
            return ""
        text = str(text)
    text = text.lower()
    # Digits and punctuation are deleted outright (not replaced by a space),
    # so e.g. "e-mail" collapses to "email".
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = word_tokenize(text)
    return " ".join(tokens)

In [28]:
# Add a normalised 'clean' column (clean_text applied to 'Description') to both frames.
for frame in (train_df, test_df):
    frame['clean'] = frame['Description'].apply(clean_text)

In [29]:
# Baseline features: binary bag-of-words over the cleaned descriptions
# (binary=True records presence/absence of a term rather than its count).
y = train_df['Class']
train_docs = train_df['clean']
vectorizer = CountVectorizer(binary=True)
X_counts = vectorizer.fit_transform(train_docs)

In [30]:
# Naive Bayes Classifier Implementation
class NaiveBayesClassifier:
    """Multinomial Naive Bayes with additive smoothing.

    For every class the per-feature totals of the training rows are summed,
    ``alpha`` is added to each total, and the result is normalised into a
    log-probability vector. Prediction picks the class maximising
    ``log_prior + sum(feature_value * log_likelihood)``.

    Attributes set by ``fit``:
        classes: sorted array of the distinct labels seen in ``y``.
        log_prior: dict mapping label -> log of the class frequency.
        log_likelihood: dict mapping label -> 1-D array of per-feature
            log-probabilities.
    """

    def __init__(self, alpha=1.0):
        """Store the smoothing constant added to every per-class feature total."""
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate class priors and per-class feature log-likelihoods.

        Args:
            X: 2-D feature matrix of shape (n_docs, n_features); a scipy
                sparse matrix or a NumPy array.
            y: Sequence of n_docs labels (list, array or pandas Series).

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if ``X`` and ``y`` disagree on the number of rows.
        """
        # Work on a plain array so row selection below is purely positional,
        # regardless of any pandas index carried by ``y``.
        y = np.asarray(y)
        n_docs = X.shape[0]
        if y.shape[0] != n_docs:
            raise ValueError(
                f"X has {n_docs} rows but y has {y.shape[0]} labels"
            )
        self.classes = np.unique(y)
        self.log_prior = {}
        self.log_likelihood = {}
        for c in self.classes:
            idx = np.flatnonzero(y == c)
            self.log_prior[c] = np.log(len(idx) / n_docs)
            # .sum(axis=0) yields a 1 x n_features matrix for sparse input and
            # a 1-D array for dense input; asarray + ravel normalises both.
            counts = np.asarray(X[idx].sum(axis=0)).ravel() + self.alpha
            self.log_likelihood[c] = np.log(counts / counts.sum())
        return self

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Args:
            X: 2-D feature matrix with the same number of columns as the
                matrix passed to ``fit``; sparse or dense.

        Returns:
            numpy.ndarray: one label per row. Ties go to the class that comes
            first in ``self.classes``.
        """
        # Stack the per-class vectors in ``self.classes`` order so the column
        # index returned by argmax maps straight back to a label.
        log_lik = np.vstack([self.log_likelihood[c] for c in self.classes])
        log_prior = np.array([self.log_prior[c] for c in self.classes])
        # One (n_docs x n_features) @ (n_features x n_classes) product scores
        # every document against every class without densifying sparse rows.
        scores = np.asarray(X @ log_lik.T) + log_prior
        return self.classes[np.argmax(scores, axis=1)]

In [48]:
# Hold out 20% of the training rows and score the baseline classifier on them.
X_train, X_val, y_train, y_val = train_test_split(
    X_counts, y, test_size=0.2, random_state=10000
)
base_clf = NaiveBayesClassifier(alpha=1.0)
base_clf.fit(X_train, y_train)
base_preds = base_clf.predict(X_val)
baseline_accuracy = accuracy_score(y_val, base_preds)
print("Baseline Accuracy:", baseline_accuracy)
print(classification_report(y_val, base_preds))

Baseline Accuracy: 0.9647727272727272
              precision    recall  f1-score   support

           A       0.97      1.00      0.99       349
           G       1.00      0.87      0.93        46
           S       0.53      0.90      0.67        20
           W       0.99      0.95      0.97       465

    accuracy                           0.96       880
   macro avg       0.87      0.93      0.89       880
weighted avg       0.97      0.96      0.97       880



In [52]:
# Improved model, step 1: unigram + bigram counts with frequency filters
# (max_df=0.8 and min_df=5 prune very common and very rare n-grams).
erange = (1, 2)
vectorizer2 = CountVectorizer(
    ngram_range=erange,
    min_df=5,
    max_df=0.8,
)
X_counts2 = vectorizer2.fit_transform(train_df['clean'])

In [53]:
# Improved model, step 2: re-weight the n-gram counts with TF-IDF.
tfidf = TfidfTransformer().fit(X_counts2)
X_tfidf = tfidf.transform(X_counts2)

In [54]:
# Wrap custom NB in sklearn interface
from sklearn.base import BaseEstimator, ClassifierMixin


# Mixins are listed before BaseEstimator, the order scikit-learn recommends.
class SklearnNB(ClassifierMixin, BaseEstimator):
    """scikit-learn compatible adapter around ``NaiveBayesClassifier``.

    The wrapped model is created inside ``fit`` from the *current* value of
    ``self.alpha``. scikit-learn's ``set_params`` (used by ``GridSearchCV``
    to apply each candidate) only assigns attributes and never re-runs
    ``__init__``, so a model built in the constructor would keep the alpha it
    was constructed with and every grid candidate would train the same model.

    Attributes set by ``fit``:
        model: the fitted ``NaiveBayesClassifier``.
        classes_: the labels seen during ``fit`` (scikit-learn convention).
    """

    def __init__(self, alpha=1.0):
        # Only record the hyperparameter; anything derived from it belongs in fit().
        self.alpha = alpha

    def fit(self, X, y):
        """Train a fresh ``NaiveBayesClassifier`` with ``self.alpha`` and return self."""
        self.model = NaiveBayesClassifier(alpha=self.alpha).fit(X, y)
        self.classes_ = self.model.classes
        return self

    def predict(self, X):
        """Delegate prediction to the fitted model; requires a prior ``fit`` call."""
        return self.model.predict(X)

In [55]:
# Hyperparameter tuning: 3-fold cross-validated accuracy over candidate smoothing values.
param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0]}
grid = GridSearchCV(
    estimator=SklearnNB(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)
grid.fit(X_tfidf, y)
best_alpha = grid.best_params_['alpha']
print("Best smoothing alpha:", best_alpha)

Best smoothing alpha: 0.1


In [61]:
# Score the improved (TF-IDF + bigram) model on a 20% hold-out split.
# NOTE(review): this split uses random_state=777 while the baseline split uses 10000,
# so the two reports are computed on different validation rows; best_alpha was also
# selected on all rows, including the ones held out here.
X_train2, X_val2, y_train2, y_val2 = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=777
)
imp_clf = NaiveBayesClassifier(alpha=best_alpha)
imp_clf.fit(X_train2, y_train2)
imp_preds = imp_clf.predict(X_val2)
improved_accuracy = accuracy_score(y_val2, imp_preds)
print("Improved Accuracy:", improved_accuracy)
print(classification_report(y_val2, imp_preds))

Improved Accuracy: 0.9840909090909091
              precision    recall  f1-score   support

           A       0.98      0.99      0.99       355
           G       1.00      0.97      0.99        39
           S       0.87      0.80      0.83        25
           W       0.99      0.99      0.99       461

    accuracy                           0.98       880
   macro avg       0.96      0.94      0.95       880
weighted avg       0.98      0.98      0.98       880



In [62]:
# Final Training & Kaggle Submission
# Refit the feature pipeline and the classifier on every training row.
full_counts = vectorizer2.fit_transform(train_df['clean'])
full_tfidf = tfidf.fit(full_counts).transform(full_counts)
final_clf = NaiveBayesClassifier(alpha=best_alpha)
final_clf.fit(full_tfidf, y)

<__main__.NaiveBayesClassifier at 0x2e8818788d0>

In [63]:
# Push the test descriptions through the already-fitted vectorizer and TF-IDF
# transformer (transform only, no refit), then classify them.
X_test_counts = vectorizer2.transform(test_df['clean'])
X_test_tfidf = tfidf.transform(X_test_counts)
test_preds = final_clf.predict(X_test_tfidf)

In [65]:
# Prepare submission file: automatically detect ID column
print("Available columns in test_df: ", test_df.columns.tolist())
# Resolution order for the identifier column:
#   1. an exact 'id' or 'ID' column (kept first to preserve the original precedence),
#   2. any column whose name equals "id" ignoring case (e.g. 'Id'),
#   3. the first column, as a last resort.
# Step 2 is needed because the exact-name check alone misses 'Id' and would
# silently depend on column order.
id_col = next((name for name in ('id', 'ID') if name in test_df.columns), None)
if id_col is None:
    id_col = next(
        (col for col in test_df.columns if str(col).lower() == 'id'),
        test_df.columns[0],
    )

submission = pd.DataFrame({
    id_col: test_df[id_col],
    'Class': test_preds
})
submission.to_csv('submission.csv', index=False)
print(f"Submission file created using '{id_col}' as the identifier column.")

Available columns in test_df:  ['Id', 'Description', 'clean']
Submission file created using 'Id' as the identifier column.
