# Assignment 2

### Naive Bayes Classifier

In [None]:
# Imports for Assignment 2
import pandas as pd
import numpy as np
import re
import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [9]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rony2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rony2\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [3]:
# Loading Data
train_df = pd.read_csv('train.csv')
test_df  = pd.read_csv('test.csv')

In [4]:
# Data Preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = word_tokenize(text)
    return " ".join(tokens)

In [16]:
train_df['clean'] = train_df['Description'].apply(clean_text)
test_df ['clean'] = test_df ['Description'].apply(clean_text)

In [17]:
# Basic Feature Extraction (Baseline)
vectorizer = CountVectorizer(binary=True)
X_counts = vectorizer.fit_transform(train_df['clean'])
y = train_df['Class']

In [18]:
# Naive Bayes Classifier Code
class NaiveBayesClassifier:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
    def fit(self, X, y):
        n_docs, n_feats = X.shape
        self.classes = np.unique(y)
        self.log_prior = {}
        self.log_likelihood = {}
        for c in self.classes:
            X_c = X[y == c]
            # Prior
            self.log_prior[c] = np.log(X_c.shape[0] / n_docs)
            # Likelihood with smoothing
            counts = np.array(X_c.sum(axis=0)).flatten() + self.alpha
            total = counts.sum()
            self.log_likelihood[c] = np.log(counts / total)
        self.vocab_size = n_feats
    def predict(self, X):
        results = []
        for i in range(X.shape[0]):
            row = X[i].toarray().flatten()
            scores = {c: self.log_prior[c] + (row * self.log_likelihood[c]).sum()
                      for c in self.classes}
            # choose best
            results.append(max(scores, key=scores.get))
        return np.array(results)

In [19]:
# 5. Validate Baseline
X_train, X_val, y_train, y_val = train_test_split(X_counts, y, test_size=0.2, random_state=42)
base_clf = NaiveBayesClassifier(alpha=1.0)
base_clf.fit(X_train, y_train)
base_preds = base_clf.predict(X_val)
print("Baseline Accuracy:", accuracy_score(y_val, base_preds))
print(classification_report(y_val, base_preds))

AttributeError: 'Series' object has no attribute 'nonzero'

In [None]:
# 6. Improved Model with TF-IDF & Bigrams
# Vectorizer with bigrams and TF-IDF
vectorizer2 = CountVectorizer(ngram_range=(1,2), max_df=0.8, min_df=5)
X_counts2 = vectorizer2.fit_transform(train_df['clean'])

In [None]:
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X_counts2)

In [None]:
# Tune alpha
param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0]}
clf = NaiveBayesClassifier()

In [None]:
grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, scoring='accuracy')
# wrap custom classifier to sklearn interface
from sklearn.base import BaseEstimator, ClassifierMixin
class SklearnNB(BaseEstimator, ClassifierMixin):
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.inner = NaiveBayesClassifier(alpha)
    def fit(self, X, y):
        self.inner.fit(X, y)
        return self
    def predict(self, X):
        return self.inner.predict(X)

In [None]:
grid = GridSearchCV(SklearnNB(), param_grid, cv=3, scoring='accuracy')
grid.fit(X_tfidf, y)
print("Best alpha:", grid.best_params_)

In [None]:
# Evaluate Improved Model
best_alpha = grid.best_params_['alpha']
imp_clf = NaiveBayesClassifier(alpha=best_alpha)
imp_clf.fit(X_tfidf[X_train.indices], y_train)  # fit on training partition
imp_preds = imp_clf.predict(X_tfidf[X_val.indices])
print("Improved Accuracy:", accuracy_score(y_val, imp_preds))
print(classification_report(y_val, imp_preds))

In [None]:
# 7. Final Training & Kaggle Submission
# Retrain on full train data
full_counts = vectorizer2.fit_transform(train_df['clean'])
full_tfidf   = tfidf.fit_transform(full_counts)
final_clf = NaiveBayesClassifier(alpha=best_alpha)
final_clf.fit(full_tfidf, y)

In [None]:
# Predict on test
X_test_counts = vectorizer2.transform(test_df['clean'])
X_test_tfidf   = tfidf.transform(X_test_counts)
test_preds = final_clf.predict(X_test_tfidf)

In [None]:
#submission = pd.DataFrame({
#    'id': test_df['id'],
#    'label': test_preds
#})
#submission.to_csv('submission.csv', index=False)