# Efficient Multi-class Text Classification Project

This notebook implements a complete pipeline for multi-class text classification including:
1. **Data Handling**: Merging 5 training files, cleaning duplicates/missing values.
2. **Preprocessing**: Modular and vectorized text cleaning.
3. **Representations**: BoW, TF-IDF, Word2Vec Skip-gram, and (optionally) GloVe.
4. **ML Models**: Logistic Regression (with hyperparameter tuning), Naive Bayes, and Random Forest.
5. **NN Models**: DNN, RNN, GRU, LSTM, Bidirectional architectures.
6. **Evaluation**: Metrics and Visualization.
7. **Deployment Readiness**: Saving all artifacts for the Flask app.

In [None]:
import os
import glob
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, SimpleRNN, Dense, Dropout, Bidirectional, GlobalAveragePooling1D
from tensorflow.keras.callbacks import EarlyStopping

from text_pipeline import (
    load_and_merge_data, preprocess_dataframe, build_tokenizer, 
    get_glove_embeddings, train_word2vec_skipgram, build_w2v_matrix, 
    save_artifact, load_artifact
)

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Seed NumPy and TensorFlow from one shared constant for reproducibility.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Artifacts are written under ./models; create the directory if it is missing.
os.makedirs('models', exist_ok=True)

## 1. Data Handling & Preprocessing

In [None]:
# In glob syntax "[Training]" is a character class (any ONE of T, r, a, i, n, g),
# so the unescaped pattern 'Datasets/*[Training]*.csv' matches nearly every CSV,
# including the test file, which would leak test rows into the training set.
# glob.escape() turns the brackets into literals so only "...[Training]..." files match.
# NOTE(review): assumes load_and_merge_data expands train_glob with glob semantics - confirm.
train_glob = f"Datasets/*{glob.escape('[Training]')}*.csv"
# NOTE(review): presumably consumed as a literal path (not a pattern), so its
# brackets are left unescaped - verify against text_pipeline.
test_path = 'Datasets/[Updated] Question Answer Classification Dataset[Test] (1).csv'

train_df, test_df = load_and_merge_data(train_glob, test_path)
print(f"Total Training Samples: {len(train_df)}")
print(f"Total Test Samples: {len(test_df)}")
train_df.head()

In [None]:
def _clean_split(frame, banner):
    """Print ``banner``, then return ``preprocess_dataframe(frame)``."""
    print(banner)
    return preprocess_dataframe(frame)


# Training split first, then test, so the progress messages keep their order.
train_df = _clean_split(train_df, "Preprocessing Training Data...")
test_df = _clean_split(test_df, "Preprocessing Test Data...")

# Preview the processed training frame.
train_df.head()

## 2. Feature Engineering
### ML Features: BoW and TF-IDF

In [None]:
max_features = 10000
# Both vectorizers share the same vocabulary cap and use unigrams + bigrams.
vectorizer_options = dict(max_features=max_features, ngram_range=(1, 2))
count_vect = CountVectorizer(**vectorizer_options)
tfidf_vect = TfidfVectorizer(**vectorizer_options)

train_text = train_df['clean_text']
test_text = test_df['clean_text']

# Bag-of-words: fit on the training text only, then project the test text.
X_train_bow = count_vect.fit_transform(train_text)
X_test_bow = count_vect.transform(test_text)
save_artifact(count_vect, 'models/count_vect.pkl')

# TF-IDF: same fit-on-train / transform-test pattern.
X_train_tfidf = tfidf_vect.fit_transform(train_text)
X_test_tfidf = tfidf_vect.transform(test_text)
save_artifact(tfidf_vect, 'models/tfidf_vect.pkl')

y_train, y_test = train_df['label'], test_df['label']

# Integer-encode the labels; the encoder learns its classes from the training labels.
le = LabelEncoder()
le.fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)
save_artifact(le, 'models/label_encoder.pkl')

for feature_name, matrix in (("BoW Shape:", X_train_bow), ("TF-IDF Shape:", X_train_tfidf)):
    print(feature_name, matrix.shape)

### Neural Network Word Representations

In [None]:
max_words = 20000
maxlen = 100

# build_tokenizer returns the tokenizer together with the training sequences.
tokenizer, X_train_seq = build_tokenizer(train_df['clean_text'], num_words=max_words, maxlen=maxlen)

# `pad_sequences` was never imported in this notebook, so the bare name raised
# NameError on a fresh kernel. Reach it through the already-imported `tf` instead.
# NOTE(review): padding/truncating='post' should match whatever build_tokenizer
# applies to the training sequences - confirm in text_pipeline.
X_test_seq = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_df['clean_text']),
    maxlen=maxlen,
    padding='post',
    truncating='post',
)
save_artifact(tokenizer, 'models/tokenizer.pkl')

# 1. Word2Vec Skip-gram
w2v_model = train_word2vec_skipgram(train_df['clean_text'])
embedding_matrix_w2v = build_w2v_matrix(tokenizer.word_index, w2v_model)
np.save('models/embedding_matrix_w2v.npy', embedding_matrix_w2v)

# 2. GloVe (Optional - set glove_path if available)
# glove_path = 'glove.6B.100d.txt'
# embedding_matrix_glove = get_glove_embeddings(tokenizer.word_index, glove_path=glove_path)

## 3. Machine Learning Models
Train and compare Logistic Regression (with `C` tuned via grid search), Multinomial Naive Bayes, and Random Forest, then save the model with the best macro F1.

In [None]:
# One result dict per evaluated classifier is appended here.
ml_results = []


def evaluate_ml(model, X_test, y_test, name):
    """Score a fitted classifier and print a one-line summary.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True labels compared against the predictions.
        name: Display name used in the printed line and the result dict.

    Returns:
        dict with keys ``'name'``, ``'accuracy'``, ``'f1'`` (macro-averaged)
        and ``'model'`` (the estimator that was passed in).
    """
    predictions = model.predict(X_test)
    acc, f1 = (
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions, average='macro'),
    )
    print(f"{name} - Accuracy: {acc:.4f}, Macro F1: {f1:.4f}")
    return dict(name=name, accuracy=acc, f1=f1, model=model)

# NOTE(review): every model below is scored on the test split and the winner is
# chosen by that same score, so the reported best F1 is optimistically biased.
# Consider selecting on a validation split or cross-validation scores instead.

# Saved vectorizer that produced the features each model is trained on. BoW and
# TF-IDF matrices have the same width, so pairing a model with the wrong
# vectorizer at inference time would not raise - it would just predict garbage.
ml_vectorizer_paths = {
    "LogReg-TFIDF": 'models/tfidf_vect.pkl',
    "NaiveBayes-BoW": 'models/count_vect.pkl',
    "RandomForest-TFIDF": 'models/tfidf_vect.pkl',
}

# LogReg Tuning: 3-fold grid search over the regularisation strength C,
# scored by macro F1.
print("Tuning Logistic Regression...")
lr = LogisticRegression(max_iter=500)
param_grid_lr = {'C': [0.1, 1, 10]}
grid_lr = GridSearchCV(lr, param_grid_lr, cv=3, scoring='f1_macro', n_jobs=-1)
grid_lr.fit(X_train_tfidf, y_train)
ml_results.append(evaluate_ml(grid_lr.best_estimator_, X_test_tfidf, y_test, "LogReg-TFIDF"))

# Naive Bayes on raw term counts
print("Training Naive Bayes...")
nb = MultinomialNB()
nb.fit(X_train_bow, y_train)
ml_results.append(evaluate_ml(nb, X_test_bow, y_test, "NaiveBayes-BoW"))

# Random Forest
# random_state pins the forest so re-running this cell gives the same model,
# independent of how much of the global NumPy random stream was consumed before.
print("Training Random Forest...")
rf = RandomForestClassifier(n_estimators=100, max_depth=20, n_jobs=-1, random_state=42)
rf.fit(X_train_tfidf, y_train)
ml_results.append(evaluate_ml(rf, X_test_tfidf, y_test, "RandomForest-TFIDF"))

# Save best ML model (highest macro F1) plus the metadata needed to serve it.
best_ml = max(ml_results, key=lambda x: x['f1'])
save_artifact(best_ml['model'], 'models/best_ml_model.pkl')
best_ml_meta = {
    'name': best_ml['name'],
    'vectorizer_path': ml_vectorizer_paths[best_ml['name']],
}
save_artifact(best_ml_meta, 'models/best_ml_meta.pkl')
print(f"Best ML Model: {best_ml['name']}")

## 4. Neural Network Models

In [None]:
num_classes = len(le.classes_)
vocab_size = len(tokenizer.word_index) + 1
# Must equal the vector size of embedding_matrix_w2v when pretrained weights are used.
embedding_dim = 100

def get_model(arch='lstm', use_embeddings=True):
    """Build and compile a Keras text classifier over padded token sequences.

    Args:
        arch: One of ``'dnn'``, ``'rnn'``, ``'lstm'``, ``'gru'``, ``'bilstm'``.
        use_embeddings: If True and ``embedding_matrix_w2v`` is not None, the
            embedding layer is initialised from that matrix and frozen
            (``trainable=False``); otherwise it is learned from scratch.

    Returns:
        A compiled ``Sequential`` model with a ``num_classes``-way softmax
        output, Adam optimizer and sparse categorical cross-entropy loss.

    Raises:
        ValueError: If ``arch`` is not a supported architecture name.
    """
    supported_archs = ('dnn', 'rnn', 'lstm', 'gru', 'bilstm')
    # Without this check an unknown name silently produced a model with no
    # sequence-reducing layer, i.e. a 3D output that cannot be trained against
    # one label per sample.
    if arch not in supported_archs:
        raise ValueError(f"Unknown arch {arch!r}; expected one of {supported_archs}")

    model = Sequential()
    if use_embeddings and embedding_matrix_w2v is not None:
        model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix_w2v], input_length=maxlen, trainable=False))
    else:
        model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))

    if arch == 'dnn':
        # Average the token embeddings, then a small feed-forward stack.
        model.add(GlobalAveragePooling1D())
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(64, activation='relu'))
    elif arch == 'rnn':
        model.add(SimpleRNN(64))
    elif arch == 'lstm':
        model.add(LSTM(64))
    elif arch == 'gru':
        model.add(GRU(64))
    elif arch == 'bilstm':
        model.add(Bidirectional(LSTM(64)))

    # Shared classification head for every architecture.
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Train a sample LSTM with early stopping.
print("Training LSTM Model...")
nn_model = get_model('lstm')
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Early stopping (with restore_best_weights) picks the epoch that looks best on
# the validation data. Using the test split for that makes the final test
# metrics optimistically biased, so hold out 10% of the training data instead
# and keep the test split untouched until the evaluation section.
X_fit_seq, X_val_seq, y_fit_enc, y_val_enc = train_test_split(
    X_train_seq, y_train_enc, test_size=0.1, random_state=42
)

history = nn_model.fit(X_fit_seq, y_fit_enc,
                       validation_data=(X_val_seq, y_val_enc),
                       epochs=10, batch_size=128, callbacks=[early_stop])

nn_model.save('models/best_nn_model.h5')
print("NN Training Complete.")

## 5. Final Evaluation

In [None]:
# Predictions: take the most probable class index, then map back to label names.
y_pred_nn = np.argmax(nn_model.predict(X_test_seq), axis=1)
y_pred_labels = le.inverse_transform(y_pred_nn)

print("Classification Report (Best NN):")
print(classification_report(y_test, y_pred_labels))

# Confusion Matrix
# Pass labels explicitly: by default the matrix only covers labels that occur
# in y_test or the predictions, so a class missing from both would shrink the
# matrix and misalign it with the le.classes_ tick labels below.
cm = confusion_matrix(y_test, y_pred_labels, labels=le.classes_)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()