# SvaraAI Reply Classification Pipeline

## Part A: ML/NLP Pipeline Implementation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import re
import string
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, pipeline
)
import torch
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Asus\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\Asus\AppData\Roaming\Python\Python312\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Asus\AppData\Roaming\Python\Python312\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



ImportError: numpy.core.multiarray failed to import

In [1]:
# Pin the global RNG state of both PyTorch and NumPy to one shared value so
# that a fresh "Restart & Run All" produces the same results every time.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

NameError: name 'np' is not defined

In [None]:
class ReplyClassificationPipeline:
    """Reply-classification workflow: TF-IDF baselines plus a fine-tuned DistilBERT.

    Typical call order is ``load_and_preprocess_data`` ->
    ``train_baseline_model`` -> ``fine_tune_transformer`` ->
    ``evaluate_transformer`` -> ``compare_models``.  Fitted artefacts are kept
    on the instance so later steps can reuse them.
    """

    # Dataset read when ``load_and_preprocess_data`` is called without a path.
    DEFAULT_DATA_PATH = "K:\\Internships\\Svara_Ai\\reply_classification_dataset.csv"

    def __init__(self):
        self.vectorizer = None         # TfidfVectorizer, set by train_baseline_model
        self.label_encoder = None      # LabelEncoder, set by load_and_preprocess_data
        self.baseline_model = None     # best of LR / LightGBM, set by train_baseline_model
        self.transformer_model = None  # inference pipeline, set by fine_tune_transformer
        self.tokenizer = None          # tokenizer, set by fine_tune_transformer

    def load_and_preprocess_data(self, file_path=None):
        """Load the CSV, clean the text column and integer-encode the labels.

        Args:
            file_path: Path of a CSV file with ``text`` and ``label`` columns.
                When ``None`` the class-level ``DEFAULT_DATA_PATH`` is used.

        Returns:
            DataFrame with the original columns plus ``cleaned_text`` and
            ``label_encoded``.  Rows with a missing text or label, and rows
            whose text is empty after cleaning, are removed.
        """
        # Honour the caller's path; previously the argument was ignored.
        csv_path = self.DEFAULT_DATA_PATH if file_path is None else file_path
        df = pd.read_csv(csv_path)

        print(f"Dataset shape: {df.shape}")
        print(f"Label distribution:\n{df['label'].value_counts()}")

        # Drop missing values BEFORE cleaning: clean_text maps NaN to "", so a
        # dropna on the cleaned column can never remove anything.
        df = df.dropna(subset=['text', 'label']).copy()

        # Text preprocessing
        df['cleaned_text'] = df['text'].apply(self.clean_text)

        # Texts that consisted only of URLs / e-mails / whitespace are now empty
        # and carry no signal for any of the models.
        df = df[df['cleaned_text'] != ''].copy()

        # Encode labels
        self.label_encoder = LabelEncoder()
        df['label_encoded'] = self.label_encoder.fit_transform(df['label'])

        return df

    def clean_text(self, text):
        """Normalise one raw reply.

        Lower-cases the text, strips URLs and e-mail addresses and collapses
        runs of whitespace.  Punctuation is deliberately kept.

        Args:
            text: Raw value from the dataset; may be missing (None / NaN).

        Returns:
            The cleaned string, or ``""`` for a missing value.
        """
        if pd.isna(text):
            return ""

        # Convert to lowercase
        text = str(text).lower()

        # Remove URLs.  The dot after "www" is escaped so that ordinary words
        # which merely contain "www" are not mistaken for a URL.
        text = re.sub(r'http\S+|www\.\S+', '', text)

        # Remove emails
        text = re.sub(r'\S+@\S+', '', text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def train_baseline_model(self, X_train, y_train, X_test, y_test):
        """Train the TF-IDF baselines (Logistic Regression and LightGBM).

        Fits ``self.vectorizer`` on ``X_train`` and stores the model with the
        higher weighted F1 on the test split in ``self.baseline_model``
        (LightGBM wins ties).

        Args:
            X_train, X_test: Iterables of cleaned text.
            y_train, y_test: Encoded labels aligned with the texts.

        Returns:
            ``(results, best_baseline)`` where ``results`` maps
            ``'logistic_regression'`` / ``'lightgbm'`` to a dict with keys
            ``model``, ``accuracy``, ``f1_score`` and ``predictions``, and
            ``best_baseline`` is the key of the selected model.
        """
        # TF-IDF Vectorization
        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 2),
            stop_words='english'
        )

        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        X_test_tfidf = self.vectorizer.transform(X_test)

        candidates = {
            'logistic_regression': LogisticRegression(random_state=42, max_iter=1000),
            'lightgbm': lgb.LGBMClassifier(random_state=42, verbose=-1),
        }

        results = {}
        for name, model in candidates.items():
            model.fit(X_train_tfidf, y_train)
            predicted = model.predict(X_test_tfidf)
            results[name] = {
                'model': model,
                'accuracy': accuracy_score(y_test, predicted),
                'f1_score': f1_score(y_test, predicted, average='weighted'),
                'predictions': predicted
            }

        # Choose best baseline model (Logistic Regression only on a strict win)
        if results['logistic_regression']['f1_score'] > results['lightgbm']['f1_score']:
            best_baseline = 'logistic_regression'
        else:
            best_baseline = 'lightgbm'
        self.baseline_model = results[best_baseline]['model']

        print("Baseline Model Results:")
        print(f"Logistic Regression - Accuracy: {results['logistic_regression']['accuracy']:.4f}, F1: {results['logistic_regression']['f1_score']:.4f}")
        print(f"LightGBM - Accuracy: {results['lightgbm']['accuracy']:.4f}, F1: {results['lightgbm']['f1_score']:.4f}")
        print(f"Best baseline model: {best_baseline}")

        return results, best_baseline

    def prepare_transformer_data(self, texts, labels):
        """Wrap parallel lists of texts and labels in a ``datasets.Dataset``.

        The columns are named ``text`` and ``labels``.
        """
        dataset = Dataset.from_dict({
            'text': texts,
            'labels': labels
        })
        return dataset

    def tokenize_function(self, examples):
        """Tokenize a batch of examples with ``self.tokenizer``.

        Every sequence is truncated and padded to exactly 128 tokens.  Padding
        to a fixed length (instead of ``padding=True``) matters because
        ``Dataset.map(batched=True)`` tokenizes in chunks: padding to the
        longest item of each chunk yields rows of different lengths, which the
        Trainer's default collator cannot stack into one tensor.

        Args:
            examples: Mapping with a ``text`` entry (a batch from ``Dataset.map``).

        Returns:
            The tokenizer output for the batch.
        """
        return self.tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=128
        )

    def fine_tune_transformer(self, train_texts, train_labels, val_texts, val_labels):
        """Fine-tune DistilBERT for classification.

        Trains for three epochs, evaluates on the validation data after each
        epoch, reloads the best checkpoint, saves it to ``./distilbert_model``
        and stores an inference pipeline in ``self.transformer_model``.

        The validation data drives checkpoint selection, so it should be a
        split that is not also used for the final test metrics.

        Args:
            train_texts, val_texts: Lists of cleaned text.
            train_labels, val_labels: Lists of integer class ids.

        Returns:
            The ``Trainer`` used for fine-tuning.
        """
        # Local imports: `inspect` is only needed here, and importing the
        # Hugging Face factory under a private alias keeps this method working
        # even if notebook code rebinds the global name `pipeline`.
        import inspect
        from transformers import pipeline as hf_pipeline

        model_name = "distilbert-base-uncased"
        num_labels = len(np.unique(train_labels))

        # Initialize tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

        # Prepare datasets
        train_dataset = self.prepare_transformer_data(train_texts, train_labels)
        val_dataset = self.prepare_transformer_data(val_texts, val_labels)

        # Tokenize
        train_dataset = train_dataset.map(self.tokenize_function, batched=True)
        val_dataset = val_dataset.map(self.tokenize_function, batched=True)

        # `evaluation_strategy` was renamed to `eval_strategy` in newer
        # transformers releases; use whichever keyword this install accepts.
        accepted = inspect.signature(TrainingArguments.__init__).parameters
        strategy_kwarg = 'eval_strategy' if 'eval_strategy' in accepted else 'evaluation_strategy'

        # Training arguments
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            warmup_steps=100,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
            save_strategy="epoch",
            load_best_model_at_end=True,
            **{strategy_kwarg: "epoch"}
        )

        # Initialize trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )

        # Train model
        print("Training DistilBERT model...")
        trainer.train()

        # Save model
        trainer.save_model('./distilbert_model')

        # Create pipeline for inference; top_k=None returns the score of every
        # class (the non-deprecated spelling of return_all_scores=True).
        self.transformer_model = hf_pipeline(
            "text-classification",
            model='./distilbert_model',
            tokenizer=self.tokenizer,
            top_k=None
        )

        return trainer

    def _label_name_to_id(self, label):
        """Map a pipeline label string such as ``'LABEL_2'`` to its integer id.

        Falls back to ``model.config.label2id`` of the stored pipeline when the
        label does not follow the ``LABEL_<n>`` pattern.

        Raises:
            ValueError: if the label cannot be mapped either way.
        """
        numbered = re.fullmatch(r'LABEL_(\d+)', str(label))
        if numbered:
            return int(numbered.group(1))

        config = getattr(getattr(self.transformer_model, 'model', None), 'config', None)
        label2id = getattr(config, 'label2id', None) or {}
        if label in label2id:
            return int(label2id[label])

        raise ValueError(f"Cannot map transformer label {label!r} to a class id")

    def _predict_label_id(self, text):
        """Return the integer class id with the highest score for one text."""
        scores = self.transformer_model(text)

        # A single-string call can come back as [[{...}, ...]] (one inner list
        # per input) or as a flat [{...}, ...]; a lone dict is also tolerated.
        if isinstance(scores, dict):
            scores = [scores]
        elif scores and isinstance(scores[0], list):
            scores = scores[0]

        top = max(scores, key=lambda entry: entry['score'])
        return self._label_name_to_id(top['label'])

    def evaluate_transformer(self, test_texts, test_labels):
        """Evaluate the fine-tuned transformer on held-out texts.

        Args:
            test_texts: Iterable of cleaned text.
            test_labels: Integer class ids aligned with ``test_texts``.

        Returns:
            Dict with ``accuracy``, weighted ``f1_score`` and the list of
            predicted class ids under ``predictions``.
        """
        predictions = [self._predict_label_id(text) for text in test_texts]

        accuracy = accuracy_score(test_labels, predictions)
        f1 = f1_score(test_labels, predictions, average='weighted')

        print(f"DistilBERT - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")

        return {
            'accuracy': accuracy,
            'f1_score': f1,
            'predictions': predictions
        }

    def compare_models(self, baseline_results, transformer_results, test_labels):
        """Compare all models and print a production recommendation.

        Args:
            baseline_results: ``results`` dict from ``train_baseline_model``.
            transformer_results: Dict returned by ``evaluate_transformer``.
            test_labels: Accepted for interface compatibility; not used here.

        Returns:
            ``(best_model, comparison_df)``: the display name of the model
            with the highest F1 score and a DataFrame with columns ``Model``,
            ``Accuracy`` and ``F1 Score``.
        """
        print("\n" + "="*50)
        print("MODEL PERFORMANCE COMPARISON")
        print("="*50)

        models_comparison = {
            'Logistic Regression': baseline_results['logistic_regression'],
            'LightGBM': baseline_results['lightgbm'],
            'DistilBERT': transformer_results
        }

        # Create comparison DataFrame
        comparison_df = pd.DataFrame({
            'Model': list(models_comparison.keys()),
            'Accuracy': [results['accuracy'] for results in models_comparison.values()],
            'F1 Score': [results['f1_score'] for results in models_comparison.values()]
        })

        print(comparison_df.to_string())

        # Recommendation logic: highest F1 wins (first row wins a tie)
        best_model = comparison_df.loc[comparison_df['F1 Score'].idxmax(), 'Model']

        print(f"\nPRODUCTION RECOMMENDATION: {best_model}")

        # Production considerations
        print("\nKEY PRODUCTION CONSIDERATIONS:")
        if best_model in ['Logistic Regression', 'LightGBM']:
            print(f"Based on the evaluation, {best_model} is recommended due to:")
            print("  - Fast inference speed")
            print("  - Lower computational requirements")
            print("  - High interpretability")
            print("  - Ease of deployment and maintenance")
        else:
            print(f"Based on the evaluation, {best_model} is recommended due to:")
            print("  - Superior performance on complex text patterns")
            print("  - Enhanced contextual understanding")
            print("  - Robustness to varied text formats")
            print("  - The performance gain justifies the additional computational cost")

        return best_model, comparison_df

## Main execution

In [None]:

if __name__ == "__main__":
    # Initialize pipeline.
    # The instance must NOT be bound to the name `pipeline`: in a notebook this
    # cell shares globals with the import cell, so that would overwrite
    # `transformers.pipeline`, and fine_tune_transformer would then try to call
    # the ReplyClassificationPipeline instance ("object is not callable").
    reply_pipeline = ReplyClassificationPipeline()

    # Load and preprocess data
    print("Loading and preprocessing data...")
    df = reply_pipeline.load_and_preprocess_data()

    # Split data
    X = df['cleaned_text']
    y = df['label_encoded']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Train baseline models
    print("\nTraining baseline models...")
    baseline_results, best_baseline = reply_pipeline.train_baseline_model(
        X_train, y_train, X_test, y_test
    )

    # Carve a validation split out of the training data for DistilBERT.
    # fine_tune_transformer reloads the best checkpoint according to its
    # validation set, so passing the test set there would leak test
    # information into the reported DistilBERT score.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
    )

    # Fine-tune transformer
    print("\nFine-tuning DistilBERT...")
    trainer = reply_pipeline.fine_tune_transformer(
        X_fit.tolist(), y_fit.tolist(),
        X_val.tolist(), y_val.tolist()
    )

    # Evaluate transformer on the untouched test split
    print("\nEvaluating DistilBERT...")
    transformer_results = reply_pipeline.evaluate_transformer(
        X_test.tolist(), y_test.tolist()
    )

    # Compare models and make recommendation
    best_model, comparison_df = reply_pipeline.compare_models(
        baseline_results, transformer_results, y_test
    )

    print(f"\n🎯 Pipeline completed successfully!")
    print(f"Best performing model: {best_model}")

    # Save results
    comparison_df.to_csv('model_comparison_results.csv', index=False)
    print("Results saved to 'model_comparison_results.csv'")