In [9]:
import pandas as pd

# Read the three feature-engineered splits in one pass; the order of the
# paths matches the order of the names on the left-hand side.
(
    df_train_feature_engineered,
    df_test_feature_engineered,
    df_holdout_feature_engineered,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/feature_enginering/train_feature_engineered.csv",
        "dataset/feature_enginering/test_feature_engineered.csv",
        "dataset/feature_enginering/holdout_feature_engineered.csv",
    )
]

# Sanity check: (rows, columns) of train, test and holdout, in that order.
print(
    *(
        frame.shape
        for frame in (
            df_train_feature_engineered,
            df_test_feature_engineered,
            df_holdout_feature_engineered,
        )
    )
)

(8888, 6) (2335, 6) (1167, 6)


In [5]:
from typing import List, Union
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer


def create_tfidf_vectorizer(
    corpus: Union[List[str], pd.Series],
    **kwargs,
) -> TfidfVectorizer:
    """Build a ``TfidfVectorizer`` from ``kwargs`` and fit it on ``corpus``.

    Args:
        corpus: The documents to learn the vocabulary and IDF weights from.
            It is materialised into a plain list before fitting.
        **kwargs: Forwarded unchanged to the ``TfidfVectorizer`` constructor.

    Returns:
        The fitted vectorizer.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    fitted_vectorizer = TfidfVectorizer(**kwargs).fit(list(corpus))
    return fitted_vectorizer


def transform_comment_to_tfidf(
    comments: Union[str, List[str], pd.Series],
    *,
    vectorizer: TfidfVectorizer,
) -> csr_matrix:
    """Vectorize one comment or a batch of comments with a fitted vectorizer.

    Args:
        comments: A single string, a list of strings, or a pandas Series.
        vectorizer: Keyword-only; the object whose ``transform`` is applied.

    Returns:
        Whatever ``vectorizer.transform`` returns for the list of texts.

    Raises:
        TypeError: If ``comments`` is not a str, list or pandas Series.
    """
    # A bare string is treated as a batch containing exactly one document.
    if isinstance(comments, str):
        return vectorizer.transform([comments])

    if not isinstance(comments, (list, pd.Series)):
        raise TypeError("comments must be a str, list[str], or pandas.Series of str")

    return vectorizer.transform(list(comments))

# Example (illustrative):
# vec = create_tfidf_vectorizer(df_train_feature_engineered["comment"],
#                               lowercase=True, ngram_range=(1,2), max_features=50000)
# X = transform_comment_to_tfidf(["contoh komentar"], vectorizer=vec)
# X_single = transform_comment_to_tfidf("komentar tunggal", vectorizer=vec)


In [None]:
# =============================================================================
# IBM PIPELINE: CONFIGURABLE TEXT CLASSIFICATION PIPELINE
# =============================================================================
# This pipeline supports:
# - Configurable vectorizers (TF-IDF, Word2Vec, Sentence Transformers)
# - Configurable models (Logistic Regression, Random Forest, SVM, etc.)
# - Automatic feature engineering
# - SMOTE balancing
# - Model persistence and evaluation

import os
import pickle
import numpy as np
import pandas as pd
import emoji
from abc import ABC, abstractmethod
from typing import Dict, Any, Union, List, Tuple, Optional, Sequence
from dataclasses import dataclass
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION CLASSES
# =============================================================================

@dataclass
class PipelineConfig:
    """All knobs of one pipeline run, grouped by concern.

    Container-typed fields default to ``None`` and are replaced by a fresh
    default object in ``__post_init__`` so that instances never share state.
    """
    # --- Input CSV locations ---
    train_path: str = "dataset/feature_enginering/train_feature_engineered.csv"
    test_path: str = "dataset/feature_enginering/test_feature_engineered.csv"
    holdout_path: str = "dataset/feature_enginering/holdout_feature_engineered.csv"

    # --- Column names ---
    text_column: str = "comment"
    label_column: str = "label"
    numeric_features: List[str] = None  # None -> the four built-in text statistics

    # --- Text vectorizer: "tfidf", "word2vec" or "sentence_transformer" ---
    vectorizer_type: str = "tfidf"
    vectorizer_params: Dict[str, Any] = None  # None -> {}

    # --- Classifier: "logistic_regression", "random_forest" or "svm" ---
    model_type: str = "logistic_regression"
    model_params: Dict[str, Any] = None  # None -> {}

    # --- Class balancing ---
    use_smote: bool = True
    smote_params: Dict[str, Any] = None  # None -> minority oversampling, seed 42

    # --- Artifact output ---
    output_dir: str = "models"
    save_artifacts: bool = True

    def __post_init__(self):
        """Swap every ``None`` container field for a newly built default."""
        default_factories = (
            (
                "numeric_features",
                lambda: ["text_length", "count_word", "num_emoji", "num_foreign_character"],
            ),
            ("vectorizer_params", dict),
            ("model_params", dict),
            (
                "smote_params",
                lambda: {"sampling_strategy": "minority", "random_state": 42},
            ),
        )
        for field_name, build_default in default_factories:
            if getattr(self, field_name) is None:
                setattr(self, field_name, build_default())

# =============================================================================
# FEATURE ENGINEERING MODULE
# =============================================================================

class FeatureEngineer:
    """Derives simple numeric statistics from raw comment text."""

    @staticmethod
    def count_emojis(text: str) -> int:
        """Return the number of entries ``emoji.emoji_list`` finds in ``text``.

        Non-string input (for example ``None`` or ``NaN``) counts as 0.
        """
        return len(emoji.emoji_list(text)) if isinstance(text, str) else 0

    @staticmethod
    def count_foreign_alpha_characters(text: str) -> int:
        """Return how many alphabetic characters lie outside the ASCII range.

        Non-string input counts as 0.
        """
        if not isinstance(text, str):
            return 0
        # A character qualifies when its code point is above 127 and it is a letter.
        return sum(1 for char in text if ord(char) > 127 and char.isalpha())

    @classmethod
    def compute_text_features(cls, text_col) -> Dict[str, Any]:
        """Compute the four text statistics for a string or a pandas Series.

        Returns:
            A dict keyed by ``text_length``, ``count_word``, ``num_emoji`` and
            ``num_foreign_character``. Values are ints for a string input and
            Series for a Series input.

        Raises:
            TypeError: If the input is neither a string nor an object with ``apply``.
        """
        if isinstance(text_col, str):
            return {
                "text_length": len(text_col),
                "count_word": len(text_col.split()),
                "num_emoji": cls.count_emojis(text_col),
                "num_foreign_character": cls.count_foreign_alpha_characters(text_col),
            }

        if not hasattr(text_col, "apply"):
            raise TypeError("Input must be a string or a pandas Series.")

        # Missing comments are treated as empty text.
        filled = text_col.fillna("")
        return {
            "text_length": filled.str.len(),
            "count_word": filled.str.split().str.len(),
            "num_emoji": filled.apply(cls.count_emojis),
            "num_foreign_character": filled.apply(cls.count_foreign_alpha_characters),
        }

    @classmethod
    def add_text_features(cls, df: pd.DataFrame, text_col: str = "comment") -> pd.DataFrame:
        """Return a copy of ``df`` with one new column per computed statistic."""
        enriched = df.copy()
        computed = cls.compute_text_features(enriched[text_col])
        for column_name, column_values in computed.items():
            enriched[column_name] = column_values
        return enriched

# =============================================================================
# VECTORIZER ABSTRACTION
# =============================================================================

class BaseVectorizer(ABC):
    """Interface every text vectorizer used by the pipeline must implement."""

    @abstractmethod
    def fit(self, texts: List[str]) -> 'BaseVectorizer':
        """Learn whatever state is needed from ``texts`` and return ``self``."""

    @abstractmethod
    def transform(self, texts: List[str]) -> np.ndarray:
        """Turn ``texts`` into an array of feature vectors."""

    @abstractmethod
    def save(self, path: str) -> None:
        """Persist the vectorizer to ``path``."""

    @classmethod
    @abstractmethod
    def load(cls, path: str) -> 'BaseVectorizer':
        """Restore a vectorizer previously written to ``path``."""

class TfidfVectorizerWrapper(BaseVectorizer):
    """TF-IDF vectorizer wrapper around ``sklearn``'s ``TfidfVectorizer``."""

    def __init__(self, **params):
        """Create the underlying ``TfidfVectorizer``; ``params`` are forwarded to it."""
        self.vectorizer = TfidfVectorizer(**params)
        self.is_fitted = False

    def fit(self, texts: List[str]) -> 'TfidfVectorizerWrapper':
        """Fit the underlying vectorizer on ``texts`` and return ``self``."""
        self.vectorizer.fit(texts)
        self.is_fitted = True
        return self

    def transform(self, texts: List[str]) -> np.ndarray:
        """Return the TF-IDF matrix of ``texts`` as a dense array.

        Raises:
            ValueError: If ``fit`` has not been called.
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted before transform")
        # The sparse result is densified because the pipeline stacks it with
        # dense numeric features via np.hstack; memory grows with
        # len(texts) * vocabulary size.
        return self.vectorizer.transform(texts).toarray()

    def save(self, path: str) -> None:
        """Pickle this wrapper to ``path``, creating the parent directory if any."""
        # os.path.dirname() is "" for a bare file name, and os.makedirs("")
        # raises FileNotFoundError, so only create a directory when one is named.
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(path, 'wb') as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, path: str) -> 'TfidfVectorizerWrapper':
        """Unpickle a wrapper from ``path``.

        SECURITY: unpickling executes code embedded in the file; only load
        artifacts from a trusted source.
        """
        with open(path, 'rb') as f:
            return pickle.load(f)

class Word2VecVectorizer(BaseVectorizer):
    """Word2Vec vectorizer wrapper: a text becomes the mean of its word vectors."""

    def __init__(self, **params):
        """Remember ``params``; they are forwarded to ``Word2Vec`` in ``fit``."""
        self.params = params
        self.model = None
        self.is_fitted = False

    def _tokenize(self, texts: List[str]) -> List[List[str]]:
        """Lower-case each text and split it on whitespace."""
        return [text.lower().split() for text in texts]

    def fit(self, texts: List[str]) -> 'Word2VecVectorizer':
        """Train a new Word2Vec model on the tokenized ``texts`` and return ``self``."""
        tokenized_texts = self._tokenize(texts)
        self.model = Word2Vec(sentences=tokenized_texts, **self.params)
        self.is_fitted = True
        return self

    def transform(self, texts: List[str]) -> np.ndarray:
        """Return one averaged word vector per text, shape ``(len(texts), vector_size)``.

        Texts that are empty or contain no in-vocabulary token map to a zero
        vector.

        Raises:
            ValueError: If ``fit`` has not been called.
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted before transform")

        word_index = self.model.wv
        dim = word_index.vector_size
        vectors = []
        for tokens in self._tokenize(texts):
            known_vectors = [word_index[word] for word in tokens if word in word_index]
            if known_vectors:
                # Average word vectors for the sentence
                vectors.append(np.mean(known_vectors, axis=0))
            else:
                vectors.append(np.zeros(dim))

        # The reshape is a no-op for non-empty input; for an empty batch it
        # yields (0, dim) instead of (0,), so the result can still be stacked
        # column-wise with other feature blocks.
        return np.array(vectors).reshape(len(vectors), dim)

    def save(self, path: str) -> None:
        """Pickle this wrapper to ``path``, creating the parent directory if any."""
        # os.makedirs("") raises FileNotFoundError, so skip it for bare file names.
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(path, 'wb') as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, path: str) -> 'Word2VecVectorizer':
        """Unpickle a wrapper from ``path``.

        SECURITY: unpickling executes code embedded in the file; only load
        artifacts from a trusted source.
        """
        with open(path, 'rb') as f:
            return pickle.load(f)

class SentenceTransformerVectorizer(BaseVectorizer):
    """Sentence Transformer vectorizer wrapper."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", **params):
        """Remember which model to load.

        Args:
            model_name: Name passed to ``SentenceTransformer`` in ``fit``.
            **params: Accepted for signature compatibility with the other
                vectorizers but currently not used.
        """
        self.model_name = model_name
        self.model = None
        self.is_fitted = False

    def fit(self, texts: List[str]) -> 'SentenceTransformerVectorizer':
        """Instantiate the model named by ``model_name`` and return ``self``.

        ``texts`` is not used here: the model is loaded, not trained.
        """
        self.model = SentenceTransformer(self.model_name)
        self.is_fitted = True
        return self

    def transform(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Raises:
            ValueError: If ``fit`` has not been called.
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted before transform")
        return self.model.encode(texts)

    def save(self, path: str) -> None:
        """Pickle this wrapper (including the loaded model) to ``path``."""
        # os.makedirs("") raises FileNotFoundError, so skip it for bare file names.
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(path, 'wb') as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, path: str) -> 'SentenceTransformerVectorizer':
        """Unpickle a wrapper from ``path``.

        SECURITY: unpickling executes code embedded in the file; only load
        artifacts from a trusted source.
        """
        with open(path, 'rb') as f:
            return pickle.load(f)

# =============================================================================
# MODEL ABSTRACTION
# =============================================================================

class BaseModel(ABC):
    """Interface every classifier used by the pipeline must implement."""

    @abstractmethod
    def fit(self, X: np.ndarray, y: np.ndarray) -> 'BaseModel':
        """Train on features ``X`` with labels ``y`` and return ``self``."""

    @abstractmethod
    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the predicted label for each row of ``X``."""

    @abstractmethod
    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return per-class probabilities for each row of ``X``."""

    @abstractmethod
    def save(self, path: str) -> None:
        """Persist the model to ``path``."""

    @classmethod
    @abstractmethod
    def load(cls, path: str) -> 'BaseModel':
        """Restore a model previously written to ``path``."""

class SklearnModelWrapper(BaseModel):
    """Wrapper that gives any sklearn-style classifier the ``BaseModel`` interface."""

    def __init__(self, model_class, **params):
        """Remember the estimator class and its constructor arguments.

        The estimator itself is only instantiated in ``fit``.
        """
        self.model_class = model_class
        self.params = params
        self.model = None
        self.is_fitted = False

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'SklearnModelWrapper':
        """Instantiate a fresh estimator, train it on ``X``/``y`` and return ``self``."""
        self.model = self.model_class(**self.params)
        self.model.fit(X, y)
        self.is_fitted = True
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the estimator's predictions for ``X``.

        Raises:
            ValueError: If ``fit`` has not been called.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before predict")
        return self.model.predict(X)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return class probabilities, or one-hot hard predictions as a fallback.

        When the estimator exposes ``predict_proba`` its output is returned
        unchanged. Otherwise each row holds a 1 in the column of the predicted
        class and 0 elsewhere; columns follow the estimator's ``classes_``.

        Raises:
            ValueError: If ``fit`` has not been called.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before predict_proba")
        if hasattr(self.model, 'predict_proba'):
            return self.model.predict_proba(X)

        # For models without predict_proba, return hard predictions.
        predictions = self.predict(X)
        # Map labels to column positions through classes_ rather than using
        # the label values as indices, so labels other than 0/1 and more than
        # two classes work. Falls back to the binary 0/1 layout when the
        # estimator does not expose classes_.
        classes = np.asarray(getattr(self.model, "classes_", np.arange(2)))
        column_of_prediction = np.searchsorted(classes, predictions)
        proba = np.zeros((len(predictions), len(classes)))
        proba[np.arange(len(predictions)), column_of_prediction] = 1
        return proba

    def save(self, path: str) -> None:
        """Pickle this wrapper to ``path``, creating the parent directory if any."""
        # os.makedirs("") raises FileNotFoundError, so skip it for bare file names.
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(path, 'wb') as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, path: str) -> 'SklearnModelWrapper':
        """Unpickle a wrapper from ``path``.

        SECURITY: unpickling executes code embedded in the file; only load
        artifacts from a trusted source.
        """
        with open(path, 'rb') as f:
            return pickle.load(f)

# =============================================================================
# MAIN PIPELINE CLASS
# =============================================================================

class TextClassificationPipeline:
    """Main pipeline class for text classification.

    Combines a text vectorizer, a ``StandardScaler`` for the numeric feature
    columns and a classifier, all selected through a ``PipelineConfig``.
    """

    def __init__(self, config: PipelineConfig):
        """Store ``config``; vectorizer, scaler and model are unset until fit/load."""
        self.config = config
        self.vectorizer = None
        self.scaler = None
        self.model = None
        self.feature_engineer = FeatureEngineer()

    def _create_vectorizer(self) -> BaseVectorizer:
        """Create vectorizer based on config.

        Raises:
            ValueError: If ``config.vectorizer_type`` is not supported.
        """
        vectorizer_type = self.config.vectorizer_type.lower()
        params = self.config.vectorizer_params.copy()

        if vectorizer_type == "tfidf":
            return TfidfVectorizerWrapper(**params)
        elif vectorizer_type == "word2vec":
            return Word2VecVectorizer(**params)
        elif vectorizer_type == "sentence_transformer":
            return SentenceTransformerVectorizer(**params)
        else:
            raise ValueError(f"Unsupported vectorizer type: {vectorizer_type}")

    def _create_model(self) -> BaseModel:
        """Create model based on config.

        Raises:
            ValueError: If ``config.model_type`` is not supported.
        """
        model_type = self.config.model_type.lower()
        params = self.config.model_params.copy()

        if model_type == "logistic_regression":
            return SklearnModelWrapper(LogisticRegression, **params)
        elif model_type == "random_forest":
            return SklearnModelWrapper(RandomForestClassifier, **params)
        elif model_type == "svm":
            return SklearnModelWrapper(SVC, **params)
        else:
            raise ValueError(f"Unsupported model type: {model_type}")

    def _ensure_text_features(self, df: pd.DataFrame, df_name: str) -> pd.DataFrame:
        """Return ``df`` with the configured numeric feature columns present.

        If any of ``config.numeric_features`` is missing, the text statistics
        are computed from the text column and added to a copy of ``df``.
        """
        if all(col in df.columns for col in self.config.numeric_features):
            return df
        print(f"Adding text features to {df_name} data...")
        return self.feature_engineer.add_text_features(df, self.config.text_column)

    def _load_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Load train, test, and holdout data, adding text features if absent."""
        train_df = pd.read_csv(self.config.train_path)
        test_df = pd.read_csv(self.config.test_path)
        holdout_df = pd.read_csv(self.config.holdout_path)

        train_df = self._ensure_text_features(train_df, "train")
        test_df = self._ensure_text_features(test_df, "test")
        holdout_df = self._ensure_text_features(holdout_df, "holdout")

        return train_df, test_df, holdout_df

    def _prepare_features(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """Build the combined feature matrix and the label vector for ``df``.

        The matrix is the vectorized text followed by the scaled numeric
        columns. The scaler is fitted on ``df`` only when none exists yet
        (during ``fit``); otherwise the existing scaler is applied.

        Raises:
            ValueError: If no vectorizer is available (pipeline not fitted/loaded).
        """
        if self.vectorizer is None:
            raise ValueError("Pipeline must be fitted or loaded before preparing features")

        # Text features
        texts = df[self.config.text_column].astype(str).tolist()
        X_text = self.vectorizer.transform(texts)

        # Numeric features
        X_numeric = df[self.config.numeric_features].astype(float).fillna(0.0).values

        # Scale numeric features
        if self.scaler is None:
            self.scaler = StandardScaler()
            X_numeric_scaled = self.scaler.fit_transform(X_numeric)
        else:
            X_numeric_scaled = self.scaler.transform(X_numeric)

        # Combine features
        X_combined = np.hstack([X_text, X_numeric_scaled])

        # Labels
        y = df[self.config.label_column].astype(int).values

        return X_combined, y

    def fit(self) -> 'TextClassificationPipeline':
        """Fit the entire pipeline on the training CSV and return ``self``."""
        print("Loading data...")
        train_df, test_df, holdout_df = self._load_data()

        print("Creating vectorizer...")
        self.vectorizer = self._create_vectorizer()

        print("Fitting vectorizer...")
        self.vectorizer.fit(train_df[self.config.text_column].astype(str).tolist())

        # Discard any scaler left from a previous fit()/load() so that it is
        # refitted on this run's training data instead of being silently reused.
        self.scaler = None

        print("Preparing training features...")
        X_train, y_train = self._prepare_features(train_df)

        print(f"Training data shape: {X_train.shape}")
        print(f"Label distribution: {np.bincount(y_train)}")

        # Apply SMOTE if enabled
        if self.config.use_smote:
            print("Applying SMOTE...")
            smote = SMOTE(**self.config.smote_params)
            X_train, y_train = smote.fit_resample(X_train, y_train)
            print(f"After SMOTE - Training data shape: {X_train.shape}")
            print(f"After SMOTE - Label distribution: {np.bincount(y_train)}")

        print("Creating and training model...")
        self.model = self._create_model()
        self.model.fit(X_train, y_train)

        print("Pipeline fitted successfully!")
        return self

    def evaluate(self, test_df: pd.DataFrame = None) -> Dict[str, Any]:
        """Evaluate the pipeline and print accuracy plus a classification report.

        Args:
            test_df: Labelled data to score. When omitted, the test CSV from
                the config is loaded. Missing numeric feature columns are
                computed from the text column.

        Returns:
            Dict with ``accuracy``, ``predictions``, ``probabilities``,
            ``classification_report`` and ``confusion_matrix``.

        Raises:
            ValueError: If the pipeline has no model yet.
        """
        if self.model is None:
            raise ValueError("Pipeline must be fitted or loaded before evaluate")

        if test_df is None:
            _, test_df, _ = self._load_data()
        else:
            test_df = self._ensure_text_features(test_df, "test")

        X_test, y_test = self._prepare_features(test_df)
        y_pred = self.model.predict(X_test)
        y_proba = self.model.predict_proba(X_test)

        accuracy = accuracy_score(y_test, y_pred)

        results = {
            "accuracy": accuracy,
            "predictions": y_pred,
            "probabilities": y_proba,
            "classification_report": classification_report(y_test, y_pred),
            "confusion_matrix": confusion_matrix(y_test, y_pred)
        }

        print(f"Test Accuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        print(results["classification_report"])

        return results

    def save(self, output_dir: str = None) -> None:
        """Save vectorizer, scaler, model and config as pickle files.

        Args:
            output_dir: Target directory; defaults to ``config.output_dir``.

        Raises:
            ValueError: If the pipeline has not been fitted or loaded.
        """
        if self.vectorizer is None or self.model is None:
            raise ValueError("Pipeline must be fitted or loaded before save")

        if output_dir is None:
            output_dir = self.config.output_dir

        os.makedirs(output_dir, exist_ok=True)

        # Save vectorizer
        vectorizer_path = os.path.join(output_dir, f"{self.config.vectorizer_type}_vectorizer.pkl")
        self.vectorizer.save(vectorizer_path)
        print(f"Vectorizer saved to: {vectorizer_path}")

        # Save scaler
        scaler_path = os.path.join(output_dir, "feature_scaler.pkl")
        with open(scaler_path, 'wb') as f:
            pickle.dump(self.scaler, f)
        print(f"Scaler saved to: {scaler_path}")

        # Save model
        model_path = os.path.join(output_dir, f"{self.config.model_type}_{self.config.vectorizer_type}_model.pkl")
        self.model.save(model_path)
        print(f"Model saved to: {model_path}")

        # Save config
        config_path = os.path.join(output_dir, "pipeline_config.pkl")
        with open(config_path, 'wb') as f:
            pickle.dump(self.config, f)
        print(f"Config saved to: {config_path}")

    @classmethod
    def load(cls, output_dir: str) -> 'TextClassificationPipeline':
        """Load a pipeline previously written by ``save`` from ``output_dir``.

        SECURITY: every artifact is unpickled, which executes code embedded in
        the files; only load directories from a trusted source.

        Raises:
            ValueError: If the stored config names an unsupported vectorizer type.
        """
        # Load config
        config_path = os.path.join(output_dir, "pipeline_config.pkl")
        with open(config_path, 'rb') as f:
            config = pickle.load(f)

        pipeline = cls(config)

        # Load vectorizer. The type is lower-cased for dispatch exactly as in
        # _create_vectorizer, while the file name keeps the raw config value
        # because that is what save() wrote.
        vectorizer_loaders = {
            "tfidf": TfidfVectorizerWrapper,
            "word2vec": Word2VecVectorizer,
            "sentence_transformer": SentenceTransformerVectorizer,
        }
        vectorizer_type = config.vectorizer_type.lower()
        if vectorizer_type not in vectorizer_loaders:
            raise ValueError(f"Unsupported vectorizer type: {vectorizer_type}")
        vectorizer_path = os.path.join(output_dir, f"{config.vectorizer_type}_vectorizer.pkl")
        pipeline.vectorizer = vectorizer_loaders[vectorizer_type].load(vectorizer_path)

        # Load scaler
        scaler_path = os.path.join(output_dir, "feature_scaler.pkl")
        with open(scaler_path, 'rb') as f:
            pipeline.scaler = pickle.load(f)

        # Load model
        model_path = os.path.join(output_dir, f"{config.model_type}_{config.vectorizer_type}_model.pkl")
        pipeline.model = SklearnModelWrapper.load(model_path)

        return pipeline

# =============================================================================
# EXAMPLE USAGE AND CONFIGURATIONS
# =============================================================================

def create_tfidf_logreg_config() -> PipelineConfig:
    """Return the preset that pairs a TF-IDF vectorizer with logistic regression."""
    # Unigrams and bigrams, vocabulary capped at 50k terms, no stop-word list.
    tfidf_settings = {
        "max_features": 50000,
        "ngram_range": (1, 2),
        "lowercase": True,
        "stop_words": None
    }
    # L2-penalised logistic regression with the saga solver and a fixed seed.
    logreg_settings = {
        "solver": "saga",
        "penalty": "l2",
        "C": 1.0,
        "max_iter": 2000,
        "random_state": 42
    }
    return PipelineConfig(
        vectorizer_type="tfidf",
        vectorizer_params=tfidf_settings,
        model_type="logistic_regression",
        model_params=logreg_settings,
    )

def create_word2vec_rf_config() -> PipelineConfig:
    """Return the preset that pairs Word2Vec embeddings with a random forest."""
    # 100-dimensional embeddings, context window of 5, every token kept.
    word2vec_settings = {
        "vector_size": 100,
        "window": 5,
        "min_count": 1,
        "workers": 4
    }
    # 100 trees, fixed seed, n_jobs=-1.
    forest_settings = {
        "n_estimators": 100,
        "random_state": 42,
        "n_jobs": -1
    }
    return PipelineConfig(
        vectorizer_type="word2vec",
        vectorizer_params=word2vec_settings,
        model_type="random_forest",
        model_params=forest_settings,
    )

def create_sentence_transformer_svm_config() -> PipelineConfig:
    """Return the preset that pairs a Sentence Transformer encoder with an SVM."""
    encoder_settings = {
        "model_name": "all-MiniLM-L6-v2"
    }
    # RBF-kernel SVM with a fixed seed.
    svm_settings = {
        "kernel": "rbf",
        "C": 1.0,
        "random_state": 42
    }
    return PipelineConfig(
        vectorizer_type="sentence_transformer",
        vectorizer_params=encoder_settings,
        model_type="svm",
        model_params=svm_settings,
    )

# =============================================================================
# DEMONSTRATION
# =============================================================================

# Banner listing the three preset configurations, emitted as one write.
print(
    "IBM Pipeline: Configurable Text Classification Pipeline",
    "=" * 60,
    "Available configurations:",
    "1. TF-IDF + Logistic Regression",
    "2. Word2Vec + Random Forest",
    "3. Sentence Transformer + SVM",
    "=" * 60,
    sep="\n",
)


Shapes before SMOTE:
  Train: (8888, 5004)
  Test: (2335, 5004)
  Holdout: (1167, 5004)
Label distribution before SMOTE: {0: 7454, 1: 1434}
Train after SMOTE: (14908, 5004)
Label distribution after SMOTE: {0: 7454, 1: 7454}
Model trained.
Saved model to: models/tfidf_num_logreg.pkl
test accuracy: 0.979




In [None]:
# =============================================================================
# EXAMPLE 1: TF-IDF + Logistic Regression Pipeline
# =============================================================================

print("Example 1: TF-IDF + Logistic Regression", "-" * 50, sep="\n")

# Start from the preset and send its artifacts to a dedicated folder.
config = create_tfidf_logreg_config()
config.output_dir = "models/tfidf_logreg_pipeline"

# Train on the training CSV named in the config.
pipeline = TextClassificationPipeline(config)
pipeline.fit()

# Score on the test CSV; evaluate() prints its own report as well.
results = pipeline.evaluate()
print(f"Test Accuracy: {results['accuracy']:.4f}")

# Persist vectorizer, scaler, model and config under config.output_dir.
pipeline.save()
print("Pipeline saved successfully!")


In [None]:
# =============================================================================
# EXAMPLE 2: Word2Vec + Random Forest Pipeline
# =============================================================================

print("Example 2: Word2Vec + Random Forest", "-" * 50, sep="\n")

# Start from the preset and send its artifacts to a dedicated folder.
config = create_word2vec_rf_config()
config.output_dir = "models/word2vec_rf_pipeline"

# Train on the training CSV named in the config.
pipeline = TextClassificationPipeline(config)
pipeline.fit()

# Score on the test CSV; evaluate() prints its own report as well.
results = pipeline.evaluate()
print(f"Test Accuracy: {results['accuracy']:.4f}")

# Persist vectorizer, scaler, model and config under config.output_dir.
pipeline.save()
print("Pipeline saved successfully!")


In [None]:
# =============================================================================
# EXAMPLE 3: Sentence Transformer + SVM Pipeline
# =============================================================================

print("Example 3: Sentence Transformer + SVM", "-" * 50, sep="\n")

# Start from the preset and send its artifacts to a dedicated folder.
config = create_sentence_transformer_svm_config()
config.output_dir = "models/sentence_transformer_svm_pipeline"

# Train on the training CSV named in the config.
pipeline = TextClassificationPipeline(config)
pipeline.fit()

# Score on the test CSV; evaluate() prints its own report as well.
results = pipeline.evaluate()
print(f"Test Accuracy: {results['accuracy']:.4f}")

# Persist vectorizer, scaler, model and config under config.output_dir.
pipeline.save()
print("Pipeline saved successfully!")


In [None]:
# =============================================================================
# EXAMPLE 4: Custom Configuration
# =============================================================================

print("Example 4: Custom Configuration", "-" * 50, sep="\n")

# Every section of the configuration spelled out explicitly.
custom_config = PipelineConfig(
    # Input CSVs (same locations as the defaults)
    train_path="dataset/feature_enginering/train_feature_engineered.csv",
    test_path="dataset/feature_enginering/test_feature_engineered.csv",
    holdout_path="dataset/feature_enginering/holdout_feature_engineered.csv",

    # TF-IDF over uni-, bi- and trigrams with English stop words removed
    vectorizer_type="tfidf",
    vectorizer_params={
        "max_features": 30000,
        "ngram_range": (1, 3),
        "lowercase": True,
        "stop_words": "english"
    },

    # Depth-limited random forest
    model_type="random_forest",
    model_params={
        "n_estimators": 200,
        "max_depth": 10,
        "random_state": 42,
        "n_jobs": -1
    },

    # Oversample the minority class using 5 nearest neighbours
    use_smote=True,
    smote_params={
        "sampling_strategy": "minority",
        "random_state": 42,
        "k_neighbors": 5
    },

    # Where save() writes the artifacts
    output_dir="models/custom_pipeline",
    save_artifacts=True
)

# Train on the training CSV named in the config.
pipeline = TextClassificationPipeline(custom_config)
pipeline.fit()

# Score on the test CSV; evaluate() prints its own report as well.
results = pipeline.evaluate()
print(f"Test Accuracy: {results['accuracy']:.4f}")

# Persist vectorizer, scaler, model and config under config.output_dir.
pipeline.save()
print("Custom pipeline saved successfully!")


In [None]:
# =============================================================================
# EXAMPLE 5: Loading and Using Saved Pipeline
# =============================================================================

print("Example 5: Loading and Using Saved Pipeline")
print("-" * 50)

# Only missing artifact files mean "Example 1 has not been run yet". The try
# block therefore wraps the load call alone and catches FileNotFoundError
# alone; failures while predicting are no longer reported as load errors and
# surface with their real traceback.
try:
    # NOTE: load() unpickles files from disk; only point it at trusted artifacts.
    loaded_pipeline = TextClassificationPipeline.load("models/tfidf_logreg_pipeline")
except FileNotFoundError as e:
    print(f"Error loading pipeline: {e}")
    print("Make sure to run Example 1 first to create the pipeline.")
else:
    print("Pipeline loaded successfully!")

    # Make predictions on new data
    _, test_df, _ = loaded_pipeline._load_data()

    # Get predictions for first 5 test samples
    X_test, y_test = loaded_pipeline._prepare_features(test_df.head(5))
    predictions = loaded_pipeline.model.predict(X_test)
    probabilities = loaded_pipeline.model.predict_proba(X_test)

    print("\nSample predictions:")
    for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
        print(f"Sample {i+1}: Prediction={pred}, Probability={prob}")


In [None]:
# =============================================================================
# PIPELINE COMPARISON AND BENCHMARKING
# =============================================================================

# Section banner for the benchmarking cell.
print("Pipeline Comparison and Benchmarking", "=" * 60, sep="\n")

import time
from typing import List, Dict

def benchmark_pipeline(config: PipelineConfig, name: str) -> Dict[str, Any]:
    """Fit and evaluate one pipeline configuration, timing each phase.

    Args:
        config: Configuration handed to ``TextClassificationPipeline``.
        name: Label used in the progress message and in the result.

    Returns:
        A dict with ``name``, ``accuracy``, ``fit_time``, ``eval_time``,
        ``total_time`` (all times in seconds) and ``success``. When any step
        raises, ``success`` is False, accuracy and phase times are 0.0, and
        ``error`` holds the exception message; the exception is not re-raised
        so that one failing configuration does not stop a benchmark run.
    """
    print(f"\nBenchmarking {name}...")
    # perf_counter() is a monotonic clock meant for measuring intervals;
    # time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()

    try:
        # Create pipeline
        pipeline = TextClassificationPipeline(config)

        # Fit pipeline
        fit_start = time.perf_counter()
        pipeline.fit()
        fit_time = time.perf_counter() - fit_start

        # Evaluate pipeline
        eval_start = time.perf_counter()
        results = pipeline.evaluate()
        eval_time = time.perf_counter() - eval_start

        total_time = time.perf_counter() - start_time

        return {
            "name": name,
            "accuracy": results["accuracy"],
            "fit_time": fit_time,
            "eval_time": eval_time,
            "total_time": total_time,
            "success": True
        }

    except Exception as e:
        return {
            "name": name,
            "accuracy": 0.0,
            "fit_time": 0.0,
            "eval_time": 0.0,
            "total_time": time.perf_counter() - start_time,
            "success": False,
            "error": str(e)
        }

# Each preset configuration paired with the label shown in the report.
benchmark_configs = [
    (create_tfidf_logreg_config(), "TF-IDF + Logistic Regression"),
    (create_word2vec_rf_config(), "Word2Vec + Random Forest"),
    (create_sentence_transformer_svm_config(), "Sentence Transformer + SVM")
]

# Benchmark every configuration, collecting one summary dict per pipeline.
results = []
for config, name in benchmark_configs:
    # Give each run its own output folder derived from the label.
    config.output_dir = f"models/benchmark_{name.lower().replace(' ', '_').replace('+', '_')}"
    result = benchmark_pipeline(config, name)
    results.append(result)

# Report header: title between two heavy rules, then the column captions.
print(
    "\n" + "=" * 80,
    "BENCHMARK RESULTS",
    "=" * 80,
    f"{'Pipeline':<40} {'Accuracy':<10} {'Fit Time':<10} {'Eval Time':<10} {'Total Time':<10} {'Status':<10}",
    "-" * 80,
    sep="\n",
)

# One row per pipeline; failed runs show placeholders plus the error text.
for result in results:
    if not result["success"]:
        print(f"{result['name']:<40} {'N/A':<10} {'N/A':<10} {'N/A':<10} {'N/A':<10} {'FAILED':<10}")
        print(f"  Error: {result.get('error', 'Unknown error')}")
        continue
    print(f"{result['name']:<40} {result['accuracy']:<10.4f} {result['fit_time']:<10.2f} {result['eval_time']:<10.2f} {result['total_time']:<10.2f} {'SUCCESS':<10}")

print("=" * 80)


# IBM Pipeline: Configurable Text Classification Pipeline

## Overview
This pipeline provides a flexible, modular approach to text classification with the following features:

### ✅ **Configurable Components**
- **Vectorizers**: TF-IDF, Word2Vec, Sentence Transformers
- **Models**: Logistic Regression, Random Forest, SVM (easily extensible)
- **Feature Engineering**: Automatic text feature extraction
- **Balancing**: SMOTE for handling class imbalance

### ✅ **Key Features**
1. **Modular Design**: Easy to swap vectorizers and models
2. **Automatic Feature Engineering**: Computes text length, word count, emoji count, and the number of non-ASCII alphabetic ("foreign") characters when these columns are missing from the input data
3. **SMOTE Integration**: Handles class imbalance automatically
4. **Model Persistence**: Save and load complete pipelines
5. **Comprehensive Evaluation**: Accuracy, classification report, confusion matrix
6. **Benchmarking**: Compare different configurations

### ✅ **Usage Examples**

#### Basic Usage
```python
# Create configuration
config = create_tfidf_logreg_config()
config.output_dir = "models/my_pipeline"

# Create and run pipeline
pipeline = TextClassificationPipeline(config)
pipeline.fit()
results = pipeline.evaluate()
pipeline.save()
```

#### Custom Configuration
```python
# Custom configuration
config = PipelineConfig(
    vectorizer_type="tfidf",
    vectorizer_params={"max_features": 30000, "ngram_range": (1, 3)},
    model_type="random_forest",
    model_params={"n_estimators": 200, "max_depth": 10},
    use_smote=True,
    output_dir="models/custom_pipeline"
)

pipeline = TextClassificationPipeline(config)
pipeline.fit()
```

#### Loading Saved Pipeline
```python
# Load saved pipeline
pipeline = TextClassificationPipeline.load("models/my_pipeline")

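# Make predictions: features must be built with the pipeline's own vectorizer and scaler.
# `new_df` needs the text, numeric-feature and label columns named in the config.
X_new, _ = pipeline._prepare_features(new_df)
predictions = pipeline.model.predict(X_new)
probabilities = pipeline.model.predict_proba(X_new)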
```

### ✅ **Supported Vectorizers**
- **TF-IDF**: Traditional bag-of-words with n-grams
- **Word2Vec**: Word embeddings with averaging
- **Sentence Transformers**: Pre-trained sentence embeddings

### ✅ **Supported Models**
- **Logistic Regression**: Fast, interpretable
- **Random Forest**: Robust, handles non-linearity
- **SVM**: Good for high-dimensional data

### ✅ **Pipeline Flow**
1. **Data Loading** → Load train/test/holdout datasets
2. **Feature Engineering** → Add text features (length, word count, emojis, etc.)
3. **Vectorization** → Convert text to numerical features
4. **Scaling** → Standardize the numeric features with `StandardScaler` (the text vectors are not scaled)
5. **SMOTE** → Balance classes (optional)
6. **Training** → Fit the model
7. **Evaluation** → Test performance
8. **Persistence** → Save all artifacts

### ✅ **Output Files**
Each pipeline saves:
- `{vectorizer_type}_vectorizer.pkl` - Fitted vectorizer
- `feature_scaler.pkl` - Feature scaler
- `{model_type}_{vectorizer_type}_model.pkl` - Trained model
- `pipeline_config.pkl` - Configuration used

### ✅ **Extending the Pipeline**
To add new vectorizers or models, simply:
1. Inherit from `BaseVectorizer` or `BaseModel`
2. Implement required methods
3. Add to the factory methods in `TextClassificationPipeline`

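This pipeline is designed to be modular and easy to extend. Note that progress is reported with `print` statements rather than a logging framework and that Python warnings are suppressed globally (`warnings.filterwarnings('ignore')`), so add logging and input validation before relying on it in production.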
