# Reusable Code Chunks for ML Project App

## Environment Preparation

In [1]:
!pip uninstall torch

Found existing installation: torch 2.5.1+cu124
Uninstalling torch-2.5.1+cu124:
  Would remove:
    /usr/local/bin/convert-caffe2-to-onnx
    /usr/local/bin/convert-onnx-to-caffe2
    /usr/local/bin/torchfrtrace
    /usr/local/bin/torchrun
    /usr/local/lib/python3.11/dist-packages/functorch/*
    /usr/local/lib/python3.11/dist-packages/torch-2.5.1+cu124.dist-info/*
    /usr/local/lib/python3.11/dist-packages/torch/*
    /usr/local/lib/python3.11/dist-packages/torchgen/*
Proceed (Y/n)? y
  Successfully uninstalled torch-2.5.1+cu124


In [2]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu125

Looking in indexes: https://download.pytorch.org/whl/cu125
[31mERROR: Could not find a version that satisfies the requirement torch (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for torch[0m[31m
[0m

In [3]:
!pip install langdetect
!pip install datasets
!pip install wandb
!pip install fastapi
!pip install "fastapi[standard]"
!pip install gradio
!pip install streamlit
!pip install uvicorn
!pip install dotenv



In [4]:
import os
import sys

from dotenv import load_dotenv

In [5]:
# True when the `google.colab` module is already loaded in this interpreter;
# used below as the signal that the notebook is running on Google Colab.
IN_COLAB = 'google.colab' in sys.modules

In [6]:
# Pick the dataset/model roots: project-relative folders for local runs,
# Google Drive folders when running on Colab.
if not IN_COLAB:
    DATASETS_PATH = './customer-support-tickets/datasets'
    MODELS_PATH = './customer-support-tickets/models'

    # Local runs read their settings from a .env file.
    load_dotenv()
else:
    from google.colab import drive

    # Drive must be mounted before the paths below can be used.
    drive.mount('/content/drive')

    DATASETS_PATH = '/content/drive/MyDrive/customer-support-tickets/datasets'
    MODELS_PATH = '/content/drive/MyDrive/customer-support-tickets/models'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Backend

### File preprocessing.py

In [7]:
import os
import sys

from typing import List, Tuple, Dict, Any, Optional, Union

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

import nltk
import spacy

class EmailPreprocessor(BaseEstimator, TransformerMixin):
    """Clean raw e-mail tickets and derive the columns used downstream.

    ``transform`` expects ``subject`` and ``body`` columns. It fills missing
    values, builds a combined ``text`` column, keeps only the rows whose
    ``language`` equals the target language (detecting the language from
    ``text`` when the frame has no ``language`` column) and adds length
    features.
    """

    def __init__(self) -> None:
        pass

    def fit(self, df: pd.DataFrame, y: Any = None):
        """Return ``self`` unchanged; nothing is learned from the data.

        ``y`` is accepted and ignored so that ``fit(X, y)`` style calls work.
        """
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run every preprocessing step on a copy of ``df`` and return it."""
        df_prep = df.copy()

        # ``text`` has to exist before language filtering: when the frame has
        # no ``language`` column the language is detected from ``text``.
        prep_steps = [
            self.fill_missing_values,
            self.attach_subject_body,
            self.filter_language,
            self.extract_features,
        ]

        for step in prep_steps:
            df_prep = step(df_prep)

        return df_prep

    def filter_language(self, df: pd.DataFrame, lang: str = 'en') -> pd.DataFrame:
        """Return the rows of ``df`` whose ``language`` equals ``lang``.

        If ``df`` has no ``language`` column, one is computed from the
        ``text`` column with :meth:`detect_language`. The caller's frame is
        left untouched; the returned frame carries the ``language`` column.
        """
        df_lang = df.copy()

        if 'language' not in df_lang.columns:
            df_lang['language'] = df_lang['text'].apply(self.detect_language)

        return df_lang[df_lang['language'] == lang].copy()

    def fill_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
        """Replace missing ``subject``/``body`` values with placeholder text."""
        df_filled = df.copy()
        df_filled['subject'] = df_filled['subject'].fillna('no subject')
        df_filled['body'] = df_filled['body'].fillna('no body')

        return df_filled

    def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add ``subject_length`` and ``body_length`` (character counts)."""
        df_featured = df.copy()

        df_featured['subject_length'] = df_featured['subject'].apply(len)
        df_featured['body_length'] = df_featured['body'].apply(len)

        return df_featured

    def attach_subject_body(self, df: pd.DataFrame) -> pd.DataFrame:
        """Build the ``text`` column as ``"Subject: ...\\nBody: ..."``."""
        df_attached = df.copy()

        # Column-wise concatenation gives the same strings as formatting each
        # row, and also works on an empty frame (row-wise ``apply`` returns a
        # DataFrame there, which cannot be assigned to a single column).
        df_attached['text'] = (
            'Subject: ' + df_attached['subject'].astype(str)
            + '\nBody: ' + df_attached['body'].astype(str)
        )

        return df_attached

    def detect_language(self, text: str, lang: str = 'en') -> str:
        """Return the language code detected for ``text``.

        Returns ``'unknown'`` when langdetect cannot decide. ``lang`` is
        unused and kept only for interface compatibility.
        """
        # Imported here because the module-level imports do not include
        # langdetect, so the bare name would raise NameError.
        from langdetect import detect
        from langdetect.lang_detect_exception import LangDetectException

        try:
            return detect(text)
        except LangDetectException:
            return 'unknown'

class ResamplingPreprocessor(BaseEstimator, TransformerMixin):
    """Balance a DataFrame by under- or oversampling label groups.

    Args:
        label_columns: Column name (or list of names) that defines the groups.
        resample_mode: ``'undersample'`` or ``'oversample'``; any other value
            leaves the data unchanged.
        random_state: Seed passed to ``DataFrame.sample``.
        n_samples_each_category: Rows to keep per group when undersampling.
            ``None`` means "size of the smallest group".
    """

    def __init__(
        self,
        label_columns: str | List[str],
        resample_mode: str = 'undersample',
        random_state: int = 42,
        n_samples_each_category: Optional[int] = None
    ) -> None:
        # Always store a list so groupby receives the same type either way.
        if isinstance(label_columns, str):
            self.label_columns = [label_columns]
        else:
            self.label_columns = label_columns

        self.random_state = random_state
        self.resample_mode = resample_mode
        self.n_samples_each_category = n_samples_each_category

    def fit(self, df: pd.DataFrame, y: Any = None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a resampled copy of ``df``."""
        df_prep = df.copy()

        prep_steps = [
            self.resample
        ]

        for step in prep_steps:
            df_prep = step(df_prep)

        return df_prep

    def resample(self, df: pd.DataFrame) -> pd.DataFrame:
        """Dispatch to the sampler selected by ``resample_mode``."""
        df_prep = df.copy()

        if self.resample_mode == 'undersample':
            df_prep = self.undersample_examples(df=df_prep)
        elif self.resample_mode == 'oversample':
            df_prep = self.oversample_examples(df=df_prep)

        return df_prep

    def undersample_examples(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Undersample ``df`` so every label group has the same number of rows.

        Each group keeps ``n_samples_each_category`` rows, or the size of the
        smallest group when that attribute is ``None``. Sampling is without
        replacement, so pandas raises ``ValueError`` if a group has fewer rows
        than requested.

        Args:
            df (pd.DataFrame): Frame containing the ``label_columns``.

        Returns:
            pd.DataFrame: The sampled rows (original index preserved).
        """
        df_prep = df.copy()

        # Nothing to balance; also avoids a NaN group size on empty input.
        if df_prep.empty:
            return df_prep

        if self.n_samples_each_category is None:
            n_samples = df_prep.groupby(self.label_columns).size().min()
        else:
            n_samples = self.n_samples_each_category

        undersampled_dfs = df_prep.groupby(self.label_columns).sample(n=n_samples, replace=False, random_state=self.random_state)

        return undersampled_dfs

    def oversample_examples(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Oversample ``df`` so every label group is as large as the largest one.

        Every original row is kept; smaller groups are topped up with rows
        drawn with replacement from the same group.

        Args:
            df (pd.DataFrame): Frame containing the ``label_columns``.

        Returns:
            pd.DataFrame: Original rows plus the extra sampled rows.
        """
        df_prep = df.copy()

        # pd.concat rejects an empty list, so handle empty input up front.
        if df_prep.empty:
            return df_prep

        max_samples = df_prep.groupby(self.label_columns).size().max()
        balanced_parts = []

        for _, group_df in df_prep.groupby(self.label_columns):
            # Keep the group's own rows; returning only the extra draws would
            # drop the original data and leave the largest group empty.
            balanced_parts.append(group_df)

            n_extra = max_samples - len(group_df)
            if n_extra > 0:
                balanced_parts.append(
                    group_df.sample(n=n_extra, replace=True, random_state=self.random_state)
                )

        return pd.concat(balanced_parts)

class SplitterPreprocessor(BaseEstimator, TransformerMixin):
    """Split a DataFrame into train/validation/test parts.

    Args:
        split_mode: ``'train_val_test'``, ``'train_test'`` or ``'train_val'``.
        retrieve: ``'all'`` to get every split the mode produces (as a tuple,
            in train/val/test order) or ``'train'``/``'val'``/``'test'`` to get
            a single frame.
        test_size: Fraction passed to ``train_test_split`` for the test split.
        val_size: Fraction passed to ``train_test_split`` for the validation
            split. In ``'train_val_test'`` mode it is applied to what remains
            after the test split, not to the full data.
        random_state: Seed passed to ``train_test_split``.
    """

    def __init__(
        self,
        split_mode: str = 'train_val_test',
        retrieve: str = 'all',
        test_size: float = 0.2,
        val_size: float = 0.2,
        random_state: int = 42
    ) -> None:
        self.split_mode = split_mode
        self.retrieve = retrieve

        self.test_size = test_size
        self.val_size = val_size

        self.random_state = random_state

    def fit(self, df: pd.DataFrame, y: Any = None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, df: pd.DataFrame) -> Union[pd.DataFrame, Tuple[pd.DataFrame, ...]]:
        """Split ``df`` and return the requested part(s).

        Raises:
            ValueError: If ``split_mode`` or ``retrieve`` is not a known
                value, or if ``retrieve`` names a split that the chosen
                ``split_mode`` does not produce.
        """
        df_prep = df.copy()

        # Collect the splits by name so that only the ones this mode actually
        # produces are ever returned (dicts preserve insertion order).
        if self.split_mode == 'train_val_test':
            df_train, df_val, df_test = self.split_train_val_test(df_prep)
            splits = {'train': df_train, 'val': df_val, 'test': df_test}
        elif self.split_mode == 'train_test':
            df_train, df_test = self.split_train_test(df_prep)
            splits = {'train': df_train, 'test': df_test}
        elif self.split_mode == 'train_val':
            df_train, df_val = self.split_train_val(df_prep)
            splits = {'train': df_train, 'val': df_val}
        else:
            raise ValueError(f'Invalid value for split_mode: {self.split_mode}')

        if self.retrieve == 'all':
            return tuple(splits.values())

        if self.retrieve not in ('train', 'val', 'test'):
            raise ValueError(f'Invalid value for retrieve: {self.retrieve}')

        if self.retrieve not in splits:
            raise ValueError(
                f"Split '{self.retrieve}' is not produced by split_mode '{self.split_mode}'"
            )

        return splits[self.retrieve]

    def split_train_val_test(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Split off the test set first, then carve validation out of the rest."""
        df_train_val, df_test = train_test_split(df, test_size=self.test_size, random_state=self.random_state)
        df_train, df_val = train_test_split(df_train_val, test_size=self.val_size, random_state=self.random_state)

        return df_train, df_val, df_test

    def split_train_test(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Return ``(train, test)`` using ``test_size``."""
        df_train, df_test = train_test_split(df, test_size=self.test_size, random_state=self.random_state)

        return df_train, df_test

    def split_train_val(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Return ``(train, val)`` using ``val_size``."""
        df_train, df_val = train_test_split(df, test_size=self.val_size, random_state=self.random_state)

        return df_train, df_val

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Normalise the ``text`` column (case, punctuation, digits, stopwords).

    Args:
        stopwords: Custom stopword list. When ``None`` the NLTK English list
            is downloaded and used.
        flg_stemm: Apply Porter stemming.
        flg_lemm: Apply WordNet lemmatisation.
        flg_stopwords: Remove the words in ``self.stopwords``.
        flg_punctuation: Remove every character that is neither a word
            character nor whitespace.
        flg_numbers: Remove digit runs.
    """

    def __init__(
        self,
        stopwords: Optional[List[str]] = None,
        flg_stemm: bool = False,
        flg_lemm: bool = False,
        flg_stopwords: bool = True,
        flg_punctuation: bool = True,
        flg_numbers: bool = True
    ) -> None:
        self.flg_stemm = flg_stemm
        self.flg_lemm = flg_lemm
        self.flg_stopwords = flg_stopwords
        self.flg_punctuation = flg_punctuation
        self.flg_numbers = flg_numbers

        self.load_stopwords(stopwords=stopwords)

    def fit(self, df: pd.DataFrame, y: Any = None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with ``text`` run through ``preprocess_text``."""
        df_prep = df.copy()
        df_prep['text'] = df_prep['text'].apply(self.preprocess_text)

        return df_prep

    def load_stopwords(
        self,
        stopwords: Optional[List[str]] = None,
        lang: str = 'english',
        source: str = 'nltk',
        file_path: str = ''
    ):
        """Populate ``self.stopwords``.

        An explicit ``stopwords`` list wins. Otherwise ``source`` selects
        where the list comes from: ``'nltk'`` (downloads the corpora),
        ``'spacy'``, or ``'file'`` (one word per line). Unknown sources and
        unreadable files give an empty list.
        """
        if stopwords is not None:
            self.stopwords = stopwords
        elif source == 'nltk':
            nltk.download('stopwords')
            # WordNet is needed by the lemmatiser used in preprocess_text.
            nltk.download('wordnet')

            self.stopwords = nltk.corpus.stopwords.words(lang)
        elif source == 'spacy':
            # ``lang`` holds an NLTK-style name ('english'), which is not a
            # spaCy model name, so reuse the model that is loaded here instead
            # of calling spacy.load(lang).
            nlp = spacy.load('en_core_web_sm')

            self.stopwords = list(nlp.Defaults.stop_words)
        elif source == 'file':
            try:
                with open(file_path, 'r') as f:
                    self.stopwords = f.read().splitlines()
            except FileNotFoundError:
                print(f'File not found: {file_path}')
                self.stopwords = []
        else:
            self.stopwords = []

    def preprocess_text(self, text: str) -> str:
        """Return the normalised form of ``text``.

        Steps, in order: strip non-ASCII characters after NFKD
        decomposition, lowercase, optionally drop punctuation and digits,
        collapse whitespace runs, optionally stem / lemmatise, optionally
        drop stopwords, and trim.
        """
        import unicodedata

        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        text = text.lower()

        if self.flg_punctuation:
            text = re.sub(r'[^\w\s]', '', text)

        if self.flg_numbers:
            text = re.sub(r'\d+', '', text)

        text = re.sub(r'\s{2,}', ' ', text)

        if self.flg_stemm:
            ps = nltk.stem.porter.PorterStemmer()
            text = ' '.join([ps.stem(word) for word in text.split()])

        if self.flg_lemm:
            lem = nltk.stem.wordnet.WordNetLemmatizer()
            text = ' '.join([lem.lemmatize(word) for word in text.split()])

        if self.flg_stopwords:
            # Use the list chosen at construction time (honours custom
            # stopwords) instead of re-reading the NLTK corpus for every
            # text; a set makes each membership test O(1).
            stopword_set = set(self.stopwords)
            text = ' '.join([word for word in text.split() if word not in stopword_set])

        return text.strip()

class LabelPreprocessor(BaseEstimator, TransformerMixin):
    """Copy a target column into ``label`` and encode it.

    Args:
        label_column_name: Source column holding the raw labels.
        encoder_mode: ``'label'`` (``LabelEncoder``) or ``'onehot'``
            (``OneHotEncoder``).

    Raises:
        ValueError: If ``encoder_mode`` is not one of the supported values.
    """

    def __init__(self, label_column_name: str, encoder_mode: str = 'label') -> None:
        self.label_column_name = label_column_name
        self.encoder_mode = encoder_mode

        if self.encoder_mode == 'label':
            self.encoder = LabelEncoder()
        elif self.encoder_mode == 'onehot':
            # NOTE(review): OneHotEncoder expects 2-D input and returns a
            # matrix, while encode() feeds it a single column and assigns
            # the result to one column - this mode looks unusable as
            # written; confirm the intended output layout before relying
            # on it.
            self.encoder = OneHotEncoder()
        else:
            # Fail early instead of leaving ``self.encoder`` undefined.
            raise ValueError(f'Invalid value for encoder_mode: {self.encoder_mode}')

    def fit(self, df: pd.DataFrame, y: Any = None):
        """Learn the label mapping from ``df`` when the label column exists.

        Fitting here (rather than inside ``transform``) keeps the mapping
        learned on the training data stable when other data is transformed
        later.
        """
        if self.label_column_name in df.columns:
            self.encoder.fit(df[self.label_column_name])

        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with an encoded ``label`` column.

        Frames without the label column (e.g. inference inputs) are returned
        unchanged.
        """
        df_prep = df.copy()

        if self.label_column_name in df_prep.columns:
            prep_steps = [
                self.set_label,
                self.encode
            ]

            for step in prep_steps:
                df_prep = step(df_prep)

        return df_prep

    def set_label(self, df: pd.DataFrame) -> pd.DataFrame:
        """Copy ``label_column_name`` into a column called ``label``."""
        df_prep = df.copy()
        df_prep['label'] = df_prep[self.label_column_name]

        return df_prep

    def _encoder_is_fitted(self) -> bool:
        """Tell whether the wrapped encoder has already learned its classes."""
        return hasattr(self.encoder, 'classes_') or hasattr(self.encoder, 'categories_')

    def encode(self, df: pd.DataFrame) -> pd.DataFrame:
        """Replace ``label`` with its encoded form.

        A fitted encoder is reused so the mapping stays consistent across
        calls; an unfitted one is fitted on ``df`` first, which keeps direct
        ``transform`` calls without a prior ``fit`` working.
        """
        df_prep = df.copy()

        if self._encoder_is_fitted():
            df_prep['label'] = self.encoder.transform(df_prep['label'])
        else:
            df_prep['label'] = self.encoder.fit_transform(df_prep['label'])

        return df_prep

    def decode(self, df: pd.DataFrame) -> pd.DataFrame:
        """Map encoded ``label`` values back to the original labels."""
        df_prep = df.copy()

        df_prep['label'] = self.encoder.inverse_transform(df_prep['label'])

        return df_prep

class VectorizerPreprocessor(BaseEstimator, TransformerMixin):
    """Turn the ``text`` column into bag-of-words feature columns.

    Args:
        from_file: Load a pickled vectorizer from ``file_path`` instead of
            creating a new one. If the file is missing a fresh
            ``TfidfVectorizer`` is used.
        file_path: Location of the pickled vectorizer.
        vectorizer_mode: ``'TfIdfVectorizer'`` or ``'CountVectorizer'``; only
            consulted when ``from_file`` is false.

    Raises:
        ValueError: If ``vectorizer_mode`` is not a supported value.
    """

    def __init__(
        self,
        from_file: bool = False,
        file_path: str = '',
        vectorizer_mode: str = 'TfIdfVectorizer'
    ) -> None:
        self.from_file = from_file
        self.file_path = file_path
        self.vectorizer_mode = vectorizer_mode

        if from_file:
            # load_vectorizer reports a missing file by returning None (it
            # does not raise), so the fallback has to test for None.
            loaded = self.load_vectorizer(file_path)
            if loaded is None:
                loaded = TfidfVectorizer(ngram_range=(1, 1), smooth_idf=True, use_idf=True)

            self.vectorizer = loaded
            self.vectorizer_mode = self.vectorizer.__class__.__name__
        elif self.vectorizer_mode == 'TfIdfVectorizer':
            self.vectorizer = TfidfVectorizer(ngram_range=(1, 1), smooth_idf=True, use_idf=True)
        elif self.vectorizer_mode == 'CountVectorizer':
            self.vectorizer = CountVectorizer()
        else:
            raise ValueError(f'Invalid value for vectorizer_mode: {self.vectorizer_mode}')

    def fit(self, df: pd.DataFrame, y: Any = None):
        """Fit the vectorizer on ``df['text']`` and return ``self``."""
        if self.vectorizer is not None:
            self.vectorizer.fit(df['text'])

        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` (index reset) with one dense column per vocabulary term.

        Existing columns whose name equals a vocabulary term are renamed to
        ``<name>_original`` so the feature column can take the plain name.
        NOTE(review): this also applies to columns such as ``label`` if that
        word is in the vocabulary - confirm downstream code copes with it.
        When no vectorizer is set, ``text`` is copied into ``features``.
        """
        df_prep = df.copy()

        if self.vectorizer is None:
            df_prep['features'] = df_prep['text']
            return df_prep

        # Sparse matrix -> dense frame, one column per vocabulary term.
        feature_matrix = self.vectorizer.transform(df_prep['text'])
        feature_df = pd.DataFrame(feature_matrix.toarray(), columns=self.vectorizer.get_feature_names_out())

        # Move clashing original columns out of the way in one rename.
        clashes = {
            col: f'{col}_original'
            for col in df_prep.columns
            if col in feature_df.columns
        }
        df_prep = df_prep.rename(columns=clashes)

        # feature_df has a fresh 0..n-1 index, so the original index is reset
        # to line the rows up.
        return pd.concat(
            [df_prep.reset_index(drop=True), feature_df],
            axis=1
        )

    def get_feature_names(self):
        """Return the fitted vectorizer's output feature names."""
        return self.vectorizer.get_feature_names_out()

    def get_vocabulary(self):
        """Return the fitted vectorizer's term -> column index mapping."""
        return self.vectorizer.vocabulary_

    def load_vectorizer(self, file_path: str):
        """Unpickle a vectorizer from ``file_path``; return ``None`` if missing.

        SECURITY: ``pickle.load`` can execute arbitrary code - only load files
        from a trusted location.
        """
        # Local import: this module's top-level imports do not include pickle.
        import pickle

        try:
            with open(file_path, 'rb') as f:
                vectorizer = pickle.load(f)
        except FileNotFoundError:
            print(f'File not found: {file_path}')
            vectorizer = None

        return vectorizer

    def save_vectorizer(self, file_path: str):
        """Pickle the current vectorizer to ``file_path``.

        A missing target directory is reported on stdout rather than raised.
        """
        # Local import: this module's top-level imports do not include pickle.
        import pickle

        try:
            with open(file_path, 'wb') as f:
                pickle.dump(self.vectorizer, f)
        except FileNotFoundError:
            print(f'File not found: {file_path}')

### File models.py

In [8]:
import os
import sys
import pickle

from typing import List, Tuple, Dict, Any, Optional, Union

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support, roc_auc_score, roc_curve

import nltk
import spacy

from datasets import Dataset

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

import wandb

class BaselineModel(BaseEstimator, ClassifierMixin):
    """DataFrame-aware wrapper around a scikit-learn classifier.

    The wrapped estimator is trained on the columns listed in
    ``input_column_names`` and, unless ``y`` is passed explicitly, on the
    target found in ``output_column_name``.

    Args:
        from_file: Load a pickled estimator from ``file_path``. If the file
            is missing, the estimator described by ``model`` is used instead.
        file_path: Location of the pickled estimator.
        model: Key of ``model_mapping`` or an estimator instance.
        input_column_names_numeric: Use every column whose name is numeric
            as a feature (resolved on first use).
        input_column_names: Feature column name(s); ignored when
            ``input_column_names_numeric`` is true.
        output_column_name: Target column name.
        device: Stored as-is; not used by this class.
        **kwargs: Forwarded to the estimator constructor when ``model`` is a
            string.
    """

    model_mapping = {
        'MultinomialNB': MultinomialNB,
        'BernoulliNB': BernoulliNB,
        'GaussianNB': GaussianNB,
        'LogisticRegression': LogisticRegression,
        'KNeighborsClassifier': KNeighborsClassifier,
        'SVC': SVC,
        'DecisionTreeClassifier': DecisionTreeClassifier,
        'RandomForestClassifier': RandomForestClassifier,
        'GradientBoostingClassifier': GradientBoostingClassifier
    }

    def __init__(
        self,
        from_file: bool = False,
        file_path: str = '',
        model: Union[str, BaseEstimator] = 'LogisticRegression',
        input_column_names_numeric: bool = False,
        input_column_names: List[str] | str = 'text',
        output_column_name: str = 'label',
        device: str = 'cpu',
        **kwargs
    ) -> None:
        # load_model signals a missing file by returning None (it does not
        # raise), so the fallback is driven by that value.
        loaded_model = self.load_model(file_path) if from_file else None

        if loaded_model is not None:
            self.model = loaded_model
            self.model_name = loaded_model.__class__.__name__
        elif isinstance(model, str):
            self.model = self.model_mapping[model](**kwargs)
            self.model_name = model
        else:
            self.model = model
            self.model_name = model.__class__.__name__

        self.device = device

        self.kwargs = kwargs

        self.input_column_names_numeric = input_column_names_numeric

        if input_column_names_numeric:
            # Resolved lazily from the first frame seen.
            self.input_column_names = None
        elif isinstance(input_column_names, str):
            self.input_column_names = [input_column_names]
        else:
            self.input_column_names = input_column_names

        self.output_column_name = output_column_name

    def get_numeric_columns(self, df: pd.DataFrame):
        """Return the columns of ``df`` whose name consists of digits only."""
        return [col for col in df.columns if str(col).isnumeric()]

    def _select_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return the feature columns of ``df``, resolving numeric names once."""
        if self.input_column_names_numeric and self.input_column_names is None:
            self.input_column_names = self.get_numeric_columns(df)

        return df[self.input_column_names]

    def fit(self, df: pd.DataFrame, y: pd.Series = None):
        """Fit the wrapped estimator; ``y`` defaults to the target column."""
        X = self._select_features(df)
        y = y if y is not None else df[self.output_column_name]

        self.model.fit(X, y)

        return self

    def predict(self, df: pd.DataFrame):
        """Return the wrapped estimator's predictions for ``df``."""
        return self.model.predict(self._select_features(df))

    def score(self, df: pd.DataFrame, y: pd.Series = None):
        """Return the wrapped estimator's score; ``y`` defaults to the target column."""
        X = self._select_features(df)
        y = y if y is not None else df[self.output_column_name]

        return self.model.score(X, y)

    def predict_proba(self, df: pd.DataFrame):
        """Return the wrapped estimator's class probabilities for ``df``."""
        return self.model.predict_proba(self._select_features(df))

    def get_params(self, deep: bool = True):
        """Return the parameters of the wrapped estimator (not of the wrapper)."""
        return self.model.get_params(deep)

    def set_params(self, **params):
        """Set parameters on the wrapped estimator.

        NOTE(review): this returns the wrapped estimator rather than
        ``self``, which differs from the usual scikit-learn convention;
        confirm whether callers rely on it before changing.
        """
        return self.model.set_params(**params)

    def load_model(self, file_path: str):
        """Unpickle an estimator from ``file_path``; return ``None`` if missing.

        SECURITY: ``pickle.load`` can execute arbitrary code - only load files
        from a trusted location.
        """
        try:
            with open(file_path, 'rb') as f:
                model = pickle.load(f)
        except FileNotFoundError:
            print(f'File not found: {file_path}')
            model = None

        return model

    def save_model(self, file_path: str):
        """Pickle the wrapped estimator to ``file_path``.

        A missing target directory is reported on stdout rather than raised.
        """
        try:
            with open(file_path, 'wb') as f:
                pickle.dump(self.model, f)
        except FileNotFoundError:
            print(f'File not found: {file_path}')

class TransformerModel(BaseEstimator, ClassifierMixin):
    """Sequence classifier built on a Hugging Face checkpoint.

    The tokenizer and model are loaded from ``load_path`` at construction
    time and moved to ``device``. ``fit`` fine-tunes with ``Trainer`` and
    reports to Weights & Biases.

    Args:
        load_path: Checkpoint name or directory to load from.
        save_path: Directory used by ``save_models``.
        run_name: Run name passed to ``TrainingArguments``.
        input_column_name: DataFrame column holding the text.
        output_column_name: DataFrame column holding the integer label.
        num_labels: Number of target classes.
        device: Torch device string.
        output_dir: ``TrainingArguments.output_dir``.
        logging_dir: ``TrainingArguments.logging_dir``.
        epochs: Number of training epochs.
        per_device_train_batch_size: Training batch size per device.
        per_device_eval_batch_size: Evaluation batch size per device.
        learning_rate: Optimiser learning rate.
        weight_decay: Optimiser weight decay.
    """

    def __init__(
        self,
        load_path: str = 'bert-base-uncased',
        save_path: str = './saved_models',
        run_name: str = 'bert-base-uncased',
        input_column_name: str = 'text',
        output_column_name: str = 'label',
        num_labels: int = 2,
        device: str = 'cpu',
        output_dir: str = './results',
        logging_dir: str = './logs',
        epochs: int=3,
        per_device_train_batch_size: int = 4,
        per_device_eval_batch_size: int = 4,
        learning_rate: float = 2e-5,
        weight_decay: float = 0.01
    ) -> None:
        self.load_path = load_path
        self.save_path = save_path

        self.num_labels = num_labels
        self.device = device

        # Needs load_path, num_labels and device to be set already.
        self.load_models()

        self.run_name = run_name

        self.input_column_name = input_column_name
        self.output_column_name = output_column_name

        self.output_dir = output_dir
        self.logging_dir = logging_dir

        self.epochs = epochs
        self.per_device_train_batch_size = per_device_train_batch_size
        self.per_device_eval_batch_size = per_device_eval_batch_size
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay

    def load_models(self):
        """Load tokenizer and model from ``load_path`` onto ``device``."""
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.load_path, use_fast=True)
        except Exception:
            # Retry with the slow tokenizer if the fast one cannot be loaded.
            self.tokenizer = AutoTokenizer.from_pretrained(self.load_path, use_fast=False)

        self.model = AutoModelForSequenceClassification.from_pretrained(self.load_path, num_labels=self.num_labels).to(self.device)

    def save_models(self):
        """Write the trainer state (if trained), tokenizer and model to ``save_path``."""
        # ``self.trainer`` only exists after fit(); saving a freshly loaded
        # model must not fail because of it.
        trainer = getattr(self, 'trainer', None)
        if trainer is not None:
            trainer.save_model(self.save_path)

        self.tokenizer.save_pretrained(self.save_path)
        self.model.save_pretrained(self.save_path)

    def tokenize(self, texts: Dict[str, list], max_length: int = 128):
        """Tokenize a batch for ``Dataset.map(..., batched=True)``.

        ``texts`` is normally the batch mapping produced by ``Dataset.map``
        (column name -> list of values); its ``'text'`` column is tokenized.
        A plain string or list of strings is tokenized directly.
        """
        from collections.abc import Mapping

        # The tokenizer accepts strings or lists of strings, not the whole
        # batch mapping, so pull out the text column first.
        batch_texts = texts['text'] if isinstance(texts, Mapping) else texts

        return self.tokenizer(batch_texts, padding=True, truncation=True, max_length=max_length)

    def compute_intermediate_metrics(self, eval_pred: Tuple[np.ndarray, np.ndarray]) -> Dict[str, float]:
        """Compute accuracy and weighted precision/recall/F1 for ``Trainer``.

        ``eval_pred`` is unpacked as ``(logits, labels)``; predictions are the
        argmax over the last logits axis.
        """
        metrics = {}

        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)

        metrics['accuracy'] = accuracy_score(y_true=labels, y_pred=predictions)
        metrics['precision'], metrics['recall'], metrics['f1'], _ = precision_recall_fscore_support(y_true=labels, y_pred=predictions, average="weighted")

        return metrics

    def fit(self, df_train: pd.DataFrame, df_val: pd.DataFrame, y: pd.Series = None):
        """Fine-tune on ``df_train``, evaluating on ``df_val`` every epoch.

        ``y`` is unused; labels are read from ``output_column_name``. The
        trainer is kept in ``self.trainer`` and the final evaluation metrics
        in ``self.eval_results``. Returns ``self``.
        """
        train_dataset = Dataset.from_pandas(pd.DataFrame(
            {
                'text': df_train[self.input_column_name],
                'label': df_train[self.output_column_name]
            }
        ))
        val_dataset = Dataset.from_pandas(pd.DataFrame(
            {
                'text': df_val[self.input_column_name],
                'label': df_val[self.output_column_name]
            }
        ))

        tokenized_train_dataset = train_dataset.map(self.tokenize, batched=True)
        tokenized_val_dataset = val_dataset.map(self.tokenize, batched=True)

        training_args = TrainingArguments(
            run_name=self.run_name,

            output_dir=self.output_dir,
            logging_dir=self.logging_dir,

            report_to="wandb",

            num_train_epochs=self.epochs,
            per_device_train_batch_size=self.per_device_train_batch_size,
            per_device_eval_batch_size=self.per_device_eval_batch_size,
            learning_rate=self.learning_rate,
            weight_decay=self.weight_decay,

            eval_strategy="epoch",
            save_strategy="epoch",

            load_best_model_at_end=True
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=tokenized_train_dataset,
            eval_dataset=tokenized_val_dataset,
            processing_class=self.tokenizer,
            compute_metrics=self.compute_intermediate_metrics
        )

        trainer.train()

        # Keep the final metrics instead of discarding them.
        self.eval_results = trainer.evaluate()

        self.trainer = trainer

        # Finish the wandb run; necessary in notebooks.
        wandb.finish()

        return self

    def compute_predictions_details(self, df: pd.DataFrame):
        """Run inference row by row and return a copy of ``df`` with details.

        Added columns: ``embeddings`` (the tokenizer output for the row, on
        ``device``), ``logits`` (the model's logits tensor) and
        ``predictions`` (argmax of the logits as a Python int).
        """
        df_prep = df.copy()

        # Tokenize each text separately (batch of one per row).
        df_prep['embeddings'] = df_prep[self.input_column_name].apply(
            lambda text: self.tokenizer(
                text=text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(self.device)
        )

        # Inference only: eval mode and no gradient tracking.
        self.model.eval()

        with torch.no_grad():
            df_prep['logits'] = df_prep['embeddings'].apply(lambda inputs: self.model(**inputs).logits)
            df_prep['predictions'] = df_prep['logits'].apply(lambda logits: torch.argmax(logits, dim=-1).item())

        return df_prep

    def predict(self, df: pd.DataFrame):
        """Return the predicted class index for every row of ``df`` (a Series)."""
        df_prep = self.compute_predictions_details(df)

        return df_prep['predictions']

    def predict_proba(self, df: pd.DataFrame):
        """Return per-row class probabilities (a Series of tensors).

        Each element is the softmax of that row's logits, so values lie in
        [0, 1] and sum to 1 over the class axis.
        """
        df_prep = self.compute_predictions_details(df)

        # Raw logits are not probabilities; normalise them over the classes.
        return df_prep['logits'].apply(lambda logits: torch.softmax(logits, dim=-1))

ModuleNotFoundError: No module named 'torch'

### File evaluation.py

In [None]:
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support, roc_auc_score, roc_curve

class MetricsEvaluator(BaseEstimator, TransformerMixin):
    """Compute classification metrics from a frame of labels and predictions.

    Args:
        label_column_name: Column holding the true labels. Predictions are
            read from the ``prediction`` column.
    """

    def __init__(self, label_column_name: str = 'label') -> None:
        self.label_column_name = label_column_name

    def fit(self, df: pd.DataFrame, y=None):
        """Return ``self`` unchanged; nothing is learned from the data.

        ``y`` is accepted and ignored so that ``fit(X, y)`` style calls work.
        """
        return self

    def transform(self, df: pd.DataFrame) -> dict:
        """Return the metrics dictionary for ``df`` (see ``calculate_metrics``)."""
        return self.calculate_metrics(df.copy())

    def calculate_metrics(self, df: pd.DataFrame) -> dict:
        """Compare ``df[label_column_name]`` with ``df['prediction']``.

        Returns:
            dict with keys ``confusion_matrix``, ``classification_report``,
            ``accuracy``, ``precision``, ``recall`` and ``f1_score``; the last
            three are weighted averages.
        """
        y_true = df[self.label_column_name]
        y_pred = df['prediction']

        metrics = {}

        metrics['confusion_matrix'] = confusion_matrix(y_true, y_pred)
        metrics['classification_report'] = classification_report(y_true, y_pred)

        metrics['accuracy'] = accuracy_score(y_true, y_pred)
        metrics['precision'], metrics['recall'], metrics['f1_score'], _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')

        return metrics

### File pipelines.py

In [None]:
import os
import sys
import pickle

from typing import List, Tuple, Dict, Any, Optional, Union

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support, roc_auc_score, roc_curve

# from src.backend.evaluation import MetricsEvaluator

import nltk
import spacy

class PipelineModules(Pipeline):
    """Pipeline that separates DataFrame transformers from a final classifier.

    Steps that are ``TransformerMixin`` instances are collected into an inner
    ``Pipeline`` (``pipeline_transformers``); the first ``ClassifierMixin``
    step becomes ``classifier``.

    Args:
        from_file: Load the steps from a pickle at ``file_path``. If the file
            is missing, the ``steps`` argument is used instead.
        file_path: Location of the pickled pipeline.
        steps: List of ``(name, estimator)`` pairs.
        memory: Forwarded to ``Pipeline``.
        device: Stored as-is for use by callers.
        verbose: Forwarded to ``Pipeline``.
    """

    def __init__(
        self,
        from_file: bool = False,
        file_path: str = '',
        steps: Optional[List[Tuple[str, Any]]] = None,
        memory: str = None,
        device: str = 'cpu',
        verbose: bool = False
    ) -> None:
        # Stored so get_params() (used by repr and clone) can read every
        # constructor argument back.
        self.from_file = from_file
        self.file_path = file_path

        if from_file:
            # load_pipeline returns None for a missing file (it does not
            # raise). save_pipeline pickles the whole pipeline object, so
            # take its steps; a pickled steps list is used directly.
            loaded = self.load_pipeline(file_path)
            if isinstance(loaded, Pipeline):
                steps = loaded.steps
            elif loaded is not None:
                steps = loaded

        # Always initialise the base class so memory/verbose are set on
        # every path, including the from_file one.
        super().__init__(steps, memory=memory, verbose=verbose)

        self.device = device

        if self.steps and len(self.steps) > 0:
            print(self.steps)
            self.pipeline_transformers = self.get_pipeline_transformers()
            self.classifier = self.get_classifier()

    def get_pipeline_transformers(self):
        """Return a ``Pipeline`` made of the transformer steps only."""
        transformers = [
            (step_name, step)
            for step_name, step in self.steps
            if isinstance(step, TransformerMixin)
        ]

        return Pipeline(transformers)

    def get_classifier(self):
        """Return the first classifier step.

        Raises:
            ValueError: If no step is a ``ClassifierMixin``.
        """
        classifiers = [
            step for _, step in self.steps if isinstance(step, ClassifierMixin)
        ]

        if len(classifiers) == 0:
            raise ValueError('No classifier found in the pipeline')

        return classifiers[0]

    def get_feature_names_out(self, input_features=None):
        """Delegate to ``Pipeline.get_feature_names_out``."""
        return super().get_feature_names_out(input_features)

    def fit(self, df: pd.DataFrame, y: pd.Series = None):
        """Fit the transformers, then the classifier on the transformed frame.

        When a step named ``'vectorizer'`` exists, its feature names become
        the classifier's input columns. Returns the transformed training
        frame (not ``self``).
        """
        df_prep = df.copy()

        df_prep = self.pipeline_transformers.fit_transform(df_prep)

        # Only rewire the classifier inputs when there is a vectorizer step;
        # otherwise keep whatever columns the classifier was configured with.
        vectorizer = self.pipeline_transformers.named_steps.get('vectorizer')
        if vectorizer is not None:
            self.classifier.input_column_names = list(vectorizer.get_feature_names())

        self.classifier.fit(df_prep)

        return df_prep

    def transform(self, df: pd.DataFrame):
        """Apply the fitted transformer steps to a copy of ``df``."""
        df_prep = df.copy()

        df_prep = self.pipeline_transformers.transform(df_prep)

        return df_prep

    def fit_transform(self, df: pd.DataFrame, y: pd.Series = None):
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError('PipelineModules does not support fit_transform method')

    def predict(self, df: pd.DataFrame):
        """Return a copy of ``df`` with a ``prediction`` column.

        ``df`` is passed to the classifier as-is; it is not run through the
        transformer steps here.
        """
        df_prep = df.copy()

        df_prep['prediction'] = self.classifier.predict(df_prep)

        return df_prep

    def predict_proba(self, df: pd.DataFrame):
        """Return a copy of ``df`` with the probability of the predicted class.

        The predicted label is used as the column index into the probability
        matrix, so this assumes labels are the integers 0..n_classes-1.
        """
        df_prep = df.copy()

        probabilities = self.classifier.predict_proba(df_prep)

        # One probability per row: the column selected by the predicted label.
        predicted_class_probs = probabilities[np.arange(probabilities.shape[0]), self.classifier.predict(df_prep)]

        df_prep['prediction_proba'] = predicted_class_probs

        return df_prep

    def evaluate(self, df: pd.DataFrame):
        """Return the metrics that ``MetricsEvaluator`` computes for ``df``."""
        df_prep = df.copy()

        evaluator = MetricsEvaluator()

        metrics = evaluator.fit_transform(df_prep)

        return metrics

    def load_pipeline(self, file_path: str):
        """Unpickle a pipeline from ``file_path``; return ``None`` if missing.

        SECURITY: ``pickle.load`` can execute arbitrary code - only load files
        from a trusted location.
        """
        try:
            with open(file_path, 'rb') as f:
                pipeline = pickle.load(f)
        except FileNotFoundError:
            print(f'File not found: {file_path}')
            pipeline = None

        return pipeline

    def save_pipeline(self, file_path: str):
        """Pickle this pipeline object to ``file_path``.

        A missing target directory is reported on stdout rather than raised.
        """
        try:
            with open(file_path, 'wb') as f:
                pickle.dump(self, f)
        except FileNotFoundError:
            print(f'File not found: {file_path}')

## Utils

### File pydantic_models.py

In [None]:
from pydantic import BaseModel

# Request body accepted by the /predict endpoint.
class EmailInput(BaseModel):
    subject: str  # email subject line
    body: str  # email body text
    model_choice: str  # e.g., "nb", "lr", "distilbert", "bert"

# Response body returned by the /predict endpoint.
class PredictionResponse(BaseModel):
    queue: str  # predicted queue name (an entry of label_queue_values)
    priority: str  # predicted priority name (an entry of label_priority_values)
    details: dict  # extra info: model choice, frame shape, predicted-class probabilities

### File labels.py

In [None]:
# Queue names looked up by integer class index: the /predict handler does
# label_queue_values[pred_queue].
# NOTE(review): the order presumably has to match the label encoding produced
# for the 'queue' column during training — confirm before reordering.
label_queue_values = [
    'Customer Service',
    'Technical Support',
    'IT Support',
    'Product Support',
    'Billing and Payments',
    'Service Outages and Maintenance',
    'Human Resources',
    'Returns and Exchanges',
    'Sales and Pre-Sales',
    'General Inquiry'
]

# Priority names looked up the same way: label_priority_values[pred_priority].
# NOTE(review): same ordering assumption as above, for the 'priority' column.
label_priority_values = [
    'Medium',
    'High',
    'Low'
]

### File devices.py

In [None]:
import torch

def get_available_device():
    """Return the torch device name to use: 'cuda' if available, else 'mps', else 'cpu'."""
    if torch.cuda.is_available():
        return 'cuda'
    # The MPS backend is only queried once CUDA has been ruled out.
    if torch.backends.mps.is_available():
        return 'mps'
    return 'cpu'

## API

### File main.py

In [None]:
import os

# Notebook stand-in for a .env file: every setting read by the API and UI
# cells is written into the process environment in one call.
# NOTE(review): the NB_MODEL_* and LR_MODEL_* entries point to the same file
# for queue and priority — confirm this is intended.
os.environ.update({
    # Pickled end-to-end pipelines (LOAD_MODE == 'pipeline').
    "NB_PIPELINE_QUEUE_PATH": "/content/drive/MyDrive/customer-support-tickets/models/baseline/nb_pipeline_queue.pkl",
    "NB_PIPELINE_PRIORITY_PATH": "/content/drive/MyDrive/customer-support-tickets/models/baseline/nb_pipeline_priority.pkl",
    "LR_PIPELINE_QUEUE_PATH": "/content/drive/MyDrive/customer-support-tickets/models/baseline/lr_pipeline_queue.pkl",
    "LR_PIPELINE_PRIORITY_PATH": "/content/drive/MyDrive/customer-support-tickets/models/baseline/lr_pipeline_priority.pkl",
    "BERT_PIPELINE_QUEUE_PATH": "/content/drive/MyDrive/customer-support-tickets/models/transformers/bert-base-uncased_on_queue_for_3_epochs",
    "BERT_PIPELINE_PRIORITY_PATH": "/content/drive/MyDrive/customer-support-tickets/models/transformers/bert-base-uncased_on_priority_for_3_epochs",
    # Endpoint the UI apps post to.
    "API_URL": "http://localhost:8000/predict",
    # Individual artifacts (LOAD_MODE == 'model').
    "VECTORIZER_PATH": "/content/drive/MyDrive/customer-support-tickets/models/baseline/vectorizer.pkl",
    "NB_MODEL_QUEUE_PATH": "/content/drive/MyDrive/customer-support-tickets/models/baseline/nb_model.pkl",
    "NB_MODEL_PRIORITY_PATH": "/content/drive/MyDrive/customer-support-tickets/models/baseline/nb_model.pkl",
    "LR_MODEL_QUEUE_PATH": "/content/drive/MyDrive/customer-support-tickets/models/baseline/lr_model.pkl",
    "LR_MODEL_PRIORITY_PATH": "/content/drive/MyDrive/customer-support-tickets/models/baseline/lr_model.pkl",
    "BERT_TRANSFORMERS_MODEL_QUEUE_PATH": "/content/drive/MyDrive/customer-support-tickets/models/transformers/bert-base-uncased_on_queue_for_3_epochs",
    "BERT_TRANSFORMERS_MODEL_PRIORITY_PATH": "/content/drive/MyDrive/customer-support-tickets/models/transformers/bert-base-uncased_on_priority_for_3_epochs",
    "DISTILBERT_TRANSFORMERS_MODEL_QUEUE_PATH": "/content/drive/MyDrive/customer-support-tickets/models/transformers/distilbert-base-uncased_on_queue_for_3_epochs",
    "DISTILBERT_TRANSFORMERS_MODEL_PRIORITY_PATH": "/content/drive/MyDrive/customer-support-tickets/models/transformers/distilbert-base-uncased_on_priority_for_3_epochs",
    # Behaviour switches read by the /predict handler.
    "LOAD_MODE": 'model',
    "FIT_FLG": 'True',
})

In [None]:
import os

import pandas as pd

from fastapi import FastAPI, HTTPException
import uvicorn

# from src.backend.evaluation import MetricsEvaluator
# from src.backend.preprocessing import SplitterPreprocessor, EmailPreprocessor, ResamplingPreprocessor, TextPreprocessor, LabelPreprocessor, VectorizerPreprocessor
# from src.backend.models import BaselineModel, TransformerModel
# from src.backend.pipelines import PipelineModules
# from src.utils.pydantic_models import EmailInput, PredictionResponse
# from src.utils.labels import label_queue_values, label_priority_values
# from src.utils.devices import get_available_device
# from datasets.data_examples import email_examples
from data_examples import email_examples

import wandb

# from dotenv import load_dotenv

# Log in to Weights & Biases, then load variables from a .env file.
# NOTE(review): the load_dotenv import is commented out in this cell, so this
# relies on the earlier notebook import — confirm for a standalone main.py.
wandb.login()
load_dotenv()

# The bundled example emails are the in-memory data set used by the endpoint.
data_df = pd.DataFrame(email_examples)

# With retrieve='all' the splitter result is unpacked into three frames below.
splitter = SplitterPreprocessor(retrieve='all')

# NOTE(review): the third frame is presumably the test split despite the
# "text" in its name — confirm.
data_prep_train_df, data_prep_val_df, data_prep_text_df = splitter.fit_transform(data_df)


# FastAPI application object that the /predict route is registered on.
app = FastAPI(title="Customer IT Support Prediction API")


def start():
    """Serve the API with uvicorn on 0.0.0.0:8000 with auto-reload enabled.

    The import string points at ``src.api.main:app``, the same target the
    project's pyproject scripts and run_all.sh pass to uvicorn; the previous
    ``my_package.main:app`` placeholder does not exist in this project.
    """
    uvicorn.run("src.api.main:app", host="0.0.0.0", port=8000, reload=True)

# Supported model choices, mapped to the estimator / checkpoint name used when fitting.
_BASELINE_MODELS = {'nb': 'MultinomialNB', 'lr': 'LogisticRegression'}
_TRANSFORMER_MODELS = {'bert': 'bert-base-uncased', 'distilbert': 'distilbert-base-uncased'}

# Environment-variable spellings treated as "true" by _env_flag.
_TRUE_STRINGS = frozenset({'1', 'true', 'yes', 'y', 'on'})

# Suffixes of the per-target environment variables, in (queue, priority) order.
_TARGETS = ('QUEUE', 'PRIORITY')


def _env_flag(name, default=False):
    """Read a boolean environment variable without eval(); unset means ``default``."""
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in _TRUE_STRINGS


def _base_steps():
    """Return fresh preprocessing steps shared by every pipeline built here."""
    return [
        ('email_preprocessor', EmailPreprocessor()),
        ('text_preprocessor', TextPreprocessor()),
    ]


def _load_pipelines(choice, load_mode, device):
    """Restore the (queue, priority) pipelines for ``choice`` from disk.

    ``load_mode == 'pipeline'`` loads whole pickled pipelines;
    ``load_mode == 'model'`` loads the individual artifacts and assembles the
    pipelines around them. Paths come from environment variables named
    ``<CHOICE>_PIPELINE_<TARGET>_PATH``, ``<CHOICE>_MODEL_<TARGET>_PATH``
    (baseline) and ``<CHOICE>_TRANSFORMERS_MODEL_<TARGET>_PATH`` (transformers).

    Raises:
        HTTPException: 500 when ``load_mode`` is neither 'pipeline' nor 'model'.
    """
    is_transformer = choice in _TRANSFORMER_MODELS
    prefix = choice.upper()

    if load_mode == 'pipeline':
        # Only transformer pipelines are given the target device.
        extra_kwargs = {'device': device} if is_transformer else {}
        return tuple(
            PipelineModules(
                from_file=True,
                file_path=os.getenv(f'{prefix}_PIPELINE_{target}_PATH'),
                **extra_kwargs
            )
            for target in _TARGETS
        )

    if load_mode != 'model':
        raise HTTPException(status_code=500, detail=f"Unsupported LOAD_MODE: {load_mode!r}")

    # Baseline models share one vectorizer instance across both pipelines.
    vectorizer = None
    if not is_transformer:
        vectorizer = VectorizerPreprocessor(from_file=True, file_path=os.getenv('VECTORIZER_PATH'))

    pipelines = []
    for target in _TARGETS:
        steps = _base_steps()
        # NOTE(review): every loader now uses the from_file=True form already
        # used by the 'nb' and second 'bert' calls; the 'lr'/'distilbert'
        # branches previously passed model_path= and one 'bert' call had a
        # `from_filte` typo — confirm against the model constructors.
        if is_transformer:
            classifier = TransformerModel(
                from_file=True,
                load_path=os.getenv(f'{prefix}_TRANSFORMERS_MODEL_{target}_PATH'),
                device=device
            )
        else:
            steps.append(('vectorizer', vectorizer))
            classifier = BaselineModel(
                from_file=True,
                file_path=os.getenv(f'{prefix}_MODEL_{target}_PATH')
            )
        steps.append(('classifier', classifier))
        pipelines.append(PipelineModules(steps=steps))

    return tuple(pipelines)


def _fit_pipelines(choice, device):
    """Build and fit fresh (queue, priority) pipelines on ``data_prep_train_df``."""
    fitted = []
    for label_column in ('queue', 'priority'):
        steps = _base_steps()
        steps.append(('label_preprocessor', LabelPreprocessor(label_column_name=label_column)))
        if choice in _BASELINE_MODELS:
            steps.append(('vectorizer', VectorizerPreprocessor()))
            classifier = BaselineModel(model=_BASELINE_MODELS[choice])
        else:
            classifier = TransformerModel(model=_TRANSFORMER_MODELS[choice], device=device)
        # The classifier is always wrapped in a (name, step) tuple; the old
        # inline conditional produced a bare model for one of the two choices.
        steps.append(('classifier', classifier))

        pipeline = PipelineModules(steps=steps)
        pipeline.fit(data_prep_train_df)
        fitted.append(pipeline)

    return tuple(fitted)


def _predict_label(pipeline, df_email):
    """Run one pipeline on the email frame.

    Returns:
        (predicted class index, probability of that class, resulting frame).
    """
    df_prep = pipeline.transform(df_email)
    df_prep = pipeline.predict(df_prep)
    label_index = df_prep['prediction'].values[0]
    df_prep = pipeline.predict_proba(df_prep)
    label_proba = df_prep['prediction_proba'].values[0]
    return label_index, label_proba, df_prep


@app.post("/predict", response_model=PredictionResponse)
def predict(email: EmailInput):
    """Classify one email into a support queue and a priority.

    Reads ``LOAD_MODE`` ('pipeline' or 'model') and ``FIT_FLG`` from the
    environment. When ``FIT_FLG`` is true the pipelines are fitted on the
    in-memory training frame for this request and nothing is loaded from
    disk; otherwise they are restored according to ``LOAD_MODE``.

    Args:
        email: Subject, body and model choice ('nb', 'lr', 'bert', 'distilbert').

    Returns:
        PredictionResponse with the queue name, priority name and details
        (model choice, shape of the last processed frame, and the probability
        of each predicted class).

    Raises:
        HTTPException: 400 for an unsupported model choice or when a loaded
            pipeline has no steps; 500 for an unsupported ``LOAD_MODE``.
    """
    load_mode = os.getenv('LOAD_MODE')
    fit_flg = _env_flag('FIT_FLG')

    # Exact match only: a substring test on "bert" would let unsupported
    # names through and fail later with an unbound pipeline.
    choice = email.model_choice.lower()
    if choice in _BASELINE_MODELS:
        device = None
    elif choice in _TRANSFORMER_MODELS:
        device = get_available_device()
    else:
        raise HTTPException(status_code=400, detail="Invalid model choice")

    # Single-row frame in the layout the preprocessing steps are given.
    df_email = pd.DataFrame(
        {'language': 'en', 'subject': email.subject, 'body': email.body},
        index=[0]
    )

    if fit_flg:
        pipeline_queue, pipeline_priority = _fit_pipelines(choice, device)
    else:
        pipeline_queue, pipeline_priority = _load_pipelines(choice, load_mode, device)
        if pipeline_queue.steps is None or pipeline_priority.steps is None:
            raise HTTPException(status_code=400, detail="Pipeline not found")

    pred_queue, pred_queue_proba, _ = _predict_label(pipeline_queue, df_email)
    pred_priority, pred_priority_proba, df_prep_email = _predict_label(pipeline_priority, df_email)

    details = {
        "model": email.model_choice,
        "dataframe_shape": df_prep_email.shape,
        "predict_proba_queue": pred_queue_proba.tolist(),
        "predict_proba_priority": pred_priority_proba.tolist()
    }

    # Predicted class indices are positions in the label tables.
    pred_queue_name = label_queue_values[pred_queue]
    pred_priority_name = label_priority_values[pred_priority]

    return PredictionResponse(queue=pred_queue_name, priority=pred_priority_name, details=details)

In [None]:
# !fastapi dev main.py

In [None]:
# Call the endpoint function directly (no HTTP server) with a sample email.
dict_payload = dict(
    subject="Meeting Reminder",
    body="Hello, the meeting is at 3 PM. Confirm attendance please.",
    model_choice="nb",
)
object_payload = EmailInput(**dict_payload)
response = predict(object_payload)

response

## UI

### File gradio_app.py

In [None]:
import os
import gradio as gr
import requests
# from src.utils.pydantic_models import EmailInput, PredictionResponse

# URL of the locally running API server
API_URL = os.getenv("API_URL", "http://localhost:8000/predict")

def predict_email(subject, body, model_choice):
    """Send one email to the prediction API and format the reply as text.

    Args:
        subject: Email subject line.
        body: Email body text.
        model_choice: Model identifier forwarded to the API.

    Returns:
        A "Queue / Priority / Details" summary on HTTP 200, otherwise a
        string starting with "Error:" carrying the response text or the
        request failure.
    """
    payload = {"subject": subject, "body": body, "model_choice": model_choice}
    try:
        response = requests.post(API_URL, json=payload)
    except requests.RequestException as exc:
        # API unreachable / connection dropped: report it like any other
        # error instead of raising inside the UI callback.
        return f"Error: {exc}"
    # In the notebook, predict(EmailInput(**payload)) can be called directly instead.
    if response.status_code != 200:
        return f"Error: {response.text}"
    data = response.json()
    return f"Queue: {data['queue']}\nPriority: {data['priority']}\nDetails: {data['details']}"

# Gradio form: subject and body text boxes plus a model selector, wired to
# predict_email; the result is shown as plain text.
iface = gr.Interface(
    title="Customer IT Support Email Classifier",
    fn=predict_email,
    inputs=[
        gr.Textbox(label="Subject"),
        gr.Textbox(label="Body", lines=4),
        gr.Radio(label="Model Choice", choices=["nb", "lr", "distilbert", "bert"]),
    ],
    outputs="text",
)

# Only start the web UI when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()

### File streamlit_app.py

In [None]:
import streamlit as st
import requests
import os

# URL of the prediction API; defaults to the locally running server.
API_URL = os.getenv("API_URL", "http://localhost:8000/predict")

st.title("Customer IT Support Email Classifier")  # Matching Gradio's title

subject = st.text_input("Subject")  # Simplified label
body = st.text_area("Body", height=150)  # Adjust height as needed
model_choice = st.radio("Model Choice", options=["nb", "lr", "distilbert", "bert"])  # Using radio buttons

if st.button("Classify"):
    payload = {"subject": subject, "body": body, "model_choice": model_choice}
    try:
        response = requests.post(API_URL, json=payload)
    except requests.RequestException as exc:
        # API unreachable: show the failure in the page instead of a traceback.
        st.error(f"Error: {exc}")
    else:
        if response.status_code == 200:
            data = response.json()
            # One call per line: st.write renders strings as Markdown, where a
            # single "\n" does not start a new line.
            st.write(f"Queue: {data['queue']}")
            st.write(f"Priority: {data['priority']}")
            st.write(f"Details: {data['details']}")
        else:
            st.error(f"Error: {response.text}")  # Displaying error details

In [None]:
!streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py

## Usage Examples

In [None]:
# Stage-by-stage preprocessing pipelines for the 'queue' label; each stage is
# its own Pipeline so it can be run and inspected separately.

# Email-level preprocessing followed by the (disabled: resample_mode=None) resampling step.
pipe_prep_filter_queue = Pipeline(
    steps=[
        ('email_preprocessor', EmailPreprocessor()),
        ('resampling_preprocessor', ResamplingPreprocessor(label_columns='queue', resample_mode=None)),
    ]
)

# Data set splitting.
pipe_prep_split_queue = Pipeline(steps=[('splitter', SplitterPreprocessor())])

# Text preprocessing.
pipe_prep_text_queue = Pipeline(steps=[('text_preprocessor', TextPreprocessor())])

# Text vectorization.
pipe_prep_vectorize_text_queue = Pipeline(steps=[('vectorizer', VectorizerPreprocessor())])

# Label preprocessing for the 'queue' column.
pipe_prep_label_queue = Pipeline(
    steps=[('label_preprocessor', LabelPreprocessor(label_column_name='queue'))]
)

In [None]:
from data_examples import email_examples

# Load the bundled example emails into a DataFrame and display it.
data_df = pd.DataFrame(email_examples)

data_df

In [None]:
# Split a copy of the examples and preview the first rows of each part.
data_prep_df = data_df.copy()
data_prep_train_df, data_prep_val_df, data_prep_text_df = pipe_prep_split_queue.fit_transform(data_prep_df)

for _title, _frame in (
    ('Train Dataset:', data_prep_train_df),
    ('Validation Dataset:', data_prep_val_df),
    ('Test Dataset:', data_prep_text_df),
):
    print(_title)
    display(_frame.head())

In [None]:
# Run the training split through each preprocessing stage in order:
# filter -> text -> label -> vectorize.
for _stage in (
    pipe_prep_filter_queue,
    pipe_prep_text_queue,
    pipe_prep_label_queue,
    pipe_prep_vectorize_text_queue,
):
    data_prep_train_df = _stage.fit_transform(data_prep_train_df)

data_prep_train_df

In [None]:
# End-to-end example: logistic-regression pipeline for the 'queue' label.
pipeline_lr_queue = PipelineModules(steps=[
    ('email_preprocessor', EmailPreprocessor()),
    ('resampling_preprocessor', ResamplingPreprocessor(label_columns='queue', resample_mode=None)),
    ('text_preprocessor', TextPreprocessor()),
    ('vectorizer', VectorizerPreprocessor()),
    ('label_preprocessor', LabelPreprocessor(label_column_name='queue')),
    ('classifier', BaselineModel(model='LogisticRegression'))
])

display(pipeline_lr_queue)

splitter = SplitterPreprocessor(retrieve='all')

data_prep_train_df, data_prep_val_df, data_prep_text_df = splitter.fit_transform(data_prep_df)

# fit() returns the preprocessed training frame, which is then scored.
data_prep_train_df = pipeline_lr_queue.fit(data_prep_train_df)

data_prep_train_df = pipeline_lr_queue.predict(data_prep_train_df)

metrics = pipeline_lr_queue.evaluate(data_prep_train_df)

print('Metrics on Train Dataset:', metrics)

data_prep_val_df = pipeline_lr_queue.transform(data_prep_val_df)

data_prep_val_df = pipeline_lr_queue.predict(data_prep_val_df)

# Evaluate the validation predictions; previously the train metrics were printed again.
metrics = pipeline_lr_queue.evaluate(data_prep_val_df)

print('Metrics on Validation Dataset:', metrics)

In [None]:
# End-to-end example: multinomial naive Bayes pipeline for the 'queue' label.
pipeline_nb_queue = PipelineModules(steps=[
    ('email_preprocessor', EmailPreprocessor()),
    ('resampling_preprocessor', ResamplingPreprocessor(label_columns='queue', resample_mode=None)),
    ('text_preprocessor', TextPreprocessor()),
    ('vectorizer', VectorizerPreprocessor()),
    ('label_preprocessor', LabelPreprocessor(label_column_name='queue')),
    ('classifier', BaselineModel(model='MultinomialNB'))
])

display(pipeline_nb_queue)

splitter = SplitterPreprocessor(retrieve='all')

data_prep_train_df, data_prep_val_df, data_prep_text_df = splitter.fit_transform(data_prep_df)

# fit() returns the preprocessed training frame, which is then scored.
data_prep_train_df = pipeline_nb_queue.fit(data_prep_train_df)

data_prep_train_df = pipeline_nb_queue.predict(data_prep_train_df)

# Compute the metrics; previously the pipeline object itself was assigned and printed.
metrics = pipeline_nb_queue.evaluate(data_prep_train_df)

print('Metrics on Train Dataset:', metrics)

data_prep_val_df = pipeline_nb_queue.transform(data_prep_val_df)

data_prep_val_df = pipeline_nb_queue.predict(data_prep_val_df)

# Evaluate the validation predictions before reporting them.
metrics = pipeline_nb_queue.evaluate(data_prep_val_df)

print('Metrics on Validation Dataset:', metrics)

## Poetry

### File pyproject.toml

    [tool.poetry]
    name = "customer_it_support"
    version = "0.1.0"
    description = "Email classification system for customer IT support using ML and Transformers"
    authors = ["Your Name <your.email@example.com>"]
    license = "MIT"
    readme = "README.md"
    packages = [{include = "src"}]

    [tool.poetry.dependencies]
    python = ">=3.11.6,<3.12"

    # Backend (API & ML)
    fastapi = "^0.115.11"
    uvicorn = "^0.34.0"
    torch = "^2.6.0"
    transformers = "^4.49.0"
    datasets = "^3.3.2"
    scikit-learn = "^1.6.1"
    nltk = "^3.9.1"
    spacy = "^3.8.4"
    wordcloud = "^1.9.4"
    wandb = "^0.19.8"
    accelerate = "^1.4.0"

    # Data Processing & Visualization
    pandas = "^2.2.3"
    numpy = "^2.2.3"
    matplotlib = "^3.10.1"
    seaborn = "^0.13.2"

    # UI Apps
    gradio = "^5.20.0"
    streamlit = "^1.43.0"

    # Environment & Config
    python-dotenv = "^1.0.1"
    ipywidgets = "^8.1.5"

    [tool.poetry.dev-dependencies]
    pytest = "^7.0"
    black = "^24.0"
    flake8 = "^6.0"
    mypy = "^1.0"

    # Shortcuts intended to be run as `poetry run <name>`.
    # NOTE(review): Poetry script entries are normally "package.module:callable"
    # references; the values below are shell command lines — confirm they work
    # as written or replace them with callable entry points.
    [tool.poetry.scripts]
    start-api = "uvicorn src.api.main:app --host 0.0.0.0 --port 8000 --reload"
    start-gradio = "python src/ui/gradio_app.py"
    start-streamlit = "streamlit run src/ui/streamlit_app.py"
    start-all = "bash run_all.sh"

    [build-system]
    requires = ["poetry-core"]
    build-backend = "poetry.core.masonry.api"

## Bash

### File run_all.sh

    #!/bin/bash
    # Start the API and the Gradio UI as background jobs (trailing `&`), then
    # run the Streamlit UI in the foreground.
    poetry run uvicorn src.api.main:app --host 0.0.0.0 --port 8000 --reload &
    poetry run python src/ui/gradio_app.py &
    poetry run streamlit run src/ui/streamlit_app.py

    # Usage (presumably run from the project root in a shell, not part of run_all.sh itself):
    chmod +x run_all.sh

    # poetry run start-api
    # poetry run start-gradio
    # poetry run start-streamlit
    poetry run start-all

## References

- [Getting Started with scikit-learn Pipelines for Machine Learning](https://medium.com/analytics-vidhya/getting-started-with-scikit-learn-pipelines-for-machine-learning-fa88efdca3b9)