In [None]:
import re

# Vocabulary that should be dropped from the text (matched as whole words).
words_to_remove = ['gargamella', 'doggy', 'word3']
words_pattern = r'\b(?:' + '|'.join(map(re.escape, words_to_remove)) + r')\b'

# Regex -> replacement table. The dict's insertion order IS the order in which
# the substitutions are applied, so the entries are arranged deliberately:
#   * URLs and the specific words are removed while the text is still intact
#     (before digits/punctuation are stripped), otherwise an entry such as
#     'word3' has already lost its digit and can never match;
#   * whitespace is collapsed last, because every removal above can leave
#     runs of spaces behind.
patterns = {
    r"[a-zA-Z]+n\'t": 'not',     # Replace a whole "...n't" contraction (e.g. "don't") with "not"
    r'(http|www)[^\s]+': '',     # Remove websites
    words_pattern: '',           # Remove specific words
    r'\d+': '',                  # Remove digits
    r'[^\w\s]': '',              # Remove punctuation and symbols
    r'\b\w{1,2}\b': '',          # Remove tokens that are only 1 or 2 characters long
    r'\s+': ' ',                 # Substitute multiple spaces with a single space
}

def clean_column(df, column, patterns):
    """Lowercase ``df[column]`` and run every regex substitution over it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    column : str
        Name of the column to clean.
    patterns : dict
        Mapping of regex pattern -> replacement string; entries are applied
        one after another in the dict's iteration order.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column`` overwritten by the cleaned text.
    """
    # Start from the lowercased text, then chain the substitutions on a local
    # Series and write the final result back to the frame.
    cleaned = df[column].str.lower()
    for regex, substitute in patterns.items():
        cleaned = cleaned.str.replace(regex, substitute, regex=True)
    df[column] = cleaned
    return df

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextCleaner(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that applies regex clean-up to raw text.

    Parameters
    ----------
    patterns : dict
        Mapping of regex pattern -> replacement string. It is passed as-is to
        ``clean_column``, which applies the entries in iteration order.
    """

    def __init__(self, patterns):
        self.patterns = patterns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Clean every text in ``X`` and return the cleaned values.

        Parameters
        ----------
        X : list, array, pandas.Series or pandas.DataFrame
            The raw texts. A DataFrame is expected to carry them in a column
            named ``'text'``.

        Returns
        -------
        The ``.values`` of the cleaned ``'text'`` column.
        """
        if isinstance(X, pd.Series):
            # A *named* Series handed to ``pd.DataFrame(X, columns=['text'])`` is
            # treated like ``{name: series}``; when the name is not 'text' the
            # column lookup finds nothing and the texts are silently dropped.
            # Naming the column explicitly works for any Series name.
            df = X.to_frame(name='text')
        else:
            df = pd.DataFrame(X, columns=['text'])
        df = clean_column(df, 'text', self.patterns)
        return df['text'].values

In [None]:
# --- Individual pipeline stages -------------------------------------------
text_cleaner = TextCleaner(patterns)              # regex-based text clean-up
bow = CountVectorizer(
    stop_words="english",
    max_features=15000,
)                                                 # bag-of-words counts
tfidf = TfidfTransformer()                        # counts -> TF-IDF weights
clf = MultinomialNB(alpha=0.1)                    # Naive Bayes classifier

# --- Full model: clean -> BoW -> TF-IDF -> classifier ---------------------
model_pipe = Pipeline(steps=[
    ('text cleaner', text_cleaner),
    ('bow', bow),
    ('tfidf', tfidf),
    ('classifier', clf),
])