# Multiclass Text Classification Development

Purpose:
This model predicts a Company's business category based on the text of their homepage website. 

Hypothesis: 
The implicit hypothesis is that websites within each category will use distinctive language that can be used to classify them.

Overall process:
1. Normalize Text (done during eda.ipynb to complete EDA)
2. Label Encoding
3. Feature Extraction (TFIDF & BERT)
4. Model Training
5. Evaulate best performing model and vectorization method

In [1]:
import pandas as pd
import os 

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

from transformers import BertTokenizer, BertModel
import torch

from sklearn.base import TransformerMixin, BaseEstimator

# read in data

In [2]:
# read data back in from pickle file created with eda.ipynb

# Dynamically get the current working directory
current_dir = os.getcwd()
text_path = os.path.abspath(os.path.join(current_dir, '..', 'output','combined_data.pkl'))

# read data back in 
df_clean = pd.read_pickle(text_path)
df_clean.head()

# Label Encoding

In [4]:
#Turning the labels into numbers
label_encoder = LabelEncoder()
df_clean['Category_encoded'] = label_encoder.fit_transform(df_clean['Category'])
print(df_clean['Category'].unique())
print(df_clean['Category_encoded'].unique())

['Corporate Services' 'Media, Marketing & Sales' 'Healthcare'
 'Industrials' 'Commercial Services & Supplies' 'Consumer Discretionary'
 'Transportation & Logistics' 'Energy & Utilities' 'Financials'
 'Professional Services' 'Consumer Staples' 'Materials'
 'Information Technology']
[ 3 10  6  7  0  1 12  4  5 11  2  9  8]


# Feature Extraction

I'm going to use k-fold cross-validation to evaluate my models later on. 

In [5]:
# split the data into features (X) and labels (y)
X = df_clean['clean_text_str']
y = df_clean['Category_encoded']

print (X.shape)
print(y.shape)

(71415,)
(71415,)


In [9]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define KFold cross-validator
kf = KFold(n_splits=10, shuffle=True, random_state=42) # using the normal 10 folds

In [None]:
# Define the classification models to be tested
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(multi_class='ovr', max_iter=1000),
    'Support Vector Classifier': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100)
}

## Notes:

I've chosen the following models to test: 
1. Naive Bayes
    - This model is extremely fast and in production can be used as an 'online' model (i.e. can be updated in real time)
    - MultinomialNB is usually very good with discrete features like word counts 
    - Can always improve this by adding nontext features
2. Logistic Regression
3. Support Vector Classifier
4. Decision Tree
5. Random Forest

## vectorization with TF-IDF

In [10]:
### 1. TF-IDF Pipeline
tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000, stop_words='english')),  # TF-IDF Vectorization
])

## BERT embeddings

In [11]:
### 2. BERT Embeddings Pipeline (Custom Transformer)
class BERTEmbeddingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model_name='bert-base-uncased', max_length=128):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.max_length = max_length

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        embeddings = []
        for sentence in X:
            tokens = self.tokenizer(sentence, padding='max_length', truncation=True, return_tensors='pt', max_length=self.max_length)
            with torch.no_grad():
                outputs = self.model(**tokens)
            embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
        return embeddings

bert_pipeline = Pipeline([
    ('bert', BERTEmbeddingTransformer()),  # BERT Embeddings
])

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


# Model Evaluation

In [None]:
# Initialize lists to store results for comparison
results = []

# Iterate over the models and compare pipelines (TF-IDF vs BERT embeddings)
for model_name, model in models.items():
    
    ### TF-IDF Pipeline
    tfidf_model_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=1000, stop_words='english')),
        (model_name, model)
    ])

    # Cross-Validation for TF-IDF
    tfidf_scores = cross_val_score(tfidf_model_pipeline, X_train, y_train, cv=kf, scoring='accuracy')
    
    # Train on TF-IDF
    tfidf_model_pipeline.fit(X_train, y_train)
    y_pred_tfidf = tfidf_model_pipeline.predict(X_test)
    tfidf_accuracy = accuracy_score(y_test, y_pred_tfidf)
    tfidf_report = classification_report(y_test, y_pred_tfidf, target_names=label_encoder.classes_, output_dict=True)

    # Store TF-IDF results
    results.append({
        'Model': model_name,
        'Pipeline': 'TF-IDF',
        'Cross_Val_Accuracy': tfidf_scores.mean(),
        'Test_Accuracy': tfidf_accuracy,
        'Precision': tfidf_report['weighted avg']['precision'],
        'Recall': tfidf_report['weighted avg']['recall'],
        'F1-Score': tfidf_report['weighted avg']['f1-score']
    })
    
    ### BERT Pipeline
    bert_model_pipeline = Pipeline([
        ('bert', BERTEmbeddingTransformer()),  # BERT Embeddings
        (model_name, model)
    ])

    # Cross-Validation for BERT
    bert_scores = cross_val_score(bert_model_pipeline, X_train, y_train, cv=kf, scoring='accuracy')
    
    # Train on BERT
    bert_model_pipeline.fit(X_train, y_train)
    y_pred_bert = bert_model_pipeline.predict(X_test)
    bert_accuracy = accuracy_score(y_test, y_pred_bert)
    bert_report = classification_report(y_test, y_pred_bert, target_names=label_encoder.classes_, output_dict=True)

    # Store BERT results
    results.append({
        'Model': model_name,
        'Pipeline': 'BERT',
        'Cross_Val_Accuracy': bert_scores.mean(),
        'Test_Accuracy': bert_accuracy,
        'Precision': bert_report['weighted avg']['precision'],
        'Recall': bert_report['weighted avg']['recall'],
        'F1-Score': bert_report['weighted avg']['f1-score']
    })

# Create DataFrame for all the results
results_df = pd.DataFrame(results)

# Display results for comparison
print(results_df)


In [1]:
# initialize list to store results for comparison
results = []
# Iterate over the models and compare pipelines (TF-IDF vs BERT embeddings)
for model_name, model in models.items():
    print(f"\n=== {model_name} ===")
    
    ### TF-IDF Pipeline
    tfidf_model_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=1000, stop_words='english')),
        (model_name, model)
    ])

    # Cross-Validation for TF-IDF
    tfidf_scores = cross_val_score(tfidf_model_pipeline, X_train, y_train, cv=kf, scoring='accuracy')
    print(f"TF-IDF {model_name} Cross-Validation Accuracy: {tfidf_scores.mean():.4f}")
    
    # Train on TF-IDF
    tfidf_model_pipeline.fit(X_train, y_train)
    y_pred_tfidf = tfidf_model_pipeline.predict(X_test)
    print(f"TF-IDF {model_name} Confusion Matrix of Category Performance:")
    print(classification_report(y_test, y_pred_tfidf, target_names=label_encoder.classes_))

    ### BERT Pipeline
    bert_model_pipeline = Pipeline([
        ('bert', BERTEmbeddingTransformer()),  # BERT Embeddings
        (model_name, model)
    ])

      # Cross-Validation for BERT
    bert_scores = cross_val_score(bert_model_pipeline, X_train, y_train, cv=kf, scoring='accuracy')
    print(f"BERT {model_name} Cross-Validation Accuracy: {bert_scores.mean():.4f}")
    
    # Train on BERT
    bert_model_pipeline.fit(X_train, y_train)
    y_pred_bert = bert_model_pipeline.predict(X_test)
    print(f"BERT {model_name} Confusion Matrix of Category Performance:")
    print(classification_report(y_test, y_pred_bert, target_names=label_encoder.classes_))


NameError: name 'models' is not defined