In [1]:
import pandas as pd
import numpy as np
import string
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# NOTE: this cell expects `X` (an iterable of raw text documents) and `y`
# (their class labels) to already exist in the kernel; neither is defined here.
n_gram = 1
SEED = 42   # fixed seed so the forests (and the importances below) are reproducible
top_n = 25  # how many of the highest-ranked features to print per class

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, n_gram))),
    ('rf', RandomForestClassifier(random_state=SEED))
])
tfidf = pipeline.named_steps['tfidf']

# Inspect what the vectorizer keeps: fit it on its own, then map every row's
# non-zero columns back to the terms they stand for.
tokenized = tfidf.fit_transform(X)
tokenized_text = tfidf.inverse_transform(tokenized)
print(tokenized_text[:110])

# Fit the full pipeline (this refits the vectorizer on the same X).
pipeline.fit(X, y)

# Column index -> term, aligned with the forests' feature_importances_.
feature_names = tfidf.get_feature_names_out()

# The TF-IDF matrix is the same for every one-vs-rest model, so build it once.
X_tfidf = tfidf.transform(X)
# Element-wise comparison below needs an array; a plain list would compare as a whole.
y_array = np.asarray(y)

# One-vs-rest forests: per class, rank the terms that separate it from the rest.
importances_by_class = {}
oob_score_by_class = {}
for class_label in pipeline.named_steps['rf'].classes_:
    print('Class {}'.format(class_label))
    y_binary = (y_array == class_label)
    # oob_score=True requests the out-of-bag score; it is exposed afterwards
    # as `oob_score_`.
    rf_binary = RandomForestClassifier(oob_score=True, random_state=SEED)
    rf_binary.fit(X_tfidf, y_binary)
    importances_by_class[class_label] = dict(
        sorted(zip(feature_names, rf_binary.feature_importances_),
               key=lambda pair: pair[1], reverse=True))
    # Keep each model's score; `rf_binary` is overwritten on the next iteration.
    oob_score_by_class[class_label] = rf_binary.oob_score_

# Print the top features for each class
for class_label, importances in importances_by_class.items():
    print(f"\nClass: {class_label}")
    for feature_name, importance in list(importances.items())[:top_n]:
        print(f"{feature_name}: {importance}")

# Report the out-of-bag score of every one-vs-rest model, not just the last one
for class_label, oob_score in oob_score_by_class.items():
    print(f"oob score ({class_label} vs rest): {oob_score}")

In [2]:
class TextStats(BaseEstimator, TransformerMixin):
    """Stateless transformer mapping each text to four surface statistics.

    Output columns, in order: word count, character count, punctuation
    count and uppercase ratio. Each element of the input is expected to be
    a ``str``.
    """

    # Column names, in the order `transform` emits them.
    _FEATURE_NAMES = ("word_count", "char_count", "punctuation_count", "uppercase_ratio")

    def fit(self, X, y=None):
        """Learn nothing (the statistics are computed per text); return ``self``."""
        return self

    def transform(self, X):
        """Compute the statistics for every text in ``X``.

        Parameters
        ----------
        X : iterable of str
            The texts to describe.

        Returns
        -------
        numpy.ndarray of float, shape (n_texts, 4)
            One row per text. The result is 2-D even when ``X`` is empty, so
            it can always be stacked column-wise with other feature blocks.
        """
        rows = [
            [
                len(text.split()),                            # word count
                len(text),                                    # char count
                sum(c in string.punctuation for c in text),   # punctuation count
                # the small epsilon keeps the ratio defined for empty strings
                sum(1 for c in text if c.isupper()) / (len(text) + 1e-5),
            ]
            for text in X
        ]
        # reshape pins the column count so empty input yields (0, 4), not (0,)
        return np.array(rows, dtype=float).reshape(-1, len(self._FEATURE_NAMES))

    def get_feature_names_out(self, input_features=None):
        """Return the names of the output columns.

        ``input_features`` is accepted for API compatibility and ignored.
        """
        return np.asarray(self._FEATURE_NAMES, dtype=object)

In [3]:
# Load the labelled texts.
df = pd.read_excel('excel tables/Classifier_text_with_categories.xlsx')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["category"],
    test_size=0.3,
    random_state=42,
)

# Features: TF-IDF terms side by side with the hand-crafted text statistics.
tfidf_step = ('tfidf', TfidfVectorizer(lowercase=True, stop_words='english', max_features=1000))
textstats_step = ('textstats', TextStats())
feature_union = FeatureUnion([tfidf_step, textstats_step])

forest = RandomForestClassifier(n_estimators=100, random_state=42)

pipeline = Pipeline([
    ('features', feature_union),
    ('clf', forest),
])
pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

                      precision    recall  f1-score   support

    1. Title/Caption       0.95      0.98      0.96       307
2. Short Explanation       0.74      0.57      0.65        40
        3. Full Text       0.93      0.87      0.90        15

            accuracy                           0.93       362
           macro avg       0.87      0.81      0.84       362
        weighted avg       0.93      0.93      0.93       362

