Imports

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from tqdm import tqdm
import plotly.figure_factory as ff
import plotly.io as pio
import plotly.graph_objects as go

importing pre-processed pickle

In [2]:
# Columns needed for modelling: the target plus the three text representations
columns_to_import = ['issue', 'tokens', 'stems', 'lemmas']

# Read the pre-processed pickle and keep only those columns.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
complaints = pd.read_pickle('../data/complaints_nlp.pkl')[columns_to_import]

In [4]:
# Spot-check: show the list of lemmas stored for the row labelled 250
complaints.loc[250,'lemmas']

['feel',
 'behind',
 'tried',
 'get',
 'emailed',
 'faxed',
 'many',
 'time',
 'still',
 'able',
 'get',
 'done',
 'made',
 'many',
 'phone',
 'call',
 'still',
 'said',
 'time',
 'manager',
 'call',
 'back',
 'never',
 'email',
 'paper',
 'work',
 'afraid',
 'loosing',
 'home',
 'need',
 'explained',
 'still',
 'one',
 'reached',
 'back',
 'mei',
 'copy',
 'email',
 'please',
 'help']

The models described below use a "bag of words" approach.

-----------------
### Token Models
> Not Fruitful.  Focus on Lemmatised and Stemmed.  **Should I even be doing this on the tokens!?!?!?** I expect that there is an issue with the size of the matrices that I'm working with.  Perhaps focusing on dimension reduction is where I need to go.
_____________________

#### BOW Tokens: - Count Vectorizer

In [None]:
# Build a bag-of-words count matrix from the raw tokens.
# min_df=2 drops every term that appears in fewer than two complaints.
vectorizer_tokens = CountVectorizer(min_df=2)

# CountVectorizer expects one string per document, so re-join each token list.
X_tokens = vectorizer_tokens.fit_transform(complaints['tokens'].apply(lambda x: ' '.join(x)))

# Wrap the sparse matrix in a sparse DataFrame.
# The frame reuses complaints.index so each row stays attached to its own
# 'issue' label: pd.concat(axis=1) aligns on index labels, and a fresh
# 0..n-1 index would misalign (or NaN-pad) rows whenever complaints' index
# is not exactly 0..n-1.
bag_of_words_tokens_df = pd.DataFrame.sparse.from_spmatrix(
    X_tokens,
    index=complaints.index,
    columns=vectorizer_tokens.get_feature_names_out(),
)

# Concatenate the bag-of-words DataFrame with the 'issue' column.
# NOTE(review): if the word "issue" is itself in the vocabulary, the result
# holds two columns named 'issue' (target + term count) — verify before
# selecting or dropping by that name.
bag_of_words_tokens_df = pd.concat([complaints['issue'], bag_of_words_tokens_df], axis=1)

#### BOW Tokens: Train/Test Split

In [None]:
# Features: the SciPy sparse count matrix straight from CountVectorizer.
# scikit-learn estimators and train_test_split accept it directly, so there
# is no need to route the data through a (much heavier) pandas sparse frame.
X = X_tokens

# Target: taken from the source frame rather than the combined bag-of-words
# frame, where a vocabulary term spelled "issue" would create a second
# 'issue' column and make the selection ambiguous.
y = complaints['issue']

# 80/20 split with a fixed seed.  The former tqdm(...) wrapper was removed:
# the split had already finished by the time tqdm received its result, so it
# reported no progress.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

#### BOW Tokens: Naive Bayes Model

In [None]:
# Build and train the multinomial Naive Bayes classifier in one step
# (fit() hands back the fitted estimator, so the calls can be chained)
nb_model = MultinomialNB().fit(X_train, y_train)

# Predict issue labels for the held-out split
y_pred = nb_model.predict(X_test)

It keeps yelling at me for having a sparse matrix when fitting the model . . . When I try to go dense, it crashes the kernel.  I even tried to chunk the process, but no joy.  I thought I needed a sparse matrix.  Dense is several GB in allocation.  If I had hair, I'd pull it :-(

---------------
### Stem Models
_____________

#### BOW Stems: CountVectorizer Analysis

In [None]:
# Build a bag-of-words count matrix from the stems.
# min_df=3 keeps only stems that appear in at least three complaints.
vectorizer_stems = CountVectorizer(min_df=3)

# CountVectorizer expects one string per document, so re-join each stem list.
X_stems = vectorizer_stems.fit_transform(complaints['stems'].apply(lambda x: ' '.join(x)))

# Wrap the sparse matrix in a sparse DataFrame.
# The frame reuses complaints.index so each row stays attached to its own
# 'issue' label: pd.concat(axis=1) aligns on index labels, and a fresh
# 0..n-1 index would misalign (or NaN-pad) rows whenever complaints' index
# is not exactly 0..n-1.
bow_stems = pd.DataFrame.sparse.from_spmatrix(
    X_stems,
    index=complaints.index,
    columns=vectorizer_stems.get_feature_names_out(),
)

# Concatenate the bag-of-words DataFrame with the 'issue' column.
# NOTE(review): if the stem "issue" is itself in the vocabulary, the result
# holds two columns named 'issue' (target + term count).
bow_stems = pd.concat([complaints['issue'], bow_stems], axis=1)

# Preview the first rows; a bare last expression uses the notebook's rich
# table display instead of a plain-text dump.
print("Bag-of-Words Stems:")
bow_stems.head()

With no limit placed on document frequency, the stems vectorizer produces a matrix of **75,905** dimensions.  
- With `min_df=2` (a stem must appear in at least 2 complaints), dimensions were reduced to **36,811**.  This means: 
    - There are **39,094** word stems which appear in only one complaint.
    - These single-complaint stems represent **51.50%** of the dimensions in the unfiltered matrix.
    - **This may also indicate that tokens derived from the corpus require additional processing before stemming so that more words may be salvaged.** 
- With `min_df=3` (a stem must appear in at least 3 complaints), the dimensions were reduced to **26,067**.  This means that:
    - There was a **29.19%** dimension reduction from `min_df=2`.
    - **34.34%** of stems appear in 3 or more complaints.
    - **65.66%** of stems appear in fewer than 3 complaints.

#### BOW Stems: TF-IDF Analysis

In [None]:
# TF-IDF vectorizer for the stems.
# min_df=1 keeps every stem, so the vocabulary (and therefore the number of
# columns) is the same as an unfiltered CountVectorizer: TF-IDF re-weights
# the counts, it does not remove terms.
tfidf = TfidfVectorizer(min_df=1)

# Fit and transform the stemmed text (one re-joined string per complaint).
X_stems = tfidf.fit_transform(complaints['stems'].apply(lambda x: ' '.join(x)))

# Wrap the sparse matrix in a sparse DataFrame.
# The frame reuses complaints.index so each row stays attached to its own
# 'issue' label: pd.concat(axis=1) aligns on index labels, and a fresh
# 0..n-1 index would misalign (or NaN-pad) rows whenever complaints' index
# is not exactly 0..n-1.
tfidf_stems = pd.DataFrame.sparse.from_spmatrix(
    X_stems,
    index=complaints.index,
    columns=tfidf.get_feature_names_out(),
)

# Concatenate the TF-IDF DataFrame with the 'issue' column.
# NOTE(review): if the stem "issue" is itself in the vocabulary, the result
# holds two columns named 'issue' (target + term weight).
tfidf_stems = pd.concat([complaints['issue'], tfidf_stems], axis=1)

# Preview the first rows; a bare last expression uses the notebook's rich
# table display instead of a plain-text dump.
print("TF-IDF Stems:")
tfidf_stems.head()

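Applying TF-IDF to the stems has no effect on dimensionality.  That is expected here: TF-IDF only re-weights term counts, and with `min_df=1` no terms are filtered out, so the vocabulary is the same size as the unfiltered CountVectorizer's.  Again, do I need more preprocessing, or am I missing something else along the way?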

#### BOW Tokens - Train/Test Split (include later if necessary)

--------------
### Lemma Models
____________

With no limit placed on document frequency, the **lemmas CountVectorizer** produces a matrix of **94,020** dimensions.  This means that the `lemma` matrix has **18,115** more dimensions than the `stems` matrix in its current state.  
- With `min_df=2` (a lemma must appear in at least 2 complaints), dimensions were reduced to **47,258**.  This means: 
    - There are **46,762** lemmas which appear in only one complaint.
    - These single-complaint lemmas represent **49.74%** of the dimensions in the unfiltered matrix (the remaining **50.26%** are kept). 
- With `min_df=3` (a lemma must appear in at least 3 complaints), the dimensions were reduced to **34,220**.  This means that:
    - There was a **27.59%** dimension reduction from `min_df=2`.
    - **36.40%** of lemmas appear in 3 or more complaints.
    - **63.60%** of lemmas appear in fewer than 3 complaints.

In [5]:
# Target: the issue category of every complaint
y = complaints['issue']

# Features: one whitespace-joined string of lemmas per complaint
X = complaints['lemmas'].map(' '.join)

# Stratified, seeded split so each issue class keeps its share in both parts
# (no test_size is given, so scikit-learn's default applies)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=321,
)

#### BOW Lemmas: TF-IDF with LinearSVC

In [6]:
# Minimum number of training documents a lemma must appear in to be kept
occurance = 2

# Instantiate TfidfVectorizer:
tfidf = TfidfVectorizer(min_df = occurance)

# Learn the vocabulary and IDF weights on the training split only, then apply
# the same mapping to the test split so no test information leaks into the
# features.
X_train_lemmas_tfidf = tfidf.fit_transform(X_train)
X_test_lemmas_tfidf = tfidf.transform(X_test)

# Linear support-vector classifier.
# random_state is fixed (same seed as the train/test split) so that repeated
# runs give the same model whenever the solver shuffles the data.
model = LinearSVC(random_state=321)

# Train
model.fit(X_train_lemmas_tfidf, y_train)

# Predict
y_pred = model.predict(X_test_lemmas_tfidf)

# Per-class precision / recall / F1 on the held-out split
report = classification_report(y_test, y_pred)
print("Consumer Complaint Classification using TF-IDF Lemmas with LinearSVC")
print(report)

Consumer Complaint Classification using TF-IDF Lemmas with LinearSVC
              precision    recall  f1-score   support

           0       0.76      0.66      0.71     18291
           1       0.84      0.76      0.80      5311
           2       0.90      0.86      0.88      3087
           3       0.90      0.95      0.92     57326
           4       0.93      0.91      0.92      4343

    accuracy                           0.87     88358
   macro avg       0.87      0.83      0.85     88358
weighted avg       0.87      0.87      0.87     88358



In [None]:
# Class labels: every label present in the truth or the predictions, sorted.
# Taking the union keeps the axis labels the same length as the matrix even
# if the model predicts a class that is absent from y_test.
class_labels = sorted(set(y_test) | set(y_pred))

# Confusion matrix with an explicit label order, so row/column i is
# guaranteed to correspond to class_labels[i].
confusion_mat = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plotly draws the first heatmap row at the bottom; reverse the rows (and the
# y labels below) so the first class ends up on the top row.
confusion_mat = confusion_mat[::-1]

# String tick labels, paired with type='category' axes below, make Plotly
# keep the labels in the order given instead of treating numeric-looking
# labels as positions on a linear axis (which would undo the reversal).
tick_labels = [str(label) for label in class_labels]

# Create a heatmap figure using Plotly
fig = ff.create_annotated_heatmap(
    z=confusion_mat,
    x=tick_labels,
    y=tick_labels[::-1],  # Reversed to match the reversed rows
    colorscale='Blues',
    showscale=True,
)

# Update the figure layout
fig.update_layout(
    title='Confusion Matrix: TF-IDF Lemmas with LinearSVC',
    xaxis=dict(title='Predicted Labels', type='category'),
    yaxis=dict(title='True Labels', type='category'),
)

# Display the figure
pio.show(fig)

#### BOW Lemmas: TF-IDF with XGBoost

In [None]:
# Keep only lemmas that appear in at least this many training documents
occurrence = 2

# TF-IDF features: vocabulary and IDF weights come from the training split;
# the test split is only transformed with that fitted mapping
tfidf = TfidfVectorizer(min_df=occurrence)
X_train_lemmas_tfidf = tfidf.fit_transform(X_train)
X_test_lemmas_tfidf = tfidf.transform(X_test)

# XGBoost classifier with library defaults; fit() returns the fitted estimator
model = XGBClassifier().fit(X_train_lemmas_tfidf, y_train)

# Score the held-out split
y_pred = model.predict(X_test_lemmas_tfidf)
report = classification_report(y_test, y_pred)

# Heading and report, one after the other
print("Consumer Complaint Classification using TF-IDF Lemmas with XGBoost", report, sep="\n")