In [None]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [None]:
# Load the prepared complaints frame and print its column / dtype summary.
# NOTE(review): unpickling can execute arbitrary code; presumably this file is
# produced by this project's own preprocessing step — confirm before loading
# a pickle from any other source.
complaints = pd.read_pickle(filepath_or_buffer='../data/complaints_df.pkl')
complaints.info()

In [None]:
# Encode each category as an integer id, numbered in order of first appearance.
complaints['category_id'] = pd.factorize(complaints['category'])[0]

In [None]:
# One row per distinct (category, category_id) pair, ordered by id, plus
# lookup dictionaries in both directions.
category_id_df = (
    complaints[['category', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)
category_to_id = dict(zip(category_id_df['category'], category_id_df['category_id']))
id_to_category = dict(zip(category_id_df['category_id'], category_id_df['category']))

## Set up and fit model

In [None]:
# TF-IDF over unigrams and bigrams of the cleaned complaint text.
#   sublinear_tf=True -> term frequency is replaced by 1 + log(tf)
#   min_df=1250       -> terms appearing in fewer than 1250 documents are dropped
#   norm='l2'         -> each document vector is scaled to unit Euclidean length
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=1250,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# Keep the document-term matrix sparse: converting it with .toarray() allocates
# a dense (n_documents x n_terms) float array, which is mostly zeros and can
# exhaust memory. train_test_split and LogisticRegression both accept the
# sparse matrix directly.
features = tfidf.fit_transform(complaints['complaint_clean'])
labels = complaints['category_id']
features.shape

In [None]:
# Hold out 20% of the rows for evaluation. The frame index is split alongside
# the features and labels so each prediction can be traced back to its row.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features,
    labels,
    complaints.index,
    test_size=0.2,
    random_state=369,
)

# Fit the classifier on the training split, then predict the held-out rows.
model = LogisticRegression(random_state=369, max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Fraction of held-out complaints whose predicted category matches the true one.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Per-category precision / recall / F1 on the held-out split.
# The label ids are passed explicitly, with names looked up from the same
# id -> category mapping, so every report row is guaranteed to carry the right
# name even if some category never occurs in y_test / y_pred.
report_ids = sorted(id_to_category)
print(metrics.classification_report(
    y_test,
    y_pred,
    labels=report_ids,
    target_names=[id_to_category[i] for i in report_ids],
))

In [None]:
# Confusion matrix of raw counts (rows = actual category, columns = predicted).
# `labels` pins the row/column order to category_id_df so the tick labels below
# stay aligned even if a category is missing from the held-out split.
conf_mat = confusion_matrix(y_test, y_pred,
                            labels=category_id_df['category_id'].values)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_id_df['category'].values,
            yticklabels=category_id_df['category'].values,
            ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# Row-normalised confusion matrix: each cell is the share of an actual
# category's complaints that received the given prediction.
# `labels` pins the row/column order to category_id_df so the tick labels below
# stay aligned even if a category is missing from the held-out split.
conf_mat = confusion_matrix(y_test, y_pred,
                            labels=category_id_df['category_id'].values,
                            normalize='true')
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(conf_mat, annot=True,
            xticklabels=category_id_df['category'].values,
            yticklabels=category_id_df['category'].values,
            ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
import plotly.express as px

In [None]:
# Shorten the longest category names so they fit on the plot axes.
# regex=False makes these plain literal substitutions regardless of the pandas
# version (the default for `regex` differs between pandas releases).
category_id_df['category'] = (
    category_id_df['category']
    .str.replace('Incorrect information on your report', 'Incorrect information', regex=False)
    .str.replace('Attempts to collect debt not owed', 'Debt not owed', regex=False)
    .str.replace('Struggling to pay mortgage', 'Mortgage struggles', regex=False)
)

In [None]:
# Interactive, row-normalised confusion matrix.
# Note: despite its name, `df` is the NumPy array returned by confusion_matrix.
# `labels` pins the row/column order to category_id_df so the axis names below
# stay aligned even if a category is missing from the held-out split.
df = confusion_matrix(y_test, y_pred,
                      labels=category_id_df['category_id'].values,
                      normalize='true')

fig = px.imshow(
    df,
    x=category_id_df['category'].values,
    y=category_id_df['category'].values,
    labels={
        'x': 'Predicted',
        'y': 'Actual',
        'color': 'Proportion of Actual',
    },
    color_continuous_scale='Peach',
    text_auto=True,
)

# Show cell text and hover values as percentages with two decimals.
# NOTE(review): px.imshow appears to draw its colour bar through the layout's
# shared colour axis, so the trace-level colorbar_len may have no visible
# effect — confirm, and use fig.update_coloraxes(colorbar_len=0.9) if so.
fig.update_traces(texttemplate="%{z:.2%}",
                  zhoverformat=".2%",
                  colorbar_len=0.9,
                  selector=dict(type='heatmap'))

fig.show()

In [None]:
# Optional: uncomment to export the interactive confusion matrix as a standalone HTML file.
#fig.write_html('../images/logreg_confmat.html')