In [1]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the complaints DataFrame from its pickle and print a schema summary.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
complaints_path = '../data/complaints_df.pkl'
complaints = pd.read_pickle(complaints_path)
complaints.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353432 entries, 0 to 353431
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   complaint        353432 non-null  object 
 1   category         353432 non-null  object 
 2   word_count       353432 non-null  int64  
 3   char_count       353432 non-null  int64  
 4   avg_word_len     353432 non-null  float64
 5   complaint_clean  353432 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 16.2+ MB


In [3]:
# Display the very short complaints (fewer than 5 words) for a sanity check.
is_short = complaints['word_count'].lt(5)
complaints[is_short]

Unnamed: 0,complaint,category,word_count,char_count,avg_word_len,complaint_clean
109,Not my debt,Attempts to collect debt not owed,3,11,3.666667,Not my debt
304,Report inaccurate information,Incorrect information on your report,3,29,9.666667,Report inaccurate information
2653,Not my information reporting.,Incorrect information on your report,4,29,7.250000,Not my information reporting.
2707,Not my accounts,Incorrect information on your report,3,15,5.000000,Not my accounts
2781,called company XX/XX/,Attempts to collect debt not owed,3,21,7.000000,called company
...,...,...,...,...,...,...
352600,XX/XX/2018 amount owed XXXX,Attempts to collect debt not owed,4,27,6.750000,2018 amount owed
352978,Account has been paid,Attempts to collect debt not owed,4,21,5.250000,Account has been paid
353064,This is not mine,Attempts to collect debt not owed,4,16,4.000000,This is not mine
353353,NO ANSWER EVER,Attempts to collect debt not owed,3,14,4.666667,NO ANSWER EVER


In [4]:
# Encode each category label as an integer id (ids follow order of first appearance).
category_codes = pd.factorize(complaints['category'])[0]
complaints['category_id'] = category_codes

In [5]:
# TF-IDF features over unigrams and bigrams: English stop words removed,
# sublinear (log-scaled) term frequency, L2-normalised rows, and terms that
# appear in fewer than 500 complaints dropped.
# NOTE(review): this vectorizes the raw `complaint` column although the frame
# also carries `complaint_clean` -- confirm which one is intended.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=500, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

# Keep the result as the SciPy sparse matrix that fit_transform returns.
# Densifying with .toarray() allocates rows x vocabulary float64 values
# (353432 x 9117 here, roughly 26 GB) even though almost all entries are zero.
# sklearn's chi2 accepts sparse input; call .toarray() only on a small slice
# if a dense view is ever needed.
features = tfidf.fit_transform(complaints['complaint'])
labels = complaints['category_id']
features.shape

(353432, 9117)

In [6]:
# One row per distinct category, ordered by its numeric id.
category_id_df = (
    complaints[['category', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)

# Lookup tables in both directions: category name -> id and id -> category name.
category_to_id = dict(zip(category_id_df['category'], category_id_df['category_id']))
id_to_category = dict(zip(category_id_df['category_id'], category_id_df['category']))

In [9]:
import numpy as np
from sklearn.feature_selection import chi2

# Number of top-scoring unigrams and bigrams to print per category.
N = 2

# The vocabulary and each term's n-gram size do not depend on the category,
# so compute them once instead of on every loop iteration.
vocabulary = np.array(tfidf.get_feature_names_out())
ngram_sizes = np.array([len(term.split(' ')) for term in vocabulary])

# For each category (alphabetical order), score every term with a one-vs-rest
# chi-squared test and print the N highest-scoring unigrams and bigrams.
# NOTE(review): chi2 measures strength of association regardless of direction,
# so a term that is conspicuously *absent* from a category can rank as highly
# as one that is typical of it -- interpret "most correlated" with that in mind.
for category, category_id in sorted(category_to_id.items()):
    features_chi2 = chi2(features, labels == category_id)
    # Ascending sort: the strongest terms end up at the tail.
    indices = np.argsort(features_chi2[0])
    feature_names = vocabulary[indices]
    sizes_ranked = ngram_sizes[indices]
    unigrams = feature_names[sizes_ranked == 1]
    bigrams = feature_names[sizes_ranked == 2]
    print("# '{}':".format(category))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# 'Attempts to collect debt not owed':
  . Most correlated unigrams:
. collection
. debt
  . Most correlated bigrams:
. collection agency
. collect debt
# 'Communication tactics':
  . Most correlated unigrams:
. calls
. calling
  . Most correlated bigrams:
. stop calling
. times day
# 'Fraud or scam':
  . Most correlated unigrams:
. paypal
. coinbase
  . Most correlated bigrams:
. sent money
. cash app
# 'Incorrect information on your report':
  . Most correlated unigrams:
. calling
. debt
  . Most correlated bigrams:
. times day
. collect debt
# 'Struggling to pay mortgage':
  . Most correlated unigrams:
. foreclosure
. modification
  . Most correlated bigrams:
. short sale
. loan modification
