In [12]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
import json
import string
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from gensim.corpora import Dictionary
import random


def get_sample(data, size):
    '''
    Takes a dictionary and a sample size and pulls randomly from the dictionary.
    '''
    key_list = [i for i in data.keys()]
    sample_keys = random.sample(key_list, size)
    #eval_keys = [i for i in key_list if i not in sample_keys]

    return sample_keys#, eval_keys


def preprocessing(data, n):
    '''
    Performs preprocessing tasks on raw data.
    '''
    punctuation = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()
    rv = {}
    bad_ids = []
    for id, narrative in data.items():
        # Preprocessing tasks
        tokens = word_tokenize(narrative.lower()) # lower case & tokenize
        stripped = [w.translate(punctuation) for w in tokens] # strip punctuation
        words = [word for word in stripped if word.isalnum()] # remove non-alphabetic-numeric words
        words = [w for w in words if not w in stop_words] # remove stopwords
        words = [w if w  != len(w) * w[0] else '~' for w in words] # replace 'xxx' with '~' 
        words = [w if not w.isdigit() else '*' for w in words] # replace numbers with '*'
        words = [wnl.lemmatize(w) for w in words] # lemmatize

        # Remove consecutive duplicates of '~' or '*'
        final_words = []
        for i in range(0,len(words)):
          if i==0:
            final_words.append(words[i])
          else:
            if not((words[i] == '*' and final_words[-1]=='*'
                    ) or (
                        words[i] == '~' and final_words[-1]=='~')):
                final_words.append(words[i])

        # Create ngrams
        try:
          ngram_tuples = list(ngrams(final_words, n))
          ngram_words = [' '.join(ngram_tuple) for ngram_tuple in ngram_tuples]

          # Replace narratives in data with preprocessed text
          rv[id] = ngram_words
        except:
          bad_ids.append(id)
    
    return rv


def embedding(data, n):
    '''
    Creates word embeddings.
    '''
    data = preprocessing(data, n)
    docs = list(data.values())
    dictionary = Dictionary(docs) 

    # Filter out words that occur less than 20 documents, or more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5) 

    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return data, docs, dictionary, corpus

In [14]:
g = open('products.json')
complaint_types = json.load(g)
h = open('complaints_full.json')
narratives = json.load(h)
sample_keys = get_sample(narratives,500000)
sample_data = {i: narratives[i] for i in sample_keys}
sample_data[sample_keys[0]]

'My daughter took out XXXX student loads, which I cosigned. She apparently stopped paying the monthly payments sometime in XXXX of 2015. I was unaware of this until I received a phone call in late XXXX from Wells Fargo, the debt holder. I asked for a statement, and paid the bill when it was received. I then received numerous calls over a period of several weeks and in each instance I asked for a written statement. Each of these statements FALSELY state that I had received numerous prior notifications. That is simply not true. I recieved no prior notification whatsoever. I paid all of these bills when notified, and yet Wells Fargo has now notified all XXXX credit agencies that I am in arrears, which has had a devastating impact on my credit rating. I have always had perfect credit. I acted in good faith to fulfill my obligation as a co signer. Wells Fargo has acted in bad faith, has improperly damaged my credit, and has repeatedly failed to timely notify me of the debt owed, and has ref

In [18]:
eval_keys = list(set(narratives.keys())-set(sample_keys))
eval_data = {i: narratives[i] for i in eval_keys}

In [21]:
NGRAMS = 2
full_data, full_docs, full_dictionary, full_corpus = embedding(sample_data, NGRAMS)
full_temp = full_dictionary[0] 
full_id2word = full_dictionary.id2token

In [23]:
from gensim.models import LdaModel

full_model = LdaModel(
    corpus=full_corpus,
    id2word=full_id2word,
    num_topics=15,
    chunksize=1000,
    passes=30,
    iterations=25,
    random_state=1
)

full_model.save('full_lda.model')

In [25]:
# Compute coherence
from gensim.models import CoherenceModel

coherence_model = CoherenceModel(model=full_model, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model.get_coherence()
print('Coherence Score: ', coherence_lda)

# Compute perplexity
print('Perplexity Score: ', full_model.log_perplexity(corpus))

Coherence Score:  0.6363205271677349
Perplexity Score:  -11.932547715701617


In [64]:
key_list = [i for i in narratives.keys()][0:100000]
eval_keys = narratives
testing_eval_ngrams, bad_ids = preprocessing(eval_keys,2)

In [28]:
eval_processed = preprocessing(eval_data,2)

In [29]:
from pprint import pprint
top_topics = full_model.top_topics(corpus)
pprint(top_topics)

[([(0.04681514, 'credit report'),
   (0.010670912, 'identity theft'),
   (0.008924044, 'credit file'),
   (0.008036387, 'social security'),
   (0.007919517, 'inquiry ~'),
   (0.0078012375, 'credit bureau'),
   (0.007671249, 'police report'),
   (0.007648646, 'report ~'),
   (0.007219863, '~ inquiry'),
   (0.0056640827, '~ credit'),
   (0.0055721123, 'fraudulent account'),
   (0.005496895, 'personal information'),
   (0.0054759914, 'date ~'),
   (0.005187177, 'security number'),
   (0.00476239, 'name ~'),
   (0.004714155, 'victim identity'),
   (0.004392227, 'address ~'),
   (0.0040252055, 'information credit'),
   (0.0037250747, 'credit reporting'),
   (0.0036767754, 'account credit')],
  -13.331141400641402),
 ([(0.010056033, 'payment ~'),
   (0.009178184, 'student loan'),
   (0.008960567, '~ payment'),
   (0.00834574, 'late fee'),
   (0.0077766506, '~ month'),
   (0.007463246, 'payment *'),
   (0.0073678605, 'loan ~'),
   (0.0067823953, 'interest rate'),
   (0.0066187675, 'monthly pa

In [37]:
topic_map = {}
for topic_id in range(full_model.num_topics):
     topk = full_model.show_topic(topic_id, 10)
     topk_words = [ w for w, _ in topk ]
     topic_map[topic_id] = ', '.join(topk_words)
     #print('{}: {}'.format(topic_id, ', '.join(topk_words)))
topic_map

{0: 'credit card, bank america, checking account, account ~, bank account, ~ *, card ~, * ~, debit card, ~ account',
 1: '* day, sent letter, credit report, credit bureau, letter ~, certified mail, letter sent, within *, received response, certified letter',
 2: '* ~, section *, credit reporting, * usc, collection practice, debt collection, fair debt, reporting act, practice act, fair credit',
 3: 'collection agency, never received, debt ~, collect debt, debt collection, debt collector, collection company, ~ debt, original creditor, amount *',
 4: 'customer service, ~ called, called ~, ~ told, * day, call back, told would, said would, spoke ~, ~ said',
 5: 'reporting agency, consumer reporting, identity theft, ~ oh, oh ~, theft ~, consumer ~, theft report, file consumer, consumer report',
 6: '~ received, ~ *, dated ~, number ~, ~ sent, complaint ~, ~ also, sent ~, email ~, received ~',
 7: 'well fargo, hard inquiry, credit report, inquiry credit, ~ well, would like, ~ xxxx2019, xxxx20

In [39]:
import pandas as pd
check_df = pd.DataFrame()
df_list = []
track = 0
for k, v in eval_processed.items():
  vec = dictionary.doc2bow(v)
  tmp_df = pd.DataFrame(full_model[vec])
  tmp_df['id'] = k
  tmp_df['complaint_type'] = complaint_types[k]
  check_df = pd.concat([check_df,tmp_df])
  track+=1
  if not track%50000:
    print(track)
    df_list.append(check_df.copy())
    check_df = pd.DataFrame()

50000
100000
150000
200000


In [40]:
#check_df.head()
df_list.append(check_df.copy())
check_final = pd.concat(df_list)
check_final['derived_topic'] = check_final[0].map(topic_map)


In [43]:
check_final.shape

(2289587, 5)

In [44]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
check_final.to_csv('drive/My Drive/advml_proj.csv')