## Imports and Utility Functions

In [21]:
import pandas as pd
import numpy as np
from collections import Counter

import nltk
from nltk.corpus import stopwords, brown
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

import lda

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def open_corpus(file):
    with open(file) as f:
        raw_text = f.read()
    return raw_text

def tokenize_sents(raw_text):
    raw_sents = nltk.sent_tokenize(raw_text)
    return [nltk.word_tokenize(word) for word in raw_sents]

def tokenize_words(raw_text):
    return nltk.word_tokenize(raw_text)

def create_data_sets():
    tagged_sents = brown.tagged_sents(categories='news')
    size = int(len(tagged_sents) * 0.9)
    train_sents = tagged_sents[:size]
    test_sents = tagged_sents[size:]
    return train_sents, test_sents

def create_backoff_tagger(): 
    train_sents, test_sents = create_data_sets()
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    return t2

## Load from CSV to Pandas DataFrame

In [3]:
dataset = pd.read_csv("dataset.txt", delimiter="|", encoding="latin-1", index_col="AutoID")
dataset.head()

Unnamed: 0_level_0,Date,Year,Month,MediaType,FullText
AutoID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,8/26/2015,2015,8,twitter,3 ways the internet of things will change Bank...
2,8/5/2015,2015,8,twitter,BankB BankB Name downgrades apple stock to neu...
3,8/12/2015,2015,8,twitter,BankB returns to profit on INTERNET/! board2? ...
4,8/5/2015,2015,8,twitter,BankB tells advisers to exit paulson hedge fun...
5,8/12/2015,2015,8,twitter,BankC may plead guilty over foreign exchange p...


We opted to focus primarily on the Facebook data for a few reasons:
* It covers a longer time period, 12 months versus one month
* The messages were longer and more resembled coherent, english thoughts
* We did not have the ability to find Twitter messages that spanned across several tweets by a single user

##Filter to Facebook messages only


In [4]:
facebook = dataset[dataset['MediaType'] == 'facebook']

## Identify stopwords and remove irrelevant messages


In [5]:
facebook.head()

Unnamed: 0_level_0,Date,Year,Month,MediaType,FullText
AutoID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
56,9/30/2014,2014,9,facebook,laude Name BankB in the line up when the# fly...
65,1/9/2015,2015,1,facebook,- any body banking with BankC bank bank tryna ...
66,8/19/2014,2014,8,facebook,- bitches be thoting off Name Name me Name Nam...
67,10/12/2014,2014,10,facebook,- center for aligned healing special announcem...
69,8/20/2014,2014,8,facebook,- facebook wife wife?   :- *      ¥ ...


In [6]:
facebook.shape

(83647, 5)

After examining the most commonly occurring tokens in the messages with a very basic FreqDist, we identified two lists of Stop Words. The first, filter_stop, consisting of words that had been added to dataset by Wells Fargo as masks. The second, addl_stop, were words and characters we identified as occurring with very high frquency in messages that were of no analytical value.

In [7]:
filter_stop = {'Name', 'INTERNET', 'BankA', 'BankB', 'BankC', 'BankD',
              'BankAs', 'BankBs', 'BankCs', 'BankDs','bankd', 'bankds',
              'internet', 'bankbs', 'bankb', 'bankcs', 'name', 'banka',
              'bankc', 'bankas'}
# additional stopwords derived from examining 100 most common tokens in dataset -- Signifiers of OWS movement, largely
addl_stop = {'â', 'giannis', 'banksters', 'classwarfare', 'financialterrorists', 
             'morganstanley', 'vote', 'banke', 'BankE', 'bankE'}

In [8]:
def exclude_stop_items(row, addl_stop):
    words = tokenize_words(row)
    if [x for x in words if x in addl_stop]:
        return False
    return True

In [9]:
no_stop = facebook['FullText'].apply(lambda x: exclude_stop_items(x, addl_stop))

Excluding these items reduced the size of the dataset by about 17%

In [10]:
facebook[no_stop].shape

(70209, 5)

#Topic Identification

We next used a few methods to explore the remaining data for key concepts to focus on in the second half of the analysis. Below we look at most commonly occurring tokens, as well as frequently occurring bigrams and trigrams, then apply some filtering based on part-of-speech patterns we identify as fruitful. For this section, we take about half of the total messages out of the Pandas DataFrame and into a list of lists.

In [11]:
raw_text = ' '.join(facebook[no_stop]['FullText'].tolist())[:7000000] # Half the dataset, roughly, for speed

In [12]:
words = tokenize_words(raw_text)

In [13]:
no_stop_list = [x for x in words if x not in stopwords.words('english') and x.isalpha()
              and x not in filter_stop]

In [14]:
fd = nltk.FreqDist(no_stop_list)
fd.most_common(10)

[('bank', 10672),
 ('account', 5290),
 ('get', 3332),
 ('money', 3177),
 ('new', 2822),
 ('like', 2423),
 ('one', 2319),
 ('got', 2288),
 ('card', 2242),
 ('financial', 2202)]

##Create bigrams from messages


While the most common tokens are interesting, they are not giving us anything specific to zero in on, so we looked at the top bigrams in terms of PMI and raw frequency. Please note, the next section runs fairly slowly.

In [15]:
finder = BigramCollocationFinder.from_words(no_stop_list)

## Locate most common bigrams by PMI and raw frequency

In [16]:
bigram_measures = BigramAssocMeasures()
top_pmi = finder.nbest(bigram_measures.pmi, 100)

In [17]:
top_raw = finder.nbest(bigram_measures.raw_freq, 100)

In [18]:
top_pmi[:10]

[('BankAales', 'sickens'),
 ('BankAcimaginedragons', 'imaginedragons'),
 ('BankAon', 'gianormous'),
 ('BankCcreditcardmemberprivilege', 'koskinlaser'),
 ('BankCfoundation', 'financialstability'),
 ('BankCthankyoucards', 'ajhudson'),
 ('BankDdesignation', 'develeporexperience'),
 ('BankDhashumanbeingemployees', 'byebyecreditcards'),
 ('BankDhomefinance', 'thanksmonique'),
 ('BankDisntinboston', 'BankDappredesignfail')]

In [19]:
top_raw[:10]

[('financial', 'advisers'),
 ('wealth', 'managers'),
 ('advisers', 'wealth'),
 ('asset', 'financial'),
 ('bank', 'bank'),
 ('chicago', 'marathon'),
 ('customer', 'service'),
 ('bank', 'account'),
 ('debit', 'card'),
 ('shared', 'photo')]

PMI is finding too specific of topics, but the top raw bigrams seems to be working very well. We hand identified the best topics from here, then looked back to see if they shared a common tag pattern that could be used to generalize into a list of topics.

In [22]:
tagger = create_backoff_tagger()

In [23]:
tagged_bgs = []
for bg in top_raw:
    tagged_bgs.append(tagger.tag(bg))
    
tagged_bgs[:15]

[[('financial', 'JJ'), ('advisers', 'NNS')],
 [('wealth', 'NN'), ('managers', 'NNS')],
 [('advisers', 'NNS'), ('wealth', 'NN')],
 [('asset', 'NN'), ('financial', 'JJ')],
 [('bank', 'NN'), ('bank', 'NN')],
 [('chicago', 'NN'), ('marathon', 'NN')],
 [('customer', 'NN'), ('service', 'NN')],
 [('bank', 'NN'), ('account', 'NN')],
 [('debit', 'NN'), ('card', 'NN')],
 [('shared', 'VBD'), ('photo', 'NN')],
 [('breaking', 'VBG'), ('news', 'NN')],
 [('rating', 'NN'), ('reiterated', 'VBN')],
 [('would', 'MD'), ('like', 'VB')],
 [('center', 'NN'), ('center', 'NN')],
 [('credit', 'NN'), ('card', 'NN')]]

##Identify common part-of-speech patterns in useful topics


In [24]:
good_patterns = {('JJ', 'NN'),
                 ('JJ', 'NNS'),
                 ('JJT', 'NN'),
                 ('NN', 'NN'),
                 ('NN', 'NNS'),
                 ('NNS', 'VB')}

##Find bigram collocations based on frequency and POS pattern


In [25]:
good_tagged_bgs = []
for bg in top_raw:
    tagged = tagger.tag(bg)
    pattern = (tagged[0][1], tagged[1][1])
    if pattern in good_patterns and tagged[0] != tagged[1]:
        good_tagged_bgs.append(tagger.tag(bg))
    
good_tagged_bgs

[[('financial', 'JJ'), ('advisers', 'NNS')],
 [('wealth', 'NN'), ('managers', 'NNS')],
 [('chicago', 'NN'), ('marathon', 'NN')],
 [('customer', 'NN'), ('service', 'NN')],
 [('bank', 'NN'), ('account', 'NN')],
 [('debit', 'NN'), ('card', 'NN')],
 [('credit', 'NN'), ('card', 'NN')],
 [('neutral', 'JJ'), ('rating', 'NN')],
 [('checking', 'NN'), ('account', 'NN')],
 [('data', 'NN'), ('breach', 'NN')],
 [('goldman', 'NN'), ('sachs', 'NN')],
 [('overweight', 'NN'), ('rating', 'NN')],
 [('gon', 'NN'), ('na', 'NN')],
 [('good', 'JJ'), ('morning', 'NN')],
 [('new', 'JJ'), ('photos', 'NN')],
 [('wan', 'NN'), ('na', 'NN')],
 [('marathon', 'NN'), ('chicago', 'NN')],
 [('close', 'JJ'), ('account', 'NN')],
 [('bank', 'NN'), ('robbery', 'NN')],
 [('small', 'JJ'), ('business', 'NN')],
 [('new', 'JJ'), ('bank', 'NN')],
 [('account', 'NN'), ('bank', 'NN')],
 [('financial', 'JJ'), ('crisis', 'NN')],
 [('worst', 'JJT'), ('bank', 'NN')],
 [('money', 'NN'), ('account', 'NN')],
 [('tickets', 'NNS'), ('see', 

In [26]:
len(good_tagged_bgs)

48

Applying the tagging reduced our total list to 48, a manageable number to see what additional topics may be relevant

#Cross Tabulation

## Identify which topic is being discussed in each message and add to DataFrame

Further analysis on groups of messages containing these bigrams proves challenging because the main takeaway is that all of them contain the same bigrams. We added columns to our dataframes to
1. Make it easier to access the messages from a given topic
2. Filter out the actual topic being referred to, in order to make it easier to analyze the substance of the rest of the message

In [27]:
def add_column(row, bigram):
    bg = [x[0] for x in bigram] # Remove POS tags
    if ' '.join(bg) in row:
        words = tokenize_words(row.replace(' '.join(bg), ''))
        tagged_words = tagger.tag(words)
        return ' '.join([x[0] for x in tagged_words if x[0] not in filter_stop])
                        
    elif ''.join(bg) in row:
        words = tokenize_words(row.replace(''.join(bg), ''))
        tagged_words = tagger.tag(words)
        return ' '.join([x[0] for x in tagged_words if x[0] not in filter_stop])
    else:
        return False
                

In [28]:
topics = []
for bigram in good_tagged_bgs:
    bg = '_'.join([x[0] for x in bigram]) # Column name is bigram separated by _
    facebook[bg] = facebook['FullText'].apply(add_column, args=(bigram,))
    topics.append(bg)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


##Identify which bank is being discussed in each message and add to DataFrame

In [29]:
good_topics = []
for item in good_tagged_bgs:
    good_topics.append(item[0][0] + "_" + item[1][0])

topics_of_interest = good_topics
fb_subset = facebook.ix[:,["FullText"] + topics_of_interest]
for topic in topics_of_interest:
    fb_subset["num_" + topic] = fb_subset[topic].apply(lambda x: 1 if x else 0)

    
def find_bank(in_str):
    banks = {
            "bank_a": ["twit_hndl_BankA", "BankA"],
            "bank_b": ["twit_hndl_BankB", "BankB"],
            "bank_c": ["twit_hndl_BankC", "BankC"],
            "bank_d": ["twit_hndl_BankD", "BankD"],
    }
    
    # assumes only one bank per comment
    for bank, idents in banks.items():
        for ident in idents:
            if ident.lower() in in_str.lower():
                return bank
    
    return None

fb_subset["bank"] = fb_subset["FullText"].apply(lambda x: find_bank(x))


##Cross-tabulate messages by topic and bank and graphical analysis in Tableau


In [30]:
fb_subset.to_csv("viz_data_1.csv")

#Substance Identification

In [31]:
# Since this code is fairly slow, we are only running it here 
# against the hand-drawn list of topics used in our write-up
revised_topics = ['financial_advisers',
                  'wealth_managers',
                  'customer_service',
                  'bank_account',
                  'debit_card',
                  'credit_card',
                  'checking_account',
                  'data_breach',
                  'bank_robbery',
                  'small_business',
                  'real_estate',
                  'close_account',
                  ]

In [32]:
# Analysis inspired by http://aylien.com/web-summit-2015-tweets-part1

substances = {} # Dictionary to store the top unique words from each cluster for each topic

for topic in revised_topics: # To run against the full list, iterate over topics instead of revised_topics
    print(topic)
    substances[topic] = {}
    
    data = facebook[facebook[topic] != False] # Get the rows that correspond with this topic
    
    # Convert messages within topic to matrix of token counts
    cvectorizer = CountVectorizer(min_df=4, max_features=10000, stop_words='english')
    cvz = cvectorizer.fit_transform(data[topic])

    # Fit LDA clustering model
    n_topics = 10
    n_iter = 3000
    lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
    X_topics = lda_model.fit_transform(cvz)
    
    
    # Iterate over the clusters created within the topic and find the distinguishing words 
    n_top_words = 15
    topic_words = lda_model.topic_word_  
    vocab = cvectorizer.get_feature_names()


    # Add the distinguishing words to a list
    topic_summaries = []
    for i, topic_dist in enumerate(topic_words):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
        topic_summaries.append(' '.join(topic_words))
    
    
    # Flatten all topic descriptors to bag of words
    all_words = ' '.join(x for x in topic_summaries) 
    counts = Counter(all_words.split(' ')) # Create a counter of words in the bag
    
    # Add unique summary words to dictionary to better identify each topic by the distinguishing
    # words it does not share with other topic descriptions
    for i, summary in enumerate(topic_summaries):
        substances[topic][i] = [x for x in summary.split(' ') if counts[x] == 1] 




financial_advisers
wealth_managers
customer_service




bank_account




debit_card




credit_card




checking_account




data_breach




bank_robbery




small_business
real_estate




close_account






Please note: The dictionary of topics and substances can be viewed in the variable substances or by exporting to csv below

## Output substances to csv for exploration

In [33]:
sf = pd.DataFrame.from_dict(substances).transpose().stack()
sf.to_csv('output.csv')
