In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from tabulate import tabulate
import pickle

## Helper Functions

In [2]:
from nltk.corpus import wordnet as wn, brown
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()

def filter_terms(in_str, sub_list=None):
    if sub_list is None:
        return in_str

    for pattern in sub_list:
        in_str = re.sub("\\b" + pattern + "\\b", '', in_str)

    return in_str

def normalize(word):
    """Normalizes words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    word = stemmer.stem_word(word)
    word = lemmatizer.lemmatize(word)
    return word

def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 <= len(word) <= 40
        and word.lower() not in stopwords)
    return accepted

def leaves(tree, phrase):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label()==phrase):
        yield subtree.leaves()
        
def get_terms(tree, phrase):
    for leaf in leaves(tree, phrase):
        term = [ w.lower() for w,t in leaf if w ]
        yield term

# POS Tagger

The default pos_tag method from nltk is too slow.

Training a new backoff tagger with the brown brown corpus.

In [3]:
from nltk.corpus import brown

default_tagger = nltk.DefaultTagger('NN')

brown_tagged_sents = brown.tagged_sents()

def build_backoff_tagger(train_sents):
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    return t3

# Pickle the backoff tagger so that we don't have to retrain
try:
    with open('ngram_tagger.pickle', 'rb') as f:
        ngram_tagger = pickle.load(f)
    ngram_tagger
except NameError:
    ngram_tagger = build_backoff_tagger(brown_tagged_sents)
    with open('ngram_tagger.pickle', 'wb') as f:
        pickle.dump(ngram_tagger, f)

## Data Prep

In [4]:
dataset = pd.read_csv("dataset.txt", delimiter="|", encoding="ISO-8859-1", index_col="AutoID")
# Remove non text chars
dataset["FullText"] = dataset["FullText"].apply(lambda x: (re.sub('[^a-zA-Z ]+', '', x).strip()))

# We only care about banks a-d
# every other bank is irrelevant
relevant_banks = ["twit_hndl_BankA", "twit_hndl_BankB", 
                  "twit_hndl_BankC", "twit_hndl_BankD", 
                  "BankA", "BankB", "BankC", "BankD"]

# Filter the dataset for only the banks that we care about
print("before:", len(dataset))
dataset = dataset[dataset["FullText"].apply(lambda x: any(substring in x for substring in relevant_banks))]
print("after:", len(dataset))
print(dataset.head())

# Filter just Facebook data
fb_mask = (dataset["MediaType"] == "facebook")
fb_data = dataset[fb_mask].reset_index()
fb_data.set_index("AutoID", inplace=True)

before: 220377
after: 192180
             Date  Year  Month MediaType  \
AutoID                                     
1       8/26/2015  2015      8   twitter   
2        8/5/2015  2015      8   twitter   
3       8/12/2015  2015      8   twitter   
4        8/5/2015  2015      8   twitter   
5       8/12/2015  2015      8   twitter   

                                                 FullText  
AutoID                                                     
1       ways the internet of things will change BankB ...  
2       BankB BankB Name downgrades apple stock to neu...  
3             BankB returns to profit on INTERNET board t  
4       BankB tells advisers to exit paulson hedge fun...  
5       BankC may plead guilty over foreign exchange p...  


## Use Topics from Bigram Collocations to Tag Messages 

In [5]:
# Assign Messages to Banks
fb_data["BankA"] = fb_data["FullText"].str.strip().str.lower().str.contains("banka|bank a", regex=True)
fb_data["BankB"] = fb_data["FullText"].str.strip().str.lower().str.contains("bankb|bank b", regex=True)
fb_data["BankC"] = fb_data["FullText"].str.strip().str.lower().str.contains("bankc|bank c", regex=True)
fb_data["BankD"] = fb_data["FullText"].str.strip().str.lower().str.contains("bankd|bank d", regex=True)

# Assign Topics to Banks
# Customer Service
fb_data["Customer Service"] = fb_data["FullText"].str.strip().str.lower().str.contains("customer service", regex=True)
fb_data["Close Account"] = fb_data["FullText"].str.strip().str.lower().str.contains("close account", regex=True)
fb_data["New Bank"] = fb_data["FullText"].str.strip().str.lower().str.contains("new bank", regex=True)

# Adivising
fb_data["Financial Advisers"] = fb_data["FullText"].str.strip().str.lower().str.contains("financial advisers", regex=True)
fb_data["Wealth Manager"] = fb_data["FullText"].str.strip().str.lower().str.contains("wealth managers", regex=True)
fb_data["Bank Teller"] = fb_data["FullText"].str.strip().str.lower().str.contains("bank teller", regex=True)

# Product
fb_data["Bank Account"] = fb_data["FullText"].str.strip().str.lower().str.contains("bank account", regex=True)
fb_data["Checking Account"] = fb_data["FullText"].str.strip().str.lower().str.contains("checking account", regex=True)
fb_data["Credit Card"] = fb_data["FullText"].str.strip().str.lower().str.contains("credit card", regex=True)
fb_data["Debit Card"] = fb_data["FullText"].str.strip().str.lower().str.contains("debit card", regex=True)
fb_data["Account Bank"] = fb_data["FullText"].str.strip().str.lower().str.contains("account bank", regex=True)
fb_data["Money Account"] = fb_data["FullText"].str.strip().str.lower().str.contains("money account", regex=True)
fb_data["Bank Card"] = fb_data["FullText"].str.strip().str.lower().str.contains("bank card", regex=True)
fb_data["Savings Account"] = fb_data["FullText"].str.strip().str.lower().str.contains("savings account", regex=True)

In [6]:
fb_data = fb_data[fb_data[["Customer Service", "Close Account", "New Bank", "Financial Advisers", 
                           "Wealth Manager", "Bank Teller", "Bank Account", "Checking Account", 
                           "Credit Card", "Debit Card", "Account Bank", "Money Account", "Bank Card", 
                           "Savings Account"]].any(1)]

## Define Chunking Functions

In [8]:
def get_noun_phrases(some_text):
    # Define Chunker Rules
    # Attempt rules individually to evaluate
    # Taken from Su Nam Kim Paper...
    #     {(<NN.*|N.*>+<VBZ><DT|RB>?<JJ.*><NN.*|N.*>+?)}
    #     {<VB.*>+<RB><JJ.*>+}
    #     {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
    #     {<PRP>?<NN.*>+<VB.*><DT|RB>?<JJ.*>+<CC>?<JJ>+?}
    #        {<DT.*><WP.*><VBP>*<RB>*<VBN><IN><NN>}
    #         {<NN|NNS|NNP|NNPS><IN>*<NN|NNS|NNP|NNPS>+}
    #         {<JJ>*<NN|NNS|NNP|NNPS><CC>*<NN|NNS|NNP|NNPS>+}
    #         {<JJ>*<NN|NNS|NNP|NNPS>+}
    #         {<CS.*|RB.*><PP$><BEDZ><RP>?<VB.*><NN.*>+}
    noun_chunker = nltk.RegexpParser('''
        NP:
        # hoping to find things like:
        # "customer service is the worst"
        # "atms are horrible"
        {(<NN.*|N.*>+<BE.*><DT>?<JJ.*>)}
    ''')
    noun_phrases = []
    tree = noun_chunker.parse(some_text)

    noun_phrase = [w for w in get_terms(tree, 'NP')]
    if len(noun_phrase) > 0:
        for i, phrase in enumerate(noun_phrase):
            noun_phrases.append(" ".join(phrase))
    return noun_phrases
    
def get_adj_phrases(some_text):
    # Define Chunker Rules
    noun_chunker = nltk.RegexpParser('''
        AJ:
            {<JJ.*><NN.*>+}
            {<JJ.*>}
    ''')
    adj_phrases = []
    tree = noun_chunker.parse(some_text)

    adj_phrase = [w for w in get_terms(tree, 'AJ')]
    if len(adj_phrase) > 0:
        for i, phrase in enumerate(adj_phrase):
            adj_phrases.append(" ".join(phrase))
    return adj_phrases

## Chunk Messages and Attach to Dataframe

In [52]:
fb_data["tagged"] = fb_data["FullText"].apply(lambda x: ngram_tagger.tag(nltk.tokenize.word_tokenize(x)))
fb_data["noun_phrases"] = fb_data["tagged"].apply(lambda x: ",".join(get_noun_phrases(x)))
# fb_data["noun_phrases"][fb_data["noun_phrases"].apply(len) > 0]

In [53]:
fb_data["tagged"] = fb_data["FullText"].apply(lambda x: ngram_tagger.tag(nltk.tokenize.word_tokenize(x)))
fb_data["adj_phrases"] = fb_data["tagged"].apply(lambda x: ",".join(get_adj_phrases(x)))
# fb_data["adj_phrases"][fb_data["adj_phrases"].apply(len) > 0]

In [None]:
# saving to use for viz
fb_data.to_csv("data/fb_data.csv")

In [36]:
# for exploration tool
fb_data.reset_index().to_json("/Users/vijayv/code/sound_cloud_analytics/project/static/data/fb_data.json", orient="index")

# Warning: 

## Code below here is very exploratory and is written very verbosely

## High Level Topics

Previously used to pickle tagged data to be reused later by the keyphrase extractor notebook.

In [70]:
# Add topics here
groups = {
    "customer_service": np.array(["customer service", "close account", "new bank"]),
    "advising": np.array(["financial advisers", "wealth managers", "bank teller"]),
    "product": np.array(["bank account", "checking account", "credit card", "debit card", "account bank", "money account", "bank card", "savings account"]),
}

for topic_name, topic in groups.items():
    pattern_string = "|".join(topic)
    pattern = re.compile(pattern_string)
    filters = fb_data["FullText"].str.contains(pattern, regex=True)
    df = fb_data[filters]
    # Remove any data with an internet address
    df = df[~df["FullText"].str.contains("INTERNET")]
    df.to_csv("data/" + topic_name + ".csv", index=False)
    # POS tag tokens from comments
    df_tagged = np.array([ngram_tagger.tag(nltk.tokenize.word_tokenize(comment)) for comment in df["FullText"]])
    with open("data/" + topic_name + '.pickle', 'wb') as f:
        pickle.dump(df_tagged, f)

## Group: Customer Service

In [9]:
df_customer_service = pd.read_csv("data/customer_service.csv", index_col="AutoID")
df_customer_service.describe()

Unnamed: 0,Year,Month
count,1243.0,1243.0
mean,2014.50362,7.122285
std,0.500188,3.459847
min,2014.0,1.0
25%,2014.0,4.0
50%,2015.0,8.0
75%,2015.0,10.0
max,2015.0,12.0


In [10]:
df_cs = df_customer_service[df_customer_service["FullText"].str.contains("customer service")]
df_close_account = df_customer_service[df_customer_service["FullText"].str.contains("close account")]
df_new_bank = df_customer_service[df_customer_service["FullText"].str.contains("new bank")]
print(df_close_account.describe())

              Year     Month
count    10.000000  10.00000
mean   2014.400000   8.10000
std       0.516398   2.84605
min    2014.000000   3.00000
25%    2014.000000   6.00000
50%    2014.000000   8.50000
75%    2015.000000  10.50000
max    2015.000000  12.00000


In [11]:
df_cs_tagged = np.array([ngram_tagger.tag(nltk.tokenize.word_tokenize(comment)) for comment in df_cs["FullText"]])
df_close_account_tagged = np.array([ngram_tagger.tag(nltk.tokenize.word_tokenize(comment)) for comment in df_close_account["FullText"]])
df_new_bank_account_tagged = np.array([ngram_tagger.tag(nltk.tokenize.word_tokenize(comment)) for comment in df_new_bank["FullText"]])
picked_data = df_new_bank_account_tagged

In [12]:
try:
    # data/bank_services.pickle or data/customer_service.pickle
    with open('data/customer_service.pickle', 'rb') as f:
        picked_data = pickle.load(f)
except NameError as e:
    print(e)

In [13]:
# Define Chunker Rules
# Taken from Su Nam Kim Paper...
# Modified for brown tagset
#     {(<NN.*|N.*>+<VBZ><DT|RB>?<JJ.*><NN.*|N.*>+?)}
#     {<VB.*>+<RB><JJ.*>+}
#     {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
#     Work with Penn Treebank
#     {<PRP>?<NN.*>+<VB.*><DT|RB>?<JJ.*>+<CC>?<JJ>+?}
#        {<DT.*><WP.*><VBP>*<RB>*<VBN><IN><NN>}
#         {<NN|NNS|NNP|NNPS><IN>*<NN|NNS|NNP|NNPS>+}
#         {<JJ>*<NN|NNS|NNP|NNPS><CC>*<NN|NNS|NNP|NNPS>+}
#         {<JJ>*<NN|NNS|NNP|NNPS>+}
#         {<CS.*|RB.*><PP$><BEDZ><RP>?<VB.*><NN.*>+}
noun_chunker = nltk.RegexpParser('''
    NP:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        {(<NN.*|N.*>+<VBZ><DT|RB>?<JJ.*><NN.*|N.*>+?)}
        {<VB.*>+<RB><JJ.*>+}
    AJ:
        {<JJ.*>}
''')

# Key Phrases from Facebook
noun_phrases = []
adj_phrases = []
for i, comment in enumerate(picked_data):
    tree = noun_chunker.parse(comment)
    
    noun_phrase = [w for w in get_terms(tree, 'NP')]
    if len(noun_phrase) > 0:
        for i, phrase in enumerate(noun_phrase):
            noun_phrases.append(phrase)
    
    adj_phrase = [w for w in get_terms(tree, 'AJ')]
    if len(adj_phrase) > 0:
        for i, phrase in enumerate(adj_phrase):
            adj_phrases.append(phrase)

In [14]:
for noun_phrase in noun_phrases:
    if noun_phrase:
        print(noun_phrase)
        print()

['informationbankb', 'customer', 'service', 'rep', 'mr']

['whats']

['name']

['creditor']

['youre']

['stop', 'payment']

['ron', 'don', 'don', 'child', 'supportshe', 'stops']

['strbankeht']

['eyes']

['eyesmr']

['cant']

['hold']

['child', 'support']

['don', 'i', 'didnt']

['sosmilingof', 'course', 'i']

['i']

['transaction', 'happy', 'hump', 'day']

['twithndl', 'provocateur']

['worlds']

['customer', 'service']

['bankd']

['exchange', 'theyd']

['return']

['replacement']

['day', 'courier']

['twithndlbankb']

['service', 'wasnt']

['robotic']

['purpose']

['humans']

['family']

['friends']

['account']

['experience']

['view']

['twithndlbankb']

['page']

['complaints']

['complaints']

['customer', 'service']

['i']

['i']

['st', 'hand', 'bankb']

['shits']

['customers']

['customers', 'theyve']

['yrs']

['twithndlbankb']

['i']

['fraudulent', 'card']

['paycheck']

['pay', 'card']

['fraud', 'i']

['account']

['bankb']

['customer', 'service', 'representative

In [15]:
# Look for words that indicate useful replies such as because or due to
df_cs_because = df_cs[df_cs["FullText"].str.contains("because|due to", regex=True)]
for text in df_cs_because["FullText"].iloc[:100]:
    print(text.strip())
    print(nltk.pos_tag(text.strip().split()))
    print()

years ago i went with BankB because of the convenience that it afforded me the multiple locations not only here loBankC anke but banke in addition to just being able to have good customer service all the way around this company has changed so dramatiBankC anke in the last  years that it just makes a person sick by the way i have to past several different banking companies in order to go to a BankB and all of them have drive throughs that arm and daily they also have more convenient hours as those drive thru is operating while i sat in the BankB parking lot for  minutes waiting for the branch to open hopefully if enough people leave your company based on these decisions BankB will return to the way it used to be
[('years', 'NNS'), ('ago', 'IN'), ('i', 'PRP'), ('went', 'VBD'), ('with', 'IN'), ('BankB', 'NNP'), ('because', 'IN'), ('of', 'IN'), ('the', 'DT'), ('convenience', 'NN'), ('that', 'IN'), ('it', 'PRP'), ('afforded', 'VBD'), ('me', 'PRP'), ('the', 'DT'), ('multiple', 'NN'), ('locat

In [16]:
# customer service
fd1 = nltk.FreqDist([adj[0] if len(adj) > 0 else '' for adj in adj_phrases])
fd1.most_common()

[('worst', 165),
 ('worse', 36),
 ('happy', 33),
 ('able', 33),
 ('best', 33),
 ('due', 28),
 ('open', 24),
 ('better', 23),
 ('sure', 22),
 ('wont', 22),
 ('rude', 22),
 ('good', 21),
 ('new', 20),
 ('horrible', 18),
 ('terrible', 18),
 ('bad', 17),
 ('close', 15),
 ('ridiculous', 14),
 ('nice', 14),
 ('such', 13),
 ('sorry', 13),
 ('personal', 13),
 ('sad', 12),
 ('helpful', 12),
 ('wrong', 12),
 ('long', 11),
 ('ill', 11),
 ('amazing', 10),
 ('ready', 10),
 ('full', 9),
 ('willing', 9),
 ('longer', 7),
 ('different', 7),
 ('possible', 7),
 ('poor', 7),
 ('awesome', 7),
 ('great', 7),
 ('sick', 7),
 ('available', 7),
 ('quick', 6),
 ('negative', 6),
 ('glad', 6),
 ('absolute', 6),
 ('professional', 6),
 ('clear', 6),
 ('real', 6),
 ('thankful', 5),
 ('stupid', 5),
 ('higher', 5),
 ('grateful', 5),
 ('big', 5),
 ('unhappy', 5),
 ('free', 5),
 ('true', 4),
 ('easy', 4),
 ('everyday', 4),
 ('worthless', 4),
 ('own', 4),
 ('correct', 4),
 ('welcome', 4),
 ('human', 4),
 ('angry', 4),
 ('

In [17]:
# close account
fd1 = nltk.FreqDist([adj[0] if len(adj) > 0 else '' for adj in adj_phrases])
fd1.most_common()

[('worst', 165),
 ('worse', 36),
 ('happy', 33),
 ('able', 33),
 ('best', 33),
 ('due', 28),
 ('open', 24),
 ('better', 23),
 ('sure', 22),
 ('wont', 22),
 ('rude', 22),
 ('good', 21),
 ('new', 20),
 ('horrible', 18),
 ('terrible', 18),
 ('bad', 17),
 ('close', 15),
 ('ridiculous', 14),
 ('nice', 14),
 ('such', 13),
 ('sorry', 13),
 ('personal', 13),
 ('sad', 12),
 ('helpful', 12),
 ('wrong', 12),
 ('long', 11),
 ('ill', 11),
 ('amazing', 10),
 ('ready', 10),
 ('full', 9),
 ('willing', 9),
 ('longer', 7),
 ('different', 7),
 ('possible', 7),
 ('poor', 7),
 ('awesome', 7),
 ('great', 7),
 ('sick', 7),
 ('available', 7),
 ('quick', 6),
 ('negative', 6),
 ('glad', 6),
 ('absolute', 6),
 ('professional', 6),
 ('clear', 6),
 ('real', 6),
 ('thankful', 5),
 ('stupid', 5),
 ('higher', 5),
 ('grateful', 5),
 ('big', 5),
 ('unhappy', 5),
 ('free', 5),
 ('true', 4),
 ('easy', 4),
 ('everyday', 4),
 ('worthless', 4),
 ('own', 4),
 ('correct', 4),
 ('welcome', 4),
 ('human', 4),
 ('angry', 4),
 ('

In [18]:
# new bank
fd3 = nltk.FreqDist([adj[0] if len(adj) > 0 else '' for adj in adj_phrases])
fd3.most_common()

[('worst', 165),
 ('worse', 36),
 ('happy', 33),
 ('able', 33),
 ('best', 33),
 ('due', 28),
 ('open', 24),
 ('better', 23),
 ('sure', 22),
 ('wont', 22),
 ('rude', 22),
 ('good', 21),
 ('new', 20),
 ('horrible', 18),
 ('terrible', 18),
 ('bad', 17),
 ('close', 15),
 ('ridiculous', 14),
 ('nice', 14),
 ('such', 13),
 ('sorry', 13),
 ('personal', 13),
 ('sad', 12),
 ('helpful', 12),
 ('wrong', 12),
 ('long', 11),
 ('ill', 11),
 ('amazing', 10),
 ('ready', 10),
 ('full', 9),
 ('willing', 9),
 ('longer', 7),
 ('different', 7),
 ('possible', 7),
 ('poor', 7),
 ('awesome', 7),
 ('great', 7),
 ('sick', 7),
 ('available', 7),
 ('quick', 6),
 ('negative', 6),
 ('glad', 6),
 ('absolute', 6),
 ('professional', 6),
 ('clear', 6),
 ('real', 6),
 ('thankful', 5),
 ('stupid', 5),
 ('higher', 5),
 ('grateful', 5),
 ('big', 5),
 ('unhappy', 5),
 ('free', 5),
 ('true', 4),
 ('easy', 4),
 ('everyday', 4),
 ('worthless', 4),
 ('own', 4),
 ('correct', 4),
 ('welcome', 4),
 ('human', 4),
 ('angry', 4),
 ('

## Bank Advisory Services

In [19]:
df_bank_advising = pd.read_csv("data/advising.csv", index_col="AutoID")
print(df_bank_advising.describe())
df_bank_advising.head()

              Year      Month
count    80.000000  80.000000
mean   2014.287500   7.775000
std       0.455452   3.939334
min    2014.000000   1.000000
25%    2014.000000   3.000000
50%    2014.000000   9.000000
75%    2015.000000  11.000000
max    2015.000000  12.000000


Unnamed: 0_level_0,Date,Year,Month,MediaType,FullText
AutoID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
61278,8/28/2014,2014,8,facebook,customers in line and bank teller i hate Bank...
62051,2/10/2015,2015,2,facebook,Name you Nameor Namea bank teller Name reminds...
68422,8/5/2014,2014,8,facebook,anyone interested in a hr bank teller positio...
68850,12/26/2014,2014,12,facebook,apparently eating a sweet potato while driving...
70330,11/22/2014,2014,11,facebook,at BankB line is almost out the door been stan...


In [20]:
for text in df_bank_advising["FullText"].iloc[:100]:
    print(text.strip())
    print(nltk.pos_tag(text.strip().split()))
    print()

customers in line and  bank teller i hate BankB in wilmington
[('customers', 'NNS'), ('in', 'IN'), ('line', 'NN'), ('and', 'CC'), ('bank', 'NN'), ('teller', 'NN'), ('i', 'PRP'), ('hate', 'VBP'), ('BankB', 'NNP'), ('in', 'IN'), ('wilmington', 'NN')]

Name you Nameor Namea bank teller Name reminds me of BankD and Name bank days community  Name you Nameor Namea bank teller community  Name you Nameor Namea bank teller
[('Name', 'NNP'), ('you', 'PRP'), ('Nameor', 'NNP'), ('Namea', 'NNP'), ('bank', 'NN'), ('teller', 'NN'), ('Name', 'NNP'), ('reminds', 'NNS'), ('me', 'PRP'), ('of', 'IN'), ('BankD', 'NNP'), ('and', 'CC'), ('Name', 'NNP'), ('bank', 'NN'), ('days', 'NNS'), ('community', 'NN'), ('Name', 'NNP'), ('you', 'PRP'), ('Nameor', 'NNP'), ('Namea', 'NNP'), ('bank', 'NN'), ('teller', 'NN'), ('community', 'NN'), ('Name', 'NNP'), ('you', 'PRP'), ('Nameor', 'NNP'), ('Namea', 'NNP'), ('bank', 'NN'), ('teller', 'NN')]

anyone interested in a  hr bank teller position for BankD bank message me asa

In [21]:
try:
    with open('data/advising.pickle', 'rb') as f:
        picked_data = pickle.load(f)
except NameError as e:
    print(e)

In [22]:
# Define Chunker Rules
# Taken from Su Nam Kim Paper...
#     {(<NN.*|N.*>+<VBZ><DT|RB>?<JJ.*><NN.*|N.*>+?)}
#     {<VB.*>+<RB><JJ.*>+}
#     {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
noun_chunker = nltk.RegexpParser('''
    NP:
        {(<NN.*|N.*>+<VBZ><DT|RB>?<JJ.*><NN.*|N.*>+?)}
        {<VB.*>+<RB><JJ.*>+}
    AJ:
        {<NN.*|N.*>+?<JJ.*>+<NN.*|N.*>+?}
''')

# Key Phrases from Facebook
noun_phrases = []
adj_phrases = []
for i, comment in enumerate(picked_data):
    tree = noun_chunker.parse(comment)
    
    noun_phrase = [w for w in get_terms(tree, 'NP')]
    if len(noun_phrase) > 0:
        for i, phrase in enumerate(noun_phrase):
            noun_phrases.append(phrase)
    
    adj_phrase = [w for w in get_terms(tree, 'AJ')]
    if len(adj_phrase) > 0:
        for i, phrase in enumerate(adj_phrase):
            adj_phrases.append(phrase)

In [23]:
noun_phrases[:100]

[]

In [24]:
adj_phrases[:100]

[['rebanke', 'everyday', 'times'],
 ['faces', 'tough', 'new', 'broadband'],
 ['faces', 'tough', 'new', 'broadband'],
 ['tests',
  'bankd',
  'asset',
  'management',
  'stockport',
  'financial',
  'advisers'],
 ['ambassador',
  'bankd',
  'asset',
  'management',
  'stockport',
  'financial',
  'advisers'],
 ['f', 'bankd', 'worst', 'customer'],
 ['freakin', 'happy', 'november'],
 ['court',
  'bankd',
  'asset',
  'management',
  'stockport',
  'financial',
  'advisers'],
 ['ur', 'own', 'bank'],
 ['ur', 'own', 'rates'],
 ['god', 'damn', 'well'],
 ['i', 'wouldnt', 'wan', 'na'],
 ['needs', 'extra', 'training'],
 ['banka', 'bad', 'reputation'],
 ['math', 'typical', 'banka'],
 ['bankay', 'modern', 'videos'],
 ['reach',
  'bankd',
  'asset',
  'management',
  'stockport',
  'financial',
  'advisers'],
 ['customers', 'confidential', 'information'],
 ['account', 'fb', 'new', 'account'],
 ['day', 'glad', 'i'],
 ['loan',
  'bankd',
  'asset',
  'management',
  'stockport',
  'financial',
  'adv

## Bank Products

In [25]:
df_bank_products = pd.read_csv("data/product.csv", index_col="AutoID")
print(df_bank_products.describe())
df_bank_products.head()

              Year        Month
count  3039.000000  3039.000000
mean   2014.332675     7.712405
std       0.471249     3.637664
min    2014.000000     1.000000
25%    2014.000000     5.000000
50%    2014.000000     9.000000
75%    2015.000000    10.000000
max    2015.000000    12.000000


Unnamed: 0_level_0,Date,Year,Month,MediaType,FullText
AutoID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
79,1/1/2015,2015,1,facebook,if niggas keep inboxing w they Name shit all l...
93,11/18/2014,2014,11,facebook,swear i go Name Name Name Name i come in Name ...
108,4/3/2015,2015,4,facebook,if anyone has a BankA account i would strongly...
165,9/5/2014,2014,9,facebook,kudos todayto my financial institution BankD b...
431,4/6/2015,2015,4,facebook,if u have any active bank active bank account ...


In [26]:
for text in df_bank_products["FullText"].iloc[:100]:
    print(text.strip())

if niggas keep inboxing w they Name shit all like if i got a fuvking BankD bank card card Name gon violate like stfu b       fuck man
swear i go Name Name Name Name i come in Name Name off days Name Name BankD Name  Name hour Name Name i see Name bank account Name makes it Name worth Name
if anyone has a BankA account i would strongly recommend you checking your bank account to make sure that you do not have any unauthorized transactions going to paypal i had money with drawn from my account via paypal to someone i dont evem kno
kudos todayto my financial institution BankD bank they caught over half a dozen fraudulent purBankDs on my credit card this morning they had to cancel the card because of it hopefully they will catch the Name Name Name fingers are crossed Namei have not been in a home depot in over a yearName knows
if u have any active bank active bank account Name credit union union account and want to make  quick and   guaranteed add Name me or message me my brother works in 

In [27]:
try:
    with open('data/product.pickle', 'rb') as f:
        picked_data = pickle.load(f)
except NameError as e:
    print(e)

In [28]:
# Define Chunker Rules
# Taken from Su Nam Kim Paper...
#     {(<NN.*|N.*>+<VBZ><DT|RB>?<JJ.*><NN.*|N.*>+?)}
#     {<VB.*>+<RB><JJ.*>+}
#     {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
noun_chunker = nltk.RegexpParser('''
    NP:
        {(<NN.*|N.*>+<VBZ><DT|RB>?<JJ.*><NN.*|N.*>+?)}
        {<VB.*>+<RB><JJ.*>+}
    AJ:
        {<NN.*|N.*>+?<JJ.*>+<NN.*|N.*>+?}
''')

# Key Phrases from Facebook
noun_phrases = []
adj_phrases = []
for i, comment in enumerate(picked_data):
    tree = noun_chunker.parse(comment)
    
    noun_phrase = [w for w in get_terms(tree, 'NP')]
    if len(noun_phrase) > 0:
        for i, phrase in enumerate(noun_phrase):
            noun_phrases.append(phrase)
    
    adj_phrase = [w for w in get_terms(tree, 'AJ')]
    if len(adj_phrase) > 0:
        for i, phrase in enumerate(adj_phrase):
            adj_phrases.append(phrase)