In [53]:
import nltk, re, math, collections, lda, random
import numpy as np
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import time


# Language whose NLTK stopword list is loaded below.
LANGUAGE = "english"
# Stopwords to drop during tokenization: the NLTK list for LANGUAGE plus 'even', '.' and ',',
# minus the question words, which are taken out of the set so they are not treated as stopwords.
STOPWORD_SET = (set(nltk.corpus.stopwords.words(LANGUAGE)) | set(['even', '.', ','])) - set(['who', 'why', 'how', 'where', 'when', 'what', 'whom']) 
# CSV file loaded by the preprocessing step.
FILENAME = "comments_raw_merged_with_page_data_190716.csv"
# Name of the data-frame column that holds the message text.
COLUMN_NAME = "message"

def is_ASCII_string(string):
    '''
        Checks if string is pure ASCII. Always run on pre-processed csv files!
        Input: byte string or text (unicode) string.
        Output: True if the whole string is ASCII; False if it contains any
                non-ASCII content or is not a string-like object at all.
    '''
    try:
        if isinstance(string, (bytes, bytearray)):
            # Byte strings (Python 2 `str`, Python 3 `bytes`) must decode cleanly.
            string.decode('ascii')
        else:
            # Text strings (Python 2 `unicode`, Python 3 `str`) must encode cleanly.
            string.encode('ascii')
        return True
    except (UnicodeError, AttributeError):
        # UnicodeError: non-ASCII content. AttributeError: object has no
        # encode/decode, i.e. it is not a string; treated as "not ASCII".
        return False

def ascii_substituted(string):
    '''
        Returns string with a fixed list of troublesome sequences swapped for ASCII
        equivalents, then stripped of leading/trailing whitespace.
        The list is hand-maintained; if a package can do this job, let @Lumpy know.
        Input: String.
        Output: String.
    '''
    # Applied strictly in this order: each replacement sees the result of the previous one.
    substitutions = (
        ("\\n", " "),               # literal backslash + 'n' left in the raw text
        ("&amp;", "&"),
        ('&#039;', '\''),
        ("&quot;", "\""),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("\xe2\x80\x93", "-"),      # UTF-8 bytes of an en dash
        ("\xe2\x80\x99", "\'"),     # UTF-8 bytes of a right single quotation mark
    )
    cleaned = string
    for target, replacement in substitutions:
        cleaned = cleaned.replace(target, replacement)
    return cleaned.strip()


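### NOTE: the data frame must already be cleaned (see STEP 1: PREPROCESSING) before it is passed to the functions below.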
def get_message_list(data_frame, column_name = "message"):
    '''
        Extracts one column of a pre-CLEANED, pre-subsetted data frame as a flat list.
        Input: pandas data frame.
        Arguments:
            column_name:    Column to extract      Default is "message"
        Output: List with one entry per row, in row order.
    '''
    # Selecting with a list of columns yields one single-element row per record.
    rows = data_frame[[column_name]].values.tolist()
    messages = []
    for row in rows:
        messages.extend(row)
    return messages


def is_eligible_word(token, stopwords = STOPWORD_SET, regex_string = "^[^a-zA-Z0-9]+"):
    '''
        Private helper function.
        Decides whether token may be kept as a token.
        Returns False when regex_string matches at the start of token (with the default
        pattern: token begins with a non-alphanumeric character) or when token is in
        stopwords; returns True otherwise.
    '''
    has_forbidden_prefix = re.match(regex_string, token) is not None
    return not (has_forbidden_prefix or token in stopwords)


def append_NOTs(tokenized_message, stopwords = STOPWORD_SET):
    '''
        Private helper function to handle negations.
        Given a List of words, returns a new List in which each negation word is removed
        and the first following non-stopword is prefixed with "not_".
        Input: List of Strings (left unmodified).
        Arguments:
            stopwords:      Words skipped when looking for the word to negate.
        Output: List of Strings.
        Notes:
            - A negation that is the very last token is kept as-is.
            - A negation followed only by stopwords is dropped without prefixing anything.
    '''
    negations = frozenset(["n\'t", "n\"t", "no", "not", "didnt"])
    # Work on a copy so the caller's list is never mutated.
    tokens = list(tokenized_message)
    last_index = len(tokens) - 1
    new_message = []
    # enumerate reads each slot when it is reached, so a token that an earlier
    # negation already prefixed with "not_" is seen in its prefixed form.
    for i, token in enumerate(tokens):
        if token in negations and i != last_index:
            for j in range(i + 1, len(tokens)):
                if tokens[j] not in stopwords:
                    tokens[j] = "not_" + tokens[j]
                    break
        else:
            new_message.append(token)
    return new_message


def split_message_into_tokens(untokenized_message, ngram = 1, stopwords = STOPWORD_SET, regex_string = "^[^a-zA-Z0-9]+", handle_negations = True): 
    '''
        Splits ONE message in the list of messages into words/tokens, in the process doing the following:
        1) Changing to lowercase
        2) Tokenizing with nltk.word_tokenize
        3) Dealing with negations (optional)
        4) N-gramming (only when ngram != 1)
        5) Stopword/regex removal
        6) Lemmatizing the tokens that remain
        Input: String.
        Arguments: 
            ngram:              n in n-grams                        Default is 1
            stopwords           Set of stopwords to remove          Default is the global variable
            regex_string        String containing regex pattern     Default is "^[^a-zA-Z0-9]+"
            handle_negations:   Do you want negations handled?      Default is True
        Output: 
            ngram == 1:     List of Strings.
            otherwise:      List of Lists of Strings, one inner list per n-gram. Ineligible tokens
                            are dropped from each n-gram, so an inner list may hold fewer than n items.
    '''
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
    # Split into tokens
    tokenized_message = nltk.word_tokenize(untokenized_message.lower())
    # Deal with "not"s
    if handle_negations:
        tokenized_message = append_NOTs(tokenized_message, stopwords)
    # Lemmatize, n-gram, stopword-removal.
    if ngram == 1:
        return [wordnet_lemmatizer.lemmatize(token) for token in tokenized_message if is_eligible_word(token, stopwords, regex_string)]
    # nltk.util.ngrams yields one tuple of `ngram` consecutive tokens at a time.
    return [
        [wordnet_lemmatizer.lemmatize(token) for token in ngram_indiv if is_eligible_word(token, stopwords, regex_string)]
        for ngram_indiv in nltk.util.ngrams(tokenized_message, ngram)
        if len(ngram_indiv) > 0
    ]


def tokenize(message_list, ngram = 1, stopwords = STOPWORD_SET, regex_string = "^[^?a-zA-Z0-9]+", handle_negations = True, pos_tagset = "no-pos"):
    '''
        Tokenizes a list of strings. Assumes that message_list has been preprocessed.
        Input: List of Strings.
        Arguments:
            ngram               n in n-grams                                                                                Default is 1
            stopwords:          Set of stopwords to remove.                                                                 Default is the global variable.
            regex_string        String containing regex pattern; tokens matching it at their start are dropped.             Default is "^[^?a-zA-Z0-9]+"
                                (unlike the helpers' default, this one lets tokens starting with "?" through)
            handle_negations:   Do you want "not X" to be glued as "not_X"? Improves sentiment recognition.                 Default is True
            pos_tagset:         For POS tagging. Current options are "universal", None (which maps to nltk.pos_tag's 
                                default tagset) and "no-pos", which really means no POS tagging. 
                                Only applicable for unigrams.                                                               Default is "no-pos"
        Output: List of List of Strings; when POS tagging is performed, each message is instead
                whatever nltk.pos_tag returns for its token list.
        Note: will automatically change to lowercase, and will automatically lemmatize.
    '''
    token_lists = [split_message_into_tokens(untokenized_message, ngram = ngram, stopwords = stopwords, regex_string = regex_string, handle_negations = handle_negations) for untokenized_message in message_list]
    if ngram == 1 and (pos_tagset is None or pos_tagset == "universal"): ## Will perform POS-tagging
        # nltk.pos_tag's keyword is `tagset`, not `pos_tagset`.
        return [nltk.pos_tag(token_list, tagset = pos_tagset) for token_list in token_lists]
    return token_lists



In [54]:
## STEP 1: PREPROCESSING
# Load the raw comments, substitute the known troublesome sequences in every message,
# and keep only the rows whose substituted message is pure ASCII.
df = pd.read_csv(FILENAME, delimiter = ',')
faulty_arrays = []  # row numbers whose message is still non-ASCII after substitution
good_arrays = []    # row numbers whose message was cleaned and kept

# Materialise the column once; rebuilding it inside the loop made the pass quadratic.
raw_messages = df[COLUMN_NAME].values.tolist()
for i, message in enumerate(raw_messages):
    cleaned_message = ascii_substituted(str(message))
    if is_ASCII_string(cleaned_message):
        # df.at replaces the deprecated DataFrame.set_value. i serves as both row label and
        # row position here because read_csv (no index_col) assigns a default 0..n-1 index.
        df.at[i, COLUMN_NAME] = cleaned_message
        good_arrays.append(i)
    else:
        faulty_arrays.append(i)

# Drop the rows that could not be made ASCII.
df = df.iloc[good_arrays, :]

In [55]:
# Flatten the message column of the cleaned data frame into a plain list.
msg_list = get_message_list(df)
# Measure how long tokenizing every message takes (elapsed seconds are printed below).
t = time.time()
# Tokenize with the default settings of tokenize().
tokenz = tokenize(msg_list)
print (time.time() - t)
# Bare expression: displays the token lists as the notebook cell's output.
tokenz

1.81709384918


[['good', 'information'],
 [u'feel', 'like', 'ad', 'credit', u'card', 'mentioned'],
 ['account',
  'manager',
  'american',
  'express',
  'not_explain',
  'closed',
  'account',
  'pending',
  'balance',
  'what',
  'happen',
  u'point',
  'earned',
  '?'],
 ['american',
  'express',
  'allowed',
  'carry',
  '14,000',
  'balance',
  'accrued',
  'interest',
  'late',
  'jacked',
  '29.9',
  'one',
  u'way',
  'paying',
  'transfer',
  '6500',
  'new',
  '0',
  'apr',
  'card',
  'maxed',
  'new',
  'credit',
  'line',
  '6500',
  'part',
  'pay',
  'year',
  'plan',
  'getting',
  'rid',
  'amex',
  'debt',
  'first',
  'paying',
  'new',
  'card',
  '0',
  'apr',
  'promotion',
  u'end',
  'however',
  'concerned',
  'maxed',
  'available',
  'credit',
  'hurting',
  'credit',
  'score',
  'amex',
  'never',
  'gave',
  'limit',
  'let',
  'pay',
  'time',
  'know',
  'answer',
  'please',
  'email',
  'jsagewalker',
  'gmail.com'],
 [],
 [],
 ['gave', 'rec', 'card', 'considering'],

In [47]:
def obtain_Message_Token_List(df, category, ngram = 1, split_by = 'page_vertical_tx', messageColumnName = "message", lang = "english", regexPattern = "^[^a-zA-Z0-9]+", pos_tagset = 'universal', specialNotToRemove = set([]), isNormalized = True, isLemmatized = True, stopWordsRemoved = True, handleNegations = True, puncRemoved = True, posTag = True):
    '''
        Builds the token (or n-gram) list of every message belonging to one category.
        Input: data frame, plus the category value to select (None selects every row).
        Arguments:
            ngram:                  n in n-grams; POS tagging only happens for ngram == 1.
            split_by:               Column compared against category.
            messageColumnName:      Column holding the message text.
            pos_tagset:             Tagset handed to nltk.pos_tag.
            isNormalized:           Forwarded to tokenize_message_set.
            posTag:                 Whether to POS-tag unigrams.
            lang, regexPattern, specialNotToRemove, isLemmatized, stopWordsRemoved,
            handleNegations, puncRemoved:
                                    Forwarded unchanged to get_ngram_set.
        Output: List with one entry per message; [] if no message was found.
        Note: specialNotToRemove's default is a single shared set object, so it must not be mutated.
    '''
    ## Step 1: get the messages perfectly, dealing with weird cases
    # Forward the caller's column choices instead of hard-coding the defaults.
    messageSet = get_message_set(df, category, split_by = split_by, messageColumnName = messageColumnName)
    if len(messageSet) == 0:
        return []
    ## Step 2: Tokenize and (optionally) normalize
    messageToken = tokenize_message_set(messageSet, isNormalized)
    ## Steps 3-4: stop words, punctuation, lemmatizing and n-gramming are delegated to get_ngram_set
    ngramSets = [get_ngram_set(message, ngram, isLemmatized = isLemmatized, stopWordsRemoved = stopWordsRemoved, puncRemoved = puncRemoved, regexPattern = regexPattern, lang = lang, specialNotToRemove = specialNotToRemove, handleNegations = handleNegations) for message in messageToken]
    ## Step 5: POS-tag?
    if ngram == 1 and posTag:
        return [nltk.pos_tag(ngramSet, tagset = pos_tagset) for ngramSet in ngramSets]
    return ngramSets

            
def get_message_set(df, category, split_by = 'page_vertical_tx', messageColumnName = "message"):
    '''
        Collects the cleaned, ASCII-only messages of one category.
        Input: data frame, plus the category value to select (None keeps every row).
        Arguments:
            split_by:               Column compared against category.
            messageColumnName:      Column holding the message text.
        Output: List of Strings. Each message is passed through replaceString and kept only
                if isASCIIString accepts the result. Returns [] (after printing a warning)
                when the selection has no rows.
    '''
    if category is not None:
        df = df.loc[df[split_by] == category]
    if len(df.shape) > 0 and df.shape[0] == 0:
        print("Category \'" + str(category) + "\' has no rows. Check spelling.")
        return []
    cleanedMessages = []
    for messageL in df[[messageColumnName]].values.tolist():
        for message in messageL:
            # Substitute once and reuse the result for both the check and the output.
            cleaned = replaceString(str(message))
            if isASCIIString(cleaned):
                cleanedMessages.append(cleaned)
    return cleanedMessages

def isASCIIString(string):
    '''
        Returns True if string (byte string or text string) is pure ASCII.
        Returns False for non-ASCII content and for objects that are not strings.
    '''
    try:
        if isinstance(string, (bytes, bytearray)):
            # Byte strings (Python 2 `str`, Python 3 `bytes`) must decode cleanly.
            string.decode('ascii')
        else:
            # Text strings (Python 2 `unicode`, Python 3 `str`) must encode cleanly.
            string.encode('ascii')
        return True
    except (UnicodeError, AttributeError):
        # UnicodeError: non-ASCII content. AttributeError: not a string-like object.
        return False

# Just some ASCII coding issues
def replaceString(string):
    '''
        Returns string with escaped newlines, a few HTML entities and two UTF-8 byte
        sequences swapped for plain ASCII characters, then stripped of surrounding whitespace.
    '''
    # Each step operates on the output of the previous one, so the order matters.
    result = string.replace("\\n", " ")
    result = result.replace("&amp;", "&")
    result = result.replace('&#039;', '\'')
    result = result.replace("&quot;", "\"")
    result = result.replace("&lt;", "<")
    result = result.replace("&gt;", ">")
    result = result.replace("\xe2\x80\x93", "-")
    result = result.replace("\xe2\x80\x99", "\'")
    return result.strip()


In [12]:
# Stopword set: the NLTK list for LANGUAGE plus 'even', minus the question words.
# NOTE(review): rebinding this name does not change the `stopwords = STOPWORD_SET` defaults of
# functions that were already defined; those keep the set that existed at definition time.
STOPWORD_SET = (set(nltk.corpus.stopwords.words(LANGUAGE)) | set(['even'])) - set(['who', 'why', 'how', 'where', 'when', 'what', 'whom']) 

In [63]:
# Group the rows of the data frame by their 'post_id' value.
posts = df.groupby('post_id')
# Bare expression: displays the mapping from each post_id to the positions of its rows.
posts.indices

{155648: array([1911]),
 139265: array([1596]),
 163842: array([2167, 2168]),
 192516: array([2996]),
 90117: array([759, 760, 761, 762, 763, 764, 765, 766, 767]),
 96262: array([931]),
 120840: array([1244]),
 182281: array([2889]),
 63498: array([277, 278]),
 174092: array([2498]),
 262758: array([4740, 4741, 4742, 4743, 4744, 4745, 4746, 4747]),
 77838: array([554, 555]),
 221869: array([3584]),
 49168: array([114, 115]),
 59409: array([148]),
 100370: array([1033]),
 155667: array([1912, 1913]),
 210964: array([3410, 3411, 3412, 3413, 3414, 3415, 3416, 3417, 3418, 3419, 3420,
        3421]),
 151573: array([1798]),
 79894: array([624]),
 100377: array([1034]),
 213453: array([3466, 3467, 3468]),
 163867: array([2169, 2170, 2171]),
 203098: array([3276, 3277, 3278, 3279, 3280, 3281, 3282, 3283]),
 245791: array([4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266, 4267, 4268, 4269,
        4270, 4271]),
 188289: array([2941, 2942, 2943, 2944, 2945, 2946, 2947, 2948]),
 86704: array([707,