In [1]:
# The future
#from __future__ import print_function, division, absolute_import

# Data wrangling libraries
import pandas as pd
import numpy as np
import re
from io import StringIO

import matplotlib.pyplot as plt
import glob as glob
import pickle as pickle

# Numpy shorthand stuff
from numpy import array

# NLTK shorthand stuff
import nltk
nltk.download('punkt')
from nltk.tokenize import TweetTokenizer, RegexpTokenizer, sent_tokenize, word_tokenize

# SK-learn library for splitting data
from sklearn.model_selection import train_test_split


%matplotlib inline

[nltk_data] Downloading package punkt to
[nltk_data]     /home/chadharness/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Borrowed some functions from the w266 utils.py file
# Miscellaneous helpers
def flatten(list_of_lists):
    """Flatten a list-of-lists into a single list.

    Args:
        list_of_lists: an iterable whose items are themselves iterable.

    Returns:
        A new list holding every inner item, in encounter order.  Only one
        level of nesting is removed.
    """
    # A nested comprehension keeps this helper free of ``itertools``, which
    # the notebook's import cell does not bring into scope.
    return [item for sublist in list_of_lists for item in sublist]


# Word processing functions
def canonicalize_digits(word):
    """Replace every digit of a purely non-alphabetic token with "DG".

    Args:
        word: a single token (string).

    Returns:
        ``word`` unchanged if it contains any alphabetic character.
        Otherwise each digit is replaced by the literal "DG"; when the result
        starts with "DG", commas (thousands separators) are removed as well.
        Each digit yields its own "DG" -- runs of digits are not collapsed.
    """
    if any(c.isalpha() for c in word):
        return word
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    word = re.sub(r"\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "")  # remove thousands separator
    return word


def canonicalize_word(word, wordset=None, digits=True, unk_token="<unk>"):
    """Normalize a single token.

    Steps, in order:
      1. Anything URL-shaped is replaced by "postedhyperlinkvalue".
      2. The token is lower-cased unless it is all upper case and longer
         than one character.
      3. A lower-case chunk of 2+ letters repeated three or more times is
         collapsed to one copy ("hahaha" -> "ha").
      4. Three or more identical lower-case letters in a row become two.
      5. A repeated leading non-vowel character is reduced to one.
      6. If ``digits`` is true, digits are replaced via
         ``canonicalize_digits`` (skipped when the token is already in
         ``wordset``).

    Args:
        word: the token to normalize.
        wordset: optional vocabulary container; when given, tokens not in it
            are mapped to ``unk_token``.
        digits: whether to canonicalize digits.
        unk_token: value returned for out-of-vocabulary tokens.  No
            ``constants`` module is imported in this notebook, so looking up
            ``constants.UNK_TOKEN`` here would raise NameError.
            NOTE(review): "<unk>" is assumed to match the token used by the
            borrowed utils -- confirm.

    Returns:
        The normalized token, or ``unk_token`` when ``wordset`` is given and
        does not contain it.
    """
    # Replace hyperlinks with one instance of "postedhyperlinkvalue".
    word = re.sub(r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*\/?\S*", "postedhyperlinkvalue", word)
    word = re.sub(r"(postedhyperlinkvalue)+", "postedhyperlinkvalue", word)

    # Keep all-caps tokens of two or more characters as they are; lower-case the rest.
    if not word.isupper() or len(word) == 1:
        word = word.lower()

    # Replace things like "hahaha" with "ha".
    word = re.sub(r"([a-z]{2,})\1{2,}", r"\1", word)
    # Replace three or more consecutive identical letters with two of that letter.
    word = re.sub(r"([a-z])\1{2,}", r"\1\1", word)
    # Replace a doubled (or longer) leading non-vowel with a single instance.
    word = re.sub(r"(^[^aeiou])\1{1,}", r"\1", word)

    # Replace digits with a stand-in token.
    if digits:
        if wordset is not None and word in wordset:
            return word
        word = canonicalize_digits(word)  # try to canonicalize numbers

    if wordset is None or word in wordset:
        return word
    return unk_token

    
def canonicalize_words(words, **kw):
    """Apply ``canonicalize_word`` to each token and return the results as a list.

    Any keyword arguments are forwarded unchanged to ``canonicalize_word``.
    """
    canonical = []
    for token in words:
        canonical.append(canonicalize_word(token, **kw))
    return canonical


# Made some helper functions of our own
from nltk.stem import PorterStemmer   
def stem_sentence(token_sent, stemmer=PorterStemmer()):
    """Stem every token of one tokenized sentence.

    Args:
        token_sent: iterable of tokens.
        stemmer: object exposing ``stem(word)``; defaults to a PorterStemmer
            created once, when this function is defined.

    Returns:
        A list with the stemmed form of each token, in the original order.
    """
    return [stemmer.stem(token) for token in token_sent]


def sent_plus_word_tokenize(series):
    """Split comments into sentences, then split every sentence into words.

    Args:
        series: iterable of comment strings.

    Returns:
        A pair ``(sentences, words)``:
          * ``sentences`` -- one list of sentence strings per comment;
          * ``words`` -- one list of word tokens per sentence, over all
            comments flattened in order.
    """
    # Sentence-split every comment first, keeping the per-comment grouping.
    sentences = [sent_tokenize(comment) for comment in series]

    # Word-tokenize each sentence; the comment grouping is dropped here.
    words = [
        word_tokenize(sentence)
        for comment_sentences in sentences
        for sentence in comment_sentences
    ]

    return sentences, words


def make_data(data, target='', commentfield='', tokenizer=None, canonize=True, stem=True):
    """Tokenize one text column of a DataFrame, optionally stemming and canonicalizing.

    Args:
        data: DataFrame holding the text.
        target: accepted for interface compatibility; not used by this function.
        commentfield: label of the column to process.
        tokenizer: object exposing ``tokenize(text)``.  When omitted, a
            ``TweetTokenizer`` is created for this call.
        canonize: when true, run ``canonicalize_words`` on every token list.
        stem: when true, stem every token (applied before canonicalization).

    Returns:
        A pair ``(comments, x_tokens)``: the selected column as returned by
        ``data.loc[:, commentfield]``, and a list with one token list per row.
    """
    comments = data.loc[:, commentfield]

    # Build the default tokenizer lazily so it is not created at definition time.
    if tokenizer is None:
        tokenizer = TweetTokenizer()

    # A list of token lists: one inner list per row of the column.
    tokenized_sentences = [tokenizer.tokenize(sentence) for sentence in comments.values.tolist()]

    if stem:
        # One stemmer serves every row; there is no need for a new one per sentence.
        stemmer = PorterStemmer()
        tokenized_sentences = [
            stem_sentence(token_sent=sentence, stemmer=stemmer)
            for sentence in tokenized_sentences
        ]

    if canonize:
        tokenized_sentences = [canonicalize_words(sentence) for sentence in tokenized_sentences]

    return comments, tokenized_sentences


def rawlist_to_xtokens(rawlist=['default arg'], vocab_list=[]):
    """Whitespace-split each raw string and keep only in-vocabulary tokens.

    Args:
        rawlist: iterable of raw text strings.
        vocab_list: container of accepted tokens (membership is tested with ``in``).

    Returns:
        One list per input string, holding the tokens found in ``vocab_list``
        in their original order.
    """
    return [
        [token for token in rawstring.split() if token in vocab_list]
        for rawstring in rawlist
    ]


def xtoken_to_raw(xtoken=['default','arg']):
    """Join a list of tokens into one string, separated by single spaces."""
    separator = ' '
    return separator.join(xtoken)


def raw_to_xtoken(raw_string='default arg'):
    """Split a raw string on runs of whitespace and return the token list."""
    return raw_string.split()


def model_diagnostics(model, data, labels, target_names, random=False, test_size=0.10, random_state=42):
    """Evaluate ``model`` on a held-out slice of ``data`` and summarize the result.

    Args:
        model: object exposing ``predict_classes(test_data)``.
            NOTE(review): presumably a Keras-style classifier -- confirm.
        data: model inputs; must support ``len()`` and slicing.
        labels: true labels aligned with ``data``.
        target_names: class display names passed to ``classification_report``.
        random: when true, draw the test set with ``train_test_split``;
            otherwise use the trailing portion of ``data``/``labels``.
        test_size: fraction of observations to evaluate on.
        random_state: seed forwarded to ``train_test_split``.

    Returns:
        A pair ``(confusion_matrix, classification_report)``.

    Raises:
        ValueError: if ``random`` is false and ``test_size`` rounds to zero
            observations.
    """
    # Imported locally: the notebook's import cell only brings in
    # ``sklearn.model_selection``, not ``sklearn.metrics``.
    from sklearn.metrics import classification_report, confusion_matrix

    if random:
        # Random split; only the test portion is needed here.
        _, test_data, _, test_labels = train_test_split(data, labels,
                                                        test_size=test_size,
                                                        random_state=random_state)
    else:
        # Last test_size fraction of the data, rounded to the nearest observation.
        idx = round(test_size * len(data))
        if idx == 0:
            # data[-0:] is data[0:], i.e. the *whole* dataset -- refuse rather
            # than silently evaluate on everything.
            raise ValueError("test_size selects zero observations; increase test_size or provide more data")
        test_data = data[-idx:]
        test_labels = labels[-idx:]

    pred_labels = model.predict_classes(test_data)

    print("Test data length is: ", len(test_data))
    print("Test label length is: ", len(test_labels))
    print("Pred label length is: ", len(pred_labels))

    confusionMatrix = confusion_matrix(test_labels, pred_labels)
    classificationReport = classification_report(test_labels, pred_labels, target_names=target_names)

    return confusionMatrix, classificationReport


# Function to aggregate all of the comments for a given subreddit(s)

# Data location example: reddit data for March 2018 downloaded to ~/parlancr/data/reddit/2018_03/
# File names: reddit_2018_03000000000000.csv - reddit_2018_03000000000047.csv

def export_subreddits(subs):
    """Collect every comment row belonging to the given subreddits.

    Reads each CSV matching ``./data/reddit/*/reddit_*.csv`` (in sorted path
    order), keeps the rows whose ``subreddit`` column is in ``subs``, and
    stacks them into one DataFrame with a fresh 0..n-1 index.

    Args:
        subs: list-like of subreddit names to keep.

    Returns:
        The combined DataFrame, or an empty DataFrame when no file matches.
    """
    file_stem = './data/reddit/*/reddit_*.csv'

    # Gather the filtered partitions and concatenate once at the end:
    # ``DataFrame.append`` no longer exists in pandas 2.x, and growing a frame
    # inside the loop re-copies all accumulated rows on every iteration.
    selected_partitions = []
    for f in sorted(glob.glob(file_stem)):
        print('Loading comments from: ', f)
        partition_comments = pd.read_csv(f)
        selected_partitions.append(partition_comments[partition_comments['subreddit'].isin(subs)])

    if not selected_partitions:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()

    return pd.concat(selected_partitions, ignore_index=True)


def build_model_input(pandas_df, commentfield, post_length, sent_length, tokenizer=TweetTokenizer()):
    """Turn a column of raw posts into a one-column frame of cleaned sentences.

    Args:
        pandas_df: DataFrame holding the raw text.
        commentfield: label of the text column; also the label of the output column.
        post_length: posts with more than this many sentences are dropped.
        sent_length: sentences with more than this many tokens are dropped.
        tokenizer: tokenizer handed to ``make_data``.

    Returns:
        A DataFrame with one column, ``commentfield``, containing one cleaned
        sentence (tokens re-joined with spaces) per row.
    """
    raw_posts = pandas_df[commentfield].dropna().values

    # Only the per-post sentence lists are used; the word tokens are discarded.
    post_sentences, _ = sent_plus_word_tokenize(raw_posts)

    # Keep the sentences of every post that is short enough.
    kept_sentences = []
    for post in post_sentences:
        if len(post) <= post_length:
            kept_sentences.extend(post)

    sents_pd = pd.DataFrame({commentfield: kept_sentences})

    # Re-tokenize and canonicalize sentence by sentence (no stemming).
    _, x_tokens = make_data(sents_pd, commentfield=commentfield, canonize=True, stem=False, tokenizer=tokenizer)

    # Drop over-long sentences and rebuild each remaining one as a plain string.
    raw_list = [xtoken_to_raw(sent) for sent in x_tokens if len(sent) <= sent_length]

    return pd.DataFrame({commentfield: raw_list})

## Load Data & Transform

In [3]:
# Show the working directory: the relative '../../' paths used below are resolved against it.
!pwd

/home/chadharness/w210/Parlancr/data_pipeline


In [4]:
# Load the modern-English corpus as plain text, one sentence per row, in a single column labelled 0.
# Plain line reading is used instead of pd.read_csv(sep="\n"): current pandas releases reject a
# newline separator, and CSV parsing treats double quotes as field quoting, which can fuse
# several lines into one row.  Blank lines are skipped.
with open('../../Shakespearizing-Modern-English/data/test.modern.nltktok', encoding='utf-8') as corpus_file:
    shakespeare_modern = pd.DataFrame({0: [line.rstrip('\n') for line in corpus_file if line.strip()]})

In [5]:
# Preview the first rows of the raw modern-English text.
shakespeare_modern.head()

Unnamed: 0,0
0,A jumbled confession can only receive a jumble...
1,I love rich Capulet's daughter .
2,We're bound to each other in every possible wa...
3,I'll tell you more later about when and where ...
4,"Holy Saint Francis , this is a drastic change !"


In [6]:
# Number of non-null rows loaded from the modern-English file.
shakespeare_modern.count()

0    1462
dtype: int64

In [7]:
# Canonicalize the modern-English lines (no stemming) using a tweet-aware tokenizer.
mod_comments, x_tokens = make_data(
    shakespeare_modern,
    commentfield=0,
    canonize=True,
    stem=False,
    tokenizer=TweetTokenizer(),
)

# Keep lines of at most 112 tokens, then glue each token list back into one string.
tokens = [sent for sent in x_tokens if len(sent) <= 112]
raw_list = [xtoken_to_raw(sent) for sent in tokens]

# Single-column frame, labelled 0 like the input frame.
mod_final = pd.DataFrame({0: raw_list})

In [8]:
# Preview the cleaned modern-English lines.
mod_final.head()

Unnamed: 0,0
0,a jumbled confession can only receive a jumble...
1,i love rich capulet's daughter .
2,we're bound to each other in every possible wa...
3,i'll tell you more later about when and where ...
4,"holy saint francis , this is a drastic change !"


In [9]:
# Row count after the token-length filter.
mod_final.count()

0    1462
dtype: int64

In [133]:
# Sentence-level version of the modern text: each row is split into sentences,
# rows with more than 10 sentences are dropped, and sentences over 112 tokens are removed.
mod_pd = build_model_input(
    pandas_df=shakespeare_modern,
    commentfield=0,
    post_length=10,
    sent_length=112,
)

In [134]:
# Preview the sentence-level modern-English frame.
mod_pd.head()

Unnamed: 0,0
0,"now , you lie there on the path ."
1,"she said if she were interested in someone , i..."
2,"besides , she treats me more respectfully than..."
3,what's the obvious conclusion from that ?
4,"just think , i could be count malvolio !"


In [120]:
# Row count after sentence splitting and length filtering (one row per sentence).
mod_pd.count()

0    1484
dtype: int64

In [94]:
# Load the original (Shakespearean) corpus as plain text, one sentence per row, in a single column labelled 0.
# Plain line reading is used instead of pd.read_csv(sep="\n"): current pandas releases reject a
# newline separator, and CSV parsing treats double quotes as field quoting, which can fuse
# several lines into one row.  Blank lines are skipped.
with open('../../Shakespearizing-Modern-English/data/test.original.nltktok', encoding='utf-8') as corpus_file:
    shakespeare_original = pd.DataFrame({0: [line.rstrip('\n') for line in corpus_file if line.strip()]})

In [95]:
# Preview the first rows of the raw original-English text.
shakespeare_original.head()

Unnamed: 0,0
0,"Lie thou there ( throwing down a letter ) , fo..."
1,"Maria once told me she did affect me , and I h..."
2,"Besides , she uses me with a more exalted resp..."
3,What should I think on 't ?
4,To be Count Malvolio !


In [96]:
# Number of non-null rows loaded from the original-English file.
shakespeare_original.count()

0    1218
dtype: int64

In [97]:
# Canonicalize the original-English lines (no stemming) using a tweet-aware tokenizer.
orig_comments, x_tokens = make_data(
    shakespeare_original,
    commentfield=0,
    canonize=True,
    stem=False,
    tokenizer=TweetTokenizer(),
)

# Keep lines of at most 156 tokens, then glue each token list back into one string.
tokens = [sent for sent in x_tokens if len(sent) <= 156]
raw_list = [xtoken_to_raw(sent) for sent in tokens]

# Single-column frame, labelled 0 like the input frame.
orig_final = pd.DataFrame({0: raw_list})

In [98]:
# Preview the cleaned original-English lines.
orig_final.head()

Unnamed: 0,0
0,"lie thou there ( throwing down a letter ) , fo..."
1,"maria once told me she did affect me , and i h..."
2,"besides , she uses me with a more exalted resp..."
3,what should i think on ' t ?
4,to be count malvolio !


In [99]:
# Row count after the token-length filter.
orig_final.count()

0    1218
dtype: int64

In [138]:
# Sentence-level version of the original text: each row is split into sentences,
# rows with more than 10 sentences are dropped, and sentences over 112 tokens are removed.
# NOTE(review): the manual pass over this corpus uses a 156-token cap -- confirm 112 is intended here.
orig_pd = build_model_input(
    pandas_df=shakespeare_original,
    commentfield=0,
    post_length=10,
    sent_length=112,
)

In [139]:
# Preview the sentence-level original-English frame.
orig_pd.head()

Unnamed: 0,0
0,"lie thou there ( throwing down a letter ) , fo..."
1,"maria once told me she did affect me , and i h..."
2,"besides , she uses me with a more exalted resp..."
3,what should i think on ' t ?
4,to be count malvolio !


In [140]:
# Row count after sentence splitting and length filtering (one row per sentence).
orig_pd.count()

0    1251
dtype: int64

In [141]:
# Spot-check position 26; the double brackets return a one-row DataFrame rather than a Series.
# NOTE(review): presumably compared by eye with the same position in mod_pd -- confirm.
orig_pd.iloc[[26]]

Unnamed: 0,0
26,"be you his eunuch , and your mute i'll be ."


In [142]:
# Spot-check position 26; the double brackets return a one-row DataFrame rather than a Series.
# NOTE(review): presumably compared by eye with the same position in orig_pd -- confirm.
mod_pd.iloc[[26]]

Unnamed: 0,0
26,"you can be a eunuch , but i'll be mute ."


In [48]:
# Write the cleaned modern-English lines, one per row, without header or index.
# NOTE(review): to_csv applies CSV quoting, so a line containing a double quote or a tab is
# written wrapped in quotes -- confirm the downstream reader expects that.
mod_final.to_csv(
    '../../data/shakes_mod.train.0',
    sep='\t',
    index=False,
    header=False,
)

In [100]:
# Write the cleaned original-English lines, one per row, without header or index.
# NOTE(review): this is the original-English text saved under a 'shakes_mod' / 'valid' file name,
# while the modern text is saved as 'train' -- confirm the naming is intended.
# NOTE(review): to_csv applies CSV quoting, so a line containing a double quote or a tab is
# written wrapped in quotes -- confirm the downstream reader expects that.
orig_final.to_csv(
    '../../data/shakes_mod.valid.1',
    sep='\t',
    index=False,
    header=False,
)