In [1]:
# The future
#from __future__ import print_function, division, absolute_import

# Data wrangling libraries
import pandas as pd
import numpy as np
import re
from io import StringIO

import matplotlib.pyplot as plt
import glob as glob
import pickle as pickle

# Numpy shorthand stuff
from numpy import array

# NLTK shorthand stuff
import nltk
nltk.download('punkt')
from nltk.tokenize import TweetTokenizer, RegexpTokenizer, sent_tokenize, word_tokenize

# SK-learn library for splitting data
from sklearn.model_selection import train_test_split


%matplotlib inline

[nltk_data] Downloading package punkt to
[nltk_data]     /home/alexander_mpa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Borrowed some functions from the w266 utils.py file
# Miscellaneous helpers
def flatten(list_of_lists):
    """Flatten a list-of-lists into a single list.

    Only one level of nesting is removed; the order of the items is kept.
    Implemented as a comprehension because `itertools` is not imported
    anywhere in this notebook, so the previous `itertools.chain` call
    raised NameError.
    """
    return [item for sublist in list_of_lists for item in sublist]


# Word processing functions
def canonicalize_digits(word):
    """Replace every digit in a purely non-alphabetic token with "DG".

    Tokens containing at least one alphabetic character are returned
    unchanged. If the result starts with "DG", commas are removed so that
    thousands separators do not survive (e.g. "1,234" -> "DGDGDGDG").
    """
    if any(c.isalpha() for c in word):
        return word
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    word = re.sub(r"\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "")  # remove thousands separator
    return word


def canonicalize_word(word, wordset=None, digits=True):
    """Normalize a single token.

    Steps, in order:
      1. Replace anything matching the hyperlink pattern with a single
         "postedhyperlinkvalue" marker.
      2. Lower-case the token unless it is all upper case and longer than
         one character.
      3. Collapse a group of 2+ lowercase letters repeated three or more
         times into one copy ("hahaha" -> "ha").
      4. Collapse three or more identical lowercase letters into two.
      5. Collapse a repeated leading character (anything other than a
         lowercase vowel) into a single one ("llama" -> "lama").
      6. If `digits` is true, return the token as-is when it is already in
         `wordset`, otherwise pass it through `canonicalize_digits`.

    Args:
        word: the token (string) to normalize.
        wordset: optional container of known words. When given, a token
            that is not in it after normalization is replaced by the
            unknown-word token.
        digits: whether to replace digits with the "DG" stand-in.

    Returns:
        The normalized token, or the unknown-word token.
    """
    # Replace hyperlinks with one instance of "postedhyperlinkvalue"
    word = re.sub(r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)*\/?\S*", "postedhyperlinkvalue", word)
    word = re.sub(r"(postedhyperlinkvalue)+", "postedhyperlinkvalue", word)
    # Lower-case everything except all-caps tokens of two or more characters
    if not word.isupper() or len(word) == 1:
        word = word.lower()
    # Replace things like hahaha with ha
    word = re.sub(r"([a-z]{2,})\1{2,}", r"\1", word)
    # Replace three or more consecutive, identical letters with two of that letter
    word = re.sub(r"([a-z])\1{2,}", r"\1\1", word)
    # Replace a repeated leading character (other than a lowercase vowel) with one of it
    word = re.sub(r"(^[^aeiou])\1{1,}", r"\1", word)

    # Replace digits with a stand-in token
    if digits:
        if (wordset is not None) and (word in wordset):
            return word
        word = canonicalize_digits(word)  # try to canonicalize numbers
    if (wordset is None) or (word in wordset):
        return word

    # `constants` is not imported anywhere in this notebook, so use it only
    # if the surrounding session happens to provide it.
    try:
        return constants.UNK_TOKEN
    except NameError:
        # NOTE(review): "<unk>" is assumed to match the UNK_TOKEN of the
        # w266 utilities these helpers were borrowed from - confirm.
        return "<unk>"

    
def canonicalize_words(words, **kw):
    """Apply `canonicalize_word` to each token, forwarding any keyword options."""
    canonical = []
    for token in words:
        canonical.append(canonicalize_word(token, **kw))
    return canonical


# Made some helper functions of our own
from nltk.stem import PorterStemmer   
def stem_sentence(token_sent, stemmer=PorterStemmer()):
    """Return a new list holding `stemmer.stem(token)` for each token of `token_sent`."""
    return [stemmer.stem(token) for token in token_sent]


def sent_plus_word_tokenize(series):
    """Split each comment into sentences and each sentence into word tokens.

    Returns:
        sentences: one list of sentence strings per comment (nested).
        words: one list of word tokens per sentence, flat across all
            comments and in the same order as the sentences.
    """
    sentences = [sent_tokenize(comment) for comment in series]
    words = [
        word_tokenize(sentence)
        for comment_sentences in sentences
        for sentence in comment_sentences
    ]
    return sentences, words


def make_data(data, target='', commentfield='', tokenizer=TweetTokenizer(), canonize=True, stem=True):
    """Tokenize, and optionally stem and canonicalize, one text column.

    Args:
        data: frame indexed with `.loc[:, commentfield]`.
        target: currently unused (labels are not extracted or returned).
        commentfield: name of the column holding the raw text.
        tokenizer: object exposing `tokenize(text)`; called once per row.
        canonize: when true, run `canonicalize_words` over every token list.
        stem: when true, stem every token with a Porter stemmer. Stemming
            happens before canonicalization.

    Returns:
        (comments, x_tokens): the selected column, and a list with one
        list of processed tokens per row.
    """
    # Separate comments
    comments = data.loc[:, commentfield]

    # A list of lists of tokens: one inner list per row of the column
    tokenized_sentences = [tokenizer.tokenize(text) for text in comments.values.tolist()]

    if stem:
        # One stemmer for the whole column instead of a new one per sentence
        stemmer = PorterStemmer()
        tokenized_sentences = [
            stem_sentence(token_sent=sentence, stemmer=stemmer)
            for sentence in tokenized_sentences
        ]

    if canonize:
        # A list of lists of scrubbed tokens; token == word, list == sentence
        tokenized_sentences = [canonicalize_words(sentence) for sentence in tokenized_sentences]

    return comments, tokenized_sentences


def rawlist_to_xtokens(rawlist=['default arg'], vocab_list=[]):
    """Whitespace-split each raw string and keep only in-vocabulary tokens.

    Args:
        rawlist: iterable of strings.
        vocab_list: container of allowed tokens.

    Returns:
        A list with one list of kept tokens per input string, in order.
    """
    vocab = vocab_list
    if isinstance(vocab_list, (list, tuple)):
        # Hash the vocabulary once so each membership test is O(1) rather
        # than a scan of the whole list for every token.
        try:
            vocab = frozenset(vocab_list)
        except TypeError:
            # Unhashable entries: fall back to scanning the original container.
            vocab = vocab_list
    return [[token for token in rawstring.split() if token in vocab] for rawstring in rawlist]


def xtoken_to_raw(xtoken=['default','arg']):
    """Join a list of tokens into one string, separated by single spaces."""
    return ' '.join(xtoken)


def raw_to_xtoken(raw_string='default arg'):
    """Split a string on runs of whitespace and return the list of tokens."""
    return raw_string.split()


def model_diagnostics(model, data, labels, target_names, random=False, test_size=0.10, random_state=42):
    """Evaluate `model` on a held-out slice of `data`.

    Args:
        model: object exposing `predict_classes(test_data)`.
        data: sliceable collection of model inputs.
        labels: sliceable collection of true labels, aligned with `data`.
        target_names: class names passed to the classification report.
        random: if true, pick the test set with `train_test_split`;
            otherwise use the trailing portion of `data` / `labels`.
        test_size: fraction of the observations used as test data
            (rounded up to the next whole observation).
        random_state: seed forwarded to `train_test_split`.

    Returns:
        (confusionMatrix, classificationReport) as produced by
        `sklearn.metrics.confusion_matrix` and
        `sklearn.metrics.classification_report`.
    """
    # Local imports: the notebook's import cell only brings in
    # sklearn.model_selection, so `metrics` / `classification_report`
    # were previously undefined names.
    import math
    from sklearn import metrics

    if random:
        # Designate random test_size% of data (rounded up to next obs) as test data
        _, test_data, _, test_labels = train_test_split(data, labels,
                                                        test_size=test_size,
                                                        random_state=random_state)
    else:
        # Designate last test_size% of data (rounded up to next obs) as test data.
        # ceil() does the rounding up that the comment describes. Slicing from an
        # explicit start index avoids the `x[-0:]` pitfall, where a zero-sized
        # test set silently became the entire dataset.
        n_test = math.ceil(test_size * len(data))
        test_data = data[max(len(data) - n_test, 0):]
        test_labels = labels[max(len(labels) - n_test, 0):]

    pred_labels = model.predict_classes(test_data)

    print("Test data length is: ", len(test_data))
    print("Test label length is: ", len(test_labels))
    print("Pred label length is: ", len(pred_labels))

    confusionMatrix = metrics.confusion_matrix(test_labels, pred_labels)
    classificationReport = metrics.classification_report(test_labels, pred_labels, target_names=target_names)

    return confusionMatrix, classificationReport


# Function to aggregate all of the comments for a given subreddit(s)

# Data location example: reddit data for March 2018 downloaded to ~/parlancr/data/reddit/2018_03/
# File names: reddit_2018_03000000000000.csv - reddit_2018_03000000000047.csv

def export_subreddits(subs):
    """Collect every comment row whose `subreddit` is in `subs`.

    Reads each CSV matching ./data/reddit/*/reddit_*.csv in sorted path
    order, keeps the rows whose 'subreddit' value is in `subs`, and
    returns them as one DataFrame with a fresh 0..n-1 index. Returns an
    empty DataFrame when no file matches the pattern.
    """
    file_stem = './data/reddit/*/reddit_*.csv'

    # Collect the filtered partitions and concatenate once. Growing a frame
    # with DataFrame.append in a loop recopies all rows on every iteration,
    # and the method no longer exists in pandas 2.x.
    selected_partitions = []
    for f in sorted(glob.glob(file_stem)):

        print('Loading comments from: ', f)
        partition_comments = pd.read_csv(f)
        selected_partitions.append(partition_comments[partition_comments['subreddit'].isin(subs)])

    if not selected_partitions:
        return pd.DataFrame()
    return pd.concat(selected_partitions, ignore_index=True)


def build_model_input(pandas_df, commentfield, post_length, sent_length, tokenizer=TweetTokenizer()):
    """Turn a text column into a one-column frame of cleaned sentences.

    Args:
        pandas_df: frame containing the text column.
        commentfield: name of the text column; also the name of the
            single column in the returned frame.
        post_length: maximum number of sentences a row may contain; rows
            with more sentences are dropped entirely.
        sent_length: maximum number of tokens a sentence may contain
            after tokenization and canonicalization; longer sentences are
            dropped.
        tokenizer: word tokenizer forwarded to `make_data`.

    Returns:
        DataFrame with one row per kept sentence; each value is the
        sentence's canonicalized tokens joined by single spaces.
    """
    # Sentence-split each non-null row. Only the sentences are needed here,
    # so `sent_tokenize` is called directly rather than going through
    # `sent_plus_word_tokenize`, whose word-token output was discarded.
    kept_sentences = []
    for text in pandas_df[commentfield].dropna().values:
        text_sentences = sent_tokenize(text)
        if len(text_sentences) <= post_length:
            kept_sentences.extend(text_sentences)

    sents_pd = pd.DataFrame({commentfield: kept_sentences})

    comments, x_tokens = make_data(sents_pd, commentfield=commentfield, canonize=True, stem=False, tokenizer=tokenizer)

    tokens = [sent for sent in x_tokens if len(sent) <= sent_length]

    raw_list = [xtoken_to_raw(sent) for sent in tokens]

    return pd.DataFrame({commentfield: raw_list})

## Load Data & Transform

In [3]:

# Load the tab-separated, header-less sentence files for the normal and simple
# corpora, dropping rows that have no sentence text.
wiki_normal, wiki_simple = (
    pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=['article_title', 'paragraph_number', 'sentence_text'],
    ).dropna(subset=['sentence_text'])
    for path in ('data/normal.txt', 'data/simple.txt')
)

# Preview and non-null counts for each corpus
print(wiki_normal.head(), wiki_normal.count(), sep='\n')
print(wiki_simple.head(), wiki_simple.count(), sep='\n')

  article_title  paragraph_number  \
0         April                 0   
1         April                 0   
2         April                 0   
3         April                 0   
4         April                 0   

                                       sentence_text  
0  April is the fourth month of the year in the G...  
1  April was originally the second month of the R...  
2  It became the fourth month of the calendar yea...  
3  The derivation of the name -LRB- Latin Aprilis...  
4  The traditional etymology is from the Latin ap...  
article_title       3851440
paragraph_number    3851440
sentence_text       3851440
dtype: int64
  article_title  paragraph_number  \
0         April                 0   
1         April                 0   
2         April                 0   
3         April                 0   
4         April                 0   

                                       sentence_text  
0            April is the fourth month of the year .  
1                

In [13]:
# Draw a fixed-size random subset of article titles (without replacement) and
# keep only the sentences of those articles in both corpora. The generator is
# seeded so that re-running the notebook selects the same articles.
SAMPLE_SEED = 42
N_SAMPLED_ARTICLES = 30000

sampled_articles = np.random.RandomState(SAMPLE_SEED).choice(
    wiki_normal['article_title'].unique(), size=N_SAMPLED_ARTICLES, replace=False)

wiki_normal_sampled = wiki_normal.loc[wiki_normal['article_title'].isin(sampled_articles)]
wiki_simple_sampled = wiki_simple.loc[wiki_simple['article_title'].isin(sampled_articles)]

print(wiki_normal_sampled.head())
print(wiki_normal_sampled.count())
print(wiki_simple_sampled.head())
print(wiki_simple_sampled.count())

   article_title  paragraph_number  \
29           Art                 0   
30           Art                 0   
31           Art                 0   
32           Art                 1   
33           Art                 1   

                                        sentence_text  
29  Art is the product or process of deliberately ...  
30  It encompasses a diverse range of human activi...  
31  The meaning of art is explored in a branch of ...  
32  Traditionally , the term art was used to refer...  
33  This conception changed during the Romantic pe...  
article_title       1929002
paragraph_number    1929002
sentence_text       1929002
dtype: int64
   article_title  paragraph_number  \
22           Art                 0   
23           Art                 0   
24           Art                 0   
25           Art                 0   
26           Art                 0   

                                        sentence_text  
22  The word art is used to describe some activiti...

In [15]:
# Clean the sampled normal-corpus sentences into model input
wiki_normal_model_input = build_model_input(
    wiki_normal_sampled,
    'sentence_text',
    post_length=100,
    sent_length=1000,
)

print(wiki_normal_model_input.head(), wiki_normal_model_input.count(), sep='\n')

                                       sentence_text
0  art is the product or process of deliberately ...
1  it encompasses a diverse range of human activi...
2  the meaning of art is explored in a branch of ...
3  traditionally , the term art was used to refer...
4  this conception changed during the romantic pe...
sentence_text    1969307
dtype: int64


In [16]:
# Clean the sampled simple-corpus sentences into model input
wiki_simple_model_input = build_model_input(
    wiki_simple_sampled,
    'sentence_text',
    post_length=100,
    sent_length=1000,
)

print(wiki_simple_model_input.head(), wiki_simple_model_input.count(), sep='\n')

                                       sentence_text
0  the word art is used to describe some activiti...
1  therefore , art is made when a human expresses...
2  some art is useful in a practical sense , such...
3        many people disagree on how to define art .
4  many people say people are driven to make art ...
sentence_text    257454
dtype: int64


In [17]:
# Write one cleaned sentence per line (no header, no index).
# Create the output directory first: to_csv raises if it does not exist.
import os
os.makedirs('data/model_data', exist_ok=True)

wiki_normal_model_input.to_csv('data/model_data/wiki_normal.train.0', sep='\t', index=False, header=False)
wiki_simple_model_input.to_csv('data/model_data/wiki_simple.train.0', sep='\t', index=False, header=False)

In [19]:
# Secondary set of wikipedia data, filtered to well-matched sentences (https://github.com/senisioi/NeuralTextSimplification)
import os

# Each file is read as a single 'sentence_text' column; empty rows are dropped.
wiki2_normal = pd.read_csv('data/train_normal_matched.txt', sep='\t', header = None, names = ['sentence_text']).dropna(subset=['sentence_text'])
wiki2_simple = pd.read_csv('data/train_simple_matched.txt', sep='\t', header = None, names = ['sentence_text']).dropna(subset=['sentence_text'])

print(wiki2_normal.head())
print(wiki2_normal.count())
print(wiki2_simple.head())
print(wiki2_simple.count())

wiki2_normal_model_input = build_model_input(pandas_df=wiki2_normal, commentfield='sentence_text', post_length=100, sent_length=1000)

print(wiki2_normal_model_input.head())
print(wiki2_normal_model_input.count())

wiki2_simple_model_input = build_model_input(pandas_df=wiki2_simple, commentfield='sentence_text', post_length=100, sent_length=1000)

print(wiki2_simple_model_input.head())
print(wiki2_simple_model_input.count())

# Create the output directory first: to_csv raises if it does not exist.
os.makedirs('data/model_data', exist_ok=True)

wiki2_normal_model_input.to_csv('data/model_data/wiki_normal_matched.train.0', sep='\t', index=False, header=False)
wiki2_simple_model_input.to_csv('data/model_data/wiki_simple_matched.train.0', sep='\t', index=False, header=False)

                                       sentence_text
0                                   He died in 999 .
1  Gingerbread was brought to Europe in 992 by th...
2  It was the custom to bake white biscuits and p...
3  During the 13th century , gingerbread was brou...
4  It then referred to a confection made with hon...
sentence_text    284677
dtype: int64
                                       sentence_text
0                                   He died in 999 .
1  Armenian monk Gregory of Nicopolis ( Gregory M...
2  The custom was to bake white biscuits and pain...
3  German immigrants brought it to Sweden during ...
4  After , it was a confection made with honey an...
sentence_text    284677
dtype: int64
                                       sentence_text
0                                    he died in DG .
1  gingerbread was brought to europe in DGDG by t...
2  it was the custom to bake white biscuits and p...
3  during the 13th century , gingerbread was brou...
4  it then referred to a 