## Using Grammar and Word Lists

It would be nice to clean up a dataset which contains grammatically incorrect language and make it seem more professional. 

To do this we will experiment with numerous open source libraries to determine their efficacy. 

We also want to see if we can replace misspelled words and expand contrations into their correct forms.

Another eventual task will be to determine the 'quality' of a given sentence.

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from pprint import pprint

%matplotlib inline

### Gather Data Files

In [2]:
DATA_ROOT = '/home/brandon/terabyte/Datasets/reddit'
# Determine if this directory exists, if not use Ivan's directory.
if (os.path.isdir(DATA_ROOT)):
    pass
else:
    DATA_ROOT = '/Users/ivan/Documents/sp_17/reddit_data'
DATA_YEARS = ['2007'] #$, '2008']
# Use os.path.join; it will figure out the '/' in between.
RAW_DATA_FILES = [os.listdir(os.path.join(DATA_ROOT, 'raw_data', year)) for year in DATA_YEARS]

RAW_DATA_ABS_FILES = []
# Always work with full pathnames to be safe.
for i in range(len(DATA_YEARS)):
    for j in range(len(RAW_DATA_FILES[i])):
        if RAW_DATA_FILES[i][j].startswith('.'):
            pass
        else:
            RAW_DATA_ABS_FILES.append( os.path.join(DATA_ROOT, 'raw_data' , DATA_YEARS[i], RAW_DATA_FILES[i][j]))
RAW_DATA_FILES = RAW_DATA_ABS_FILES
pprint(RAW_DATA_FILES)

['/home/brandon/terabyte/Datasets/reddit/raw_data/2007/RC_2007-10',
 '/home/brandon/terabyte/Datasets/reddit/raw_data/2007/RC_2007-11',
 '/home/brandon/terabyte/Datasets/reddit/raw_data/2007/RC_2007-12']


### Load Data

In [3]:
def load_data():
    df = pd.read_json(RAW_DATA_FILES[0], lines=True)
    init_num_rows = len(df)
    print("Number of lines in raw data file", init_num_rows)
    pprint("Column names from raw data file:")
    pprint(df.columns)
    return df

In [4]:
def load_data():
    print(RAW_DATA_FILES[0])
    df = pd.read_json(RAW_DATA_FILES[0], lines=True)
    for i in range(len(RAW_DATA_FILES) - 1):
        df = df.append(pd.read_json(RAW_DATA_FILES[i+1], lines=True), ignore_index=True)
    init_num_rows = len(df)
    print("Number of lines in raw data file", init_num_rows)
    pprint("Column names from raw data file:")
    pprint(df.columns)
    return df

In [5]:
df = load_data()

/home/brandon/terabyte/Datasets/reddit/raw_data/2007/RC_2007-10
Number of lines in raw data file 886802
'Column names from raw data file:'
Index(['archived', 'author', 'author_flair_css_class', 'author_flair_text',
       'body', 'controversiality', 'created_utc', 'distinguished', 'downs',
       'edited', 'gilded', 'id', 'link_id', 'name', 'parent_id',
       'retrieved_on', 'score', 'score_hidden', 'subreddit', 'subreddit_id',
       'ups'],
      dtype='object')


In [6]:
def show_len_update(df):
    print("Now there are", len(df), "rows.")
    
def root_comments(df):
    '''Build list determining which rows of df are root comments.
    
    Returns: 
        list of length equal to the number of rows in our data frame. 
    '''
    root_value = []
    # Iterate over DataFrame rows as namedtuples, with index value as first element of the tuple.
    for row in df.itertuples():
        root_value.append(row.parent_id == row.link_id)
    return root_value

def random_rows_generator(num_rows_per_print, num_rows_total):
    num_iterations = num_rows_total // num_rows_per_print 
    shuffled_indices = np.arange(num_rows_per_print * num_iterations)
    np.random.shuffle(shuffled_indices)
    for batch in shuffled_indices.reshape(num_iterations, num_rows_per_print):
        yield batch
        
#rand_rows = random_rows_generator(4, len(df))

### Initial Clean up

* Start by removing comments without a body (deleted).
* Remove comments larger than 150 words long.
* Remove unneccesary columns. 
* Add a column determining whether a row is a root comment.

In [7]:
def initial_clean(df):
    df['root'] = root_comments(df)
    df = df[['author', 'body', 'link_id', 'parent_id', 'name', 'root', 'subreddit']]
    df.style.set_properties(subset=['body'], **{'width': '500px'})
    df.style.set_properties(**{'text-align': 'left'})
    show_len_update(df)
    df.head()
    return df

In [8]:
df = initial_clean(df)

Now there are 886802 rows.


In [9]:
df.head()

Unnamed: 0,author,body,link_id,parent_id,name,root,subreddit
0,bostich,test,t3_5yba3,t3_5yba3,t1_c0299an,True,reddit.com
1,igiveyoumylife,much smoother.\r\n\r\nIm just glad reddit is b...,t3_5yba3,t3_5yba3,t1_c0299ao,True,reddit.com
2,Arve,"Can we please deprecate the word ""Ajax"" now? \...",t3_5yba3,t1_c02999p,t1_c0299ap,False,reddit.com
3,[deleted],[deleted],t3_5yba3,t3_5yba3,t1_c0299aq,True,reddit.com
4,gigaquack,"Oh, I see. Fancy schmancy ""submitting....""",t3_5yba3,t1_c0299ah,t1_c0299ar,False,reddit.com


In [10]:
modify_list = [('\r\n', ' '),
               ('\n', ' '),
               ('\r', ' '),
               ('&gt;', ' '),
               ('&lt;', ' '),
               ('/__|\*|\#|(?:\[([^\]]*)\]\([^)]*\))/gm', '[link]'),
               ('https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,}', '[link]'),
               ('\d+', 'NUMBER'),
               ('\[', ''),
               ('\]', ''),
               ('\/\/', ''),
               ('\.\.\.', '. ')
              ]

modify_value = {'\r\n': 1,
               '\n': 1,
               '\r': 1,
               '&gt;': 10,
               '&lt;': 10,
               '/__|\*|\#|(?:\[([^\]]*)\]\([^)]*\))/gm': 100,
               'https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,}': 100,
               '\d+': 1000,
               '\[': 10000,
               '\]': 10000,
               '\/\/': 10000,
               '\.\.\.': 100000
              }

In [11]:
def clean_with_tracking(df):
    df = df.loc[df.body != '[deleted]'].reset_index(drop=True)
    df.style.set_properties(subset=['body'], **{'width': '800px'})
    df['body'] = df['body'].map(lambda s: s.strip().lower())
    
    total_mods = {}
    if 'mods' not in df: 
        df['mods'] = np.zeros(len(df['body']), dtype=int)
    for patrn in modify_list:
        new_df = df['body'].replace({patrn[0]: patrn[1]}, regex=True, inplace=False)
        modifications = list((np.where(new_df.values != df['body'].values))[0])
        df['body'] = new_df
        df['mods'][modifications] += modify_value[patrn[0]]
        total_mods[patrn[0]] = len(modifications)
    return df, total_mods

In [12]:
df,total_mods = clean_with_tracking(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
df.tail()

Unnamed: 0,author,body,link_id,parent_id,name,root,subreddit,mods
707889,rowd149,so. so dinotopia linkcouldlink exist? :d,t3_6482i,t1_c02s8g5,t1_c02s9s1,False,reddit.com,120100
707890,mlietzen,don't you wish americans cared this much about...,t3_648hg,t3_648hg,t1_c02s9s2,True,politics,0
707891,bbqribs,the illegals around here said in interviews th...,t3_647mf,t1_c02s8lb,t1_c02s9s3,False,reddit.com,1000
707892,joyork,for the first time ever on nye i'm alone and s...,t3_648on,t3_648on,t1_c02s9s4,True,reddit.com,1
707893,boredzo,i recently discovered that this doesn't work...,t3_648cg,t3_648cg,t1_c02s9s5,True,programming,21111


## pyEnchant is used to check if this is a real word.

* An issue with this apporach is words that are not english, but are used heavily (e.g. 'reddit')

In [14]:
import enchant
d = enchant.Dict("en_US")

In [15]:
[(d.check(word), word) for word in df.body[1].split()]

[(True, 'much'),
 (True, 'smoother.'),
 (False, 'im'),
 (True, 'just'),
 (True, 'glad'),
 (False, 'reddit'),
 (True, 'is'),
 (False, 'back,'),
 (False, 'linkreddit'),
 (True, 'in'),
 (False, 'mirc'),
 (True, 'was'),
 (True, 'entertaining'),
 (True, 'but'),
 (True, 'i'),
 (True, 'had'),
 (True, 'no'),
 (True, 'idea'),
 (True, 'how'),
 (True, 'addicted'),
 (True, 'i'),
 (True, 'had'),
 (True, 'become.'),
 (True, 'thanks'),
 (True, 'for'),
 (True, 'making'),
 (True, 'the'),
 (True, 'detox'),
 (True, 'somewhat'),
 (True, 'short.')]

In [16]:
import language_check

ImportError: No module named 'language_check'

In [None]:
tool = language_check.LanguageTool('en_US')

In [None]:
matches = tool.check(df.body[1])

In [None]:
tool.disabled.add("UPPERCASE_SENTENCE_START")
tool.disabled.add('I_LOWERCASE')

In [None]:
matches = tool.check(df.body[1])

### Remove comments with more than n words

In [17]:
def remove_large_comments(n, df):
    print("Length before:", df['body'].size)
    df = df[df['body'].map(lambda s: len(s.split(' '))) < n].reset_index(drop=True)
    show_len_update(df)
    return df

### Using the tokenizer from io_utils


In [18]:
import re
_WORD_SPLIT = re.compile("([.,!?\"':;)(])")
_DIGIT_RE   = re.compile(r"\d")

def basic_tokenizer(sentence):
    """Very basic tokenizer: split the sentence into a list of tokens."""
    words = []
    for space_separated_fragment in sentence.strip().split():
        words.extend(_WORD_SPLIT.split(space_separated_fragment))
    return [w for w in words if w]

In [19]:
# 
df['body'] = [basic_tokenizer(sentence) for sentence in df['body']]

In [20]:
df['body']

0                                                    [test]
1         [much, smoother, ., im, just, glad, reddit, is...
2         [can, we, please, deprecate, the, word, ", aja...
3         [oh, ,, i, see, ., fancy, schmancy, ", submitt...
4                                              [testing, .]
5                      [i, like, it, ., one, more, time, .]
6         [try, refreshing, yor, cache, ,, that, worked,...
7                    [k, ., i, lied, ., just, one, more, .]
8         [i, also, wonder, what, the, differences, are, .]
9                                        [so, addictive, .]
10        [i, can, ', t, post, a, story, to, proggit, -,...
11                              [alright, i, ', m, done, .]
12        [is, anyone, else, ', s, ", recommended, ", pa...
13        [ok, ,, i, guess, we, need, to, submit, commen...
14        [i, can, ', t, submit, any, stories, --, even,...
15                [working, fine, with, normal, adblock, .]
16        [can, ', t, see, beta, ., redd

In [21]:
a= [word for sentence in df['body'].values for word in sentence]
a

['test',
 'much',
 'smoother',
 '.',
 'im',
 'just',
 'glad',
 'reddit',
 'is',
 'back',
 ',',
 'linkreddit',
 'in',
 'mirc',
 'was',
 'entertaining',
 'but',
 'i',
 'had',
 'no',
 'idea',
 'how',
 'addicted',
 'i',
 'had',
 'become',
 '.',
 'thanks',
 'for',
 'making',
 'the',
 'detox',
 'somewhat',
 'short',
 '.',
 'can',
 'we',
 'please',
 'deprecate',
 'the',
 'word',
 '"',
 'ajax',
 '"',
 'now',
 '?',
 '(',
 'but',
 'yeah',
 ',',
 'this',
 '_is_',
 'much',
 'nicer',
 ')',
 'oh',
 ',',
 'i',
 'see',
 '.',
 'fancy',
 'schmancy',
 '"',
 'submitting',
 '.',
 '.',
 '"',
 'testing',
 '.',
 'i',
 'like',
 'it',
 '.',
 'one',
 'more',
 'time',
 '.',
 'try',
 'refreshing',
 'yor',
 'cache',
 ',',
 'that',
 'worked',
 'for',
 'me',
 'edit',
 ':',
 'trying',
 'to',
 'edit',
 'k',
 '.',
 'i',
 'lied',
 '.',
 'just',
 'one',
 'more',
 '.',
 'i',
 'also',
 'wonder',
 'what',
 'the',
 'differences',
 'are',
 '.',
 'so',
 'addictive',
 '.',
 'i',
 'can',
 "'",
 't',
 'post',
 'a',
 'story',
 'to'

In [22]:
#source : http://stackoverflow.com/questions/33093809/count-the-frequency-of-elements-in-list-of-lists-in-python/33093930
from itertools import chain
from collections import Counter
a= [word for sentence in df['body'].values for word in sentence]
word_freq = Counter(chain(a))
sorted_word_freq = sorted(word_freq.items(), key=lambda x: -x[1])

### Number of words used

In [23]:
sum([value for key, value in word_freq.items()])

32140520

### Number of 'valid' words

In [24]:
sum([value for key, value in word_freq.items() if d.check(key)])

27477766

In [25]:
show_len_update(df)

Now there are 707894 rows.


### We want to know how many sentences we have if we remove all senteces with invalid words.

In [26]:
def invalid_word(df):
    '''Goes through the content and determines whether an invalid word is 
    present.
    
    The data frame should provide a body field which will be inspected.
    '''
    d = enchant.Dict("en_US") 
    valid_sentences = [True] * len(df)
    misspelled_words = {}
     
    for idx, sentence in enumerate(df['body'].values):
        for word in sentence:
            if not d.check(word):
                if word in misspelled_words:
                    misspelled_words[word] += 1
                else:
                    misspelled_words[word] = 1
                valid_sentences[idx] = False
    print("There are %i valid sentences out of %i." % (sum(valid_sentences), len(valid_sentences)))
    print("There are %i misspelled words." % len(misspelled_words))
    return valid_sentences, misspelled_words

In [27]:
valid_sent, misspelled = invalid_word(df)

There are 69873 valid sentences out of 707894.
There are 187203 misspelled words.


In [28]:
print('Total number of valid sentences using basic english dictionary:')
print(sum(valid_sent))


Total number of valid sentences using basic english dictionary:
69873


In [29]:
df['spell'] = valid_sent

In [30]:
# We use this list to append some words to our dictionary.
sorted_mispelled_words = sorted(misspelled.items(), key=lambda x: -x[1])

### We write to a text-file the 1000 most common misspelled words

In [31]:
MISSPELLED_WORDS = '/Users/ivan/Desktop/mywords.txt'
f = open(MISSPELLED_WORDS, 'w')
for word in sorted_mispelled_words[:1000]:
    f.write(word[0] + "\n")
f.close()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/ivan/Desktop/mywords.txt'

In [None]:
sorted_mispelled_words[:5000]

### Try and replace contractions

In [None]:
def reset(df):
    df = load_data()
    df = initial_clean(df)
    df,total_mods = clean_with_tracking(df)
    df = remove_large_comments(60, df)
    df = contraction_replacer(df)
    return df

In [None]:
-

In [None]:
def contraction_replacer(df):
    for patrn in contractions.items():
        df['body'].replace({patrn[0]: patrn[1]}, regex=True, inplace=True)    
    return df

In [None]:
df = contraction_replacer(df)

In [None]:
df['body'] = [basic_tokenizer(sentence) for sentence in df['body']]

In [None]:
show_len_update(df)

In [None]:
valid_sentences, misspelled_words = invalid_word(df)

In [None]:
sorted_misspelled_words = sorted(misspelled_words.items(), key=lambda x: -x[1])

In [None]:
sorted_misspelled_words[]

In [None]:
MISSPELLED_WORDS = '/Users/ivan/Desktop/mywords.txt'
f = open(MISSPELLED_WORDS, 'w')
for word in sorted_mispelled_words[:10000]:
    f.write(word[0] + "\n")
f.close()

In [None]:
def invalid_word_modified(df):
    '''Goes through the content and determines whether an invalid word is 
    present.
    
    The data frame should provide a body field which will be inspected.
    '''
    d = enchant.DictWithPWL("en_US", MISSPELLED_WORDS)
    valid_sentences = [True] * len(df)
    misspelled_words = {}
     
    for idx, sentence in enumerate(df['body'].values):
        for word in sentence:
            if not d.check(word):
                if word in misspelled_words:
                    misspelled_words[word] += 1
                else:
                    misspelled_words[word] = 1
                valid_sentences[idx] = False
    print("There are %i valid sentences out of %i." % (sum(valid_sentences), len(valid_sentences)))
    print("There are %i misspelled words." % len(misspelled_words))
    return valid_sentences, misspelled_words

In [None]:
valid_sentences, misspelled_words = invalid_word_modified(df)
sorted_misspelled_words = sorted(misspelled_words.items(), key=lambda x: -x[1])

##### By adding 10000 extra words (not in the original dictionary) we only see 5000 more valid sentences

* Define a threshold which keeps sentences with many of the misspelled words
* For words that are in the original dictionary add 0 points. 
* Words that are not in the original dictionary add the inverse of the number of occurences in the corpus
* We then normalize to the length of the sentence.

In [None]:
df = reset(df)

In [None]:
sentences = [basic_tokenizer(sentence) for sentence in df['body']]
words= [word for sentence in sentences for word in sentence]

In [None]:
word_freq = Counter(chain(words))
sorted_word_freq = sorted(word_freq.items(), key=lambda x: -x[1])

In [None]:
def sentence_score(sentence):
    d = enchant.Dict('en_US')
    word_count = len(sentence)
    score = 0
    for word in sentence:
        if not d.check(word):
            try: 
                score = score + 1.0/word_freq[word]
            except ZeroDivisionError:
                score = score + 1.0
    try:
        return score / word_count
    except ZeroDivisionError:
        return 1

In [None]:
def add_sentence_scores(df):
    scores = []
    pbar = ProgressBar()
    for sentence in pbar(df.body):
        scores.append(sentence_score(basic_tokenizer(sentence)))
    df['score'] = scores

In [None]:
from progressbar import ProgressBar

In [None]:
add_sentence_scores(df)

In [None]:
# A plot which displays the distribution of "penalty score" of a sentence. 
plt.hist(df.score.values, bins=500)
plt.xlim(0, 0.2)

In [None]:
len(df.loc[df.score < 0.005])

In [None]:
def load_data():
    df = pd.read_json(RAW_DATA_FILES[0], lines=True)
    df2 = pd.read_json(RAW_DATA_FILES[1], lines=True)
    df3 = pd.read_json(RAW_DATA_FILES[2], lines=True)
    df = df.append(df2, ignore_index=True)
    df = df.append(df3, ignore_index=True)
    df = df.reset_index(drop=True)
    init_num_rows = len(df)
    print("Number of lines in raw data file", init_num_rows)
    pprint("Column names from raw data file:")
    pprint(df.columns)
    return df

In [None]:
df = reset(df)

In [None]:
len(df)

In [None]:
sentences = [basic_tokenizer(sentence) for sentence in df['body']]
words= [word for sentence in sentences for word in sentence]

In [None]:
word_freq = Counter(chain(words))
sorted_word_freq = sorted(word_freq.items(), key=lambda x: -x[1])

In [None]:
sorted_word_freq[:100]

In [None]:
add_sentence_scores(df)

In [None]:
df.head()

In [None]:
plt.hist(df.score.values, bins=500)
plt.xlim(0, 0.2)

In [None]:
df = df.loc[df.score < 0.005]

In [None]:
## Returns a dictionary with keys being the root comments and values being their immediate children.
## Assumes to have a 'root' column already

## Go through all comments, if it is a root skip it since they wont have a parent_id corresponding
## to a comment.
## 
def children_dict(df):
    children = {}
    for row in df.itertuples():
        if row.root == False:
            if row.parent_id in children.keys():
                children[row.parent_id].append(row.name)
            else:
                children[row.parent_id] = [row.name]
    return children

In [None]:
## Return a dictionary with name being the key and body being the value. 
values_dict = pd.Series(df.body.values, index=df.name).to_dict()

In [None]:
children = children_dict(df)

In [None]:
## Generates two files, [from_file_path] and [to_file_path] of one-to-one comments. 
def generate_files(from_file_path, to_file_path):
    ## Open the files and clear them. 
    from_file = open(from_file_path, 'w')
    to_file = open(to_file_path, 'w')
    from_file.write("")
    to_file.write("")
    from_file.close()
    to_file.close()

    for key in children.keys():
        from_file = open(from_file_path, 'a')
        to_file = open(to_file_path, 'a')

        ## Since we have deleted comments, some comments parents might not exist anymore so we must catch that error.
        for child in children[key]:
            try: 
                from_file.write(values_dict[key].replace('\n', '').replace('\r', ' ').replace('&gt', '') + "\n")
                to_file.write(values_dict[child].replace('\n', '').replace('\r', ' ').replace('&gt', '') + "\n")
            except KeyError:    
                pass
    from_file.close()
    to_file.close()

In [None]:
generate_files("from_file.txt", "to_file.txt")

In [None]:
'dickbutt'