## Using Grammar and Word Lists

It would be nice to clean up a dataset which contains grammatically incorrect language and make it seem more professional. 

To do this we will experiment with numerous open source libraries to determine their efficacy. 

We also want to see if we can replace misspelled words and expand contrations into their correct forms.

Another eventual task will be to determine the 'quality' of a given sentence.

In [19]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from pprint import pprint

%matplotlib inline

### Gather Data Files

In [20]:
DATA_ROOT = '/home/brandon/terabyte/Datasets/reddit_data' 
# Determine if this directory exists, if not use Ivan's directory.
if (os.path.isdir(DATA_ROOT)):
    pass
else:
    DATA_ROOT = '/Users/ivan/Documents/sp_17/reddit_data'
DATA_YEAR = '2007'
# Use os.path.join; it will figure out the '/' in between.
RAW_DATA_FILES = os.listdir(os.path.join(DATA_ROOT, 'raw_data', DATA_YEAR))
# Always work with full pathnames to be safe.
RAW_DATA_FILES = [os.path.join(DATA_ROOT, 'raw_data', DATA_YEAR, file) for file in RAW_DATA_FILES 
                  if not file.startswith('.')]
pprint(RAW_DATA_FILES)

['/Users/ivan/Documents/sp_17/reddit_data/raw_data/2007/RC_2007-10.json',
 '/Users/ivan/Documents/sp_17/reddit_data/raw_data/2007/RC_2007-11.json',
 '/Users/ivan/Documents/sp_17/reddit_data/raw_data/2007/RC_2007-12.json']


### Load Data

In [21]:
def load_data():
    df = pd.read_json(RAW_DATA_FILES[0], lines=True)
    init_num_rows = len(df)
    print("Number of lines in raw data file", init_num_rows)
    pprint("Column names from raw data file:")
    pprint(df.columns)
    return df

In [22]:
df = load_data()

Number of lines in raw data file 150429
'Column names from raw data file:'
Index(['archived', 'author', 'author_flair_css_class', 'author_flair_text',
       'body', 'controversiality', 'created_utc', 'distinguished', 'downs',
       'edited', 'gilded', 'id', 'link_id', 'name', 'parent_id',
       'retrieved_on', 'score', 'score_hidden', 'subreddit', 'subreddit_id',
       'ups'],
      dtype='object')


In [23]:
def show_len_update(df):
    print("Now there are", len(df), "rows.")
    
def root_comments(df):
    '''Build list determining which rows of df are root comments.
    
    Returns: 
        list of length equal to the number of rows in our data frame. 
    '''
    root_value = []
    # Iterate over DataFrame rows as namedtuples, with index value as first element of the tuple.
    for row in df.itertuples():
        root_value.append(row.parent_id == row.link_id)
    return root_value

def random_rows_generator(num_rows_per_print, num_rows_total):
    num_iterations = num_rows_total // num_rows_per_print 
    shuffled_indices = np.arange(num_rows_per_print * num_iterations)
    np.random.shuffle(shuffled_indices)
    for batch in shuffled_indices.reshape(num_iterations, num_rows_per_print):
        yield batch
        
#rand_rows = random_rows_generator(4, len(df))

### Initial Clean up

* Start by removing comments without a body (deleted).
* Remove comments larger than 150 words long.
* Remove unneccesary columns. 
* Add a column determining whether a row is a root comment.

In [24]:
def initial_clean(df):
    df['root'] = root_comments(df)
    df = df[['author', 'body', 'link_id', 'parent_id', 'name', 'root', 'subreddit']]
    df.style.set_properties(subset=['body'], **{'width': '500px'})
    df.style.set_properties(**{'text-align': 'left'})
    show_len_update(df)
    df.head()
    return df

In [25]:
df = initial_clean(df)

Now there are 150429 rows.


In [26]:
df.head()

Unnamed: 0,author,body,link_id,parent_id,name,root,subreddit
0,bostich,test,t3_5yba3,t3_5yba3,t1_c0299an,True,reddit.com
1,igiveyoumylife,much smoother.\r\n\r\nIm just glad reddit is b...,t3_5yba3,t3_5yba3,t1_c0299ao,True,reddit.com
2,Arve,"Can we please deprecate the word ""Ajax"" now? \...",t3_5yba3,t1_c02999p,t1_c0299ap,False,reddit.com
3,[deleted],[deleted],t3_5yba3,t3_5yba3,t1_c0299aq,True,reddit.com
4,gigaquack,"Oh, I see. Fancy schmancy ""submitting....""",t3_5yba3,t1_c0299ah,t1_c0299ar,False,reddit.com


In [27]:
modify_list = [('\r\n', ' '),
               ('\n', ' '),
               ('\r', ' '),
               ('&gt;', ' '),
               ('&lt;', ' '),
               ('/__|\*|\#|(?:\[([^\]]*)\]\([^)]*\))/gm', '[link]'),
               ('https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,}', '[link]'),
               ('\d+', 'NUMBER'),
               ('\[', ''),
               ('\]', ''),
               ('\/\/', ''),
               ('\.\.\.', '. ')
              ]

modify_value = {'\r\n': 1,
               '\n': 1,
               '\r': 1,
               '&gt;': 10,
               '&lt;': 10,
               '/__|\*|\#|(?:\[([^\]]*)\]\([^)]*\))/gm': 100,
               'https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,}': 100,
               '\d+': 1000,
               '\[': 10000,
               '\]': 10000,
               '\/\/': 10000,
               '\.\.\.': 100000
              }

In [28]:
def clean_with_tracking(df):
    df = df.loc[df.body != '[deleted]'].reset_index(drop=True)
    df.style.set_properties(subset=['body'], **{'width': '800px'})
    df['body'] = df['body'].map(lambda s: s.strip().lower())
    
    total_mods = {}
    if 'mods' not in df: 
        df['mods'] = np.zeros(len(df['body']), dtype=int)
    for patrn in modify_list:
        new_df = df['body'].replace({patrn[0]: patrn[1]}, regex=True, inplace=False)
        modifications = list((np.where(new_df.values != df['body'].values))[0])
        df['body'] = new_df
        df['mods'][modifications] += modify_value[patrn[0]]
        total_mods[patrn[0]] = len(modifications)
    return df, total_mods

In [29]:
df,total_mods = clean_with_tracking(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [30]:
df.tail()

Unnamed: 0,author,body,link_id,parent_id,name,root,subreddit,mods
126315,folderol,muscle memory. or he never knew how to run a...,t3_5zj73,t1_c02cfl4,t1_c02cheo,False,reddit.com,100001
126316,michaelco,man develops delusions wow thats a headline,t3_5zjx2,t3_5zjx2,t1_c02chep,True,reddit.com,0
126317,aletoledo,i have a list of redditers i wish to have cens...,t3_5zk1h,t1_c02che2,t1_c02cher,False,reddit.com,1
126318,Dark-Star,nope. too lazy and don't care for until they o...,t3_5zimk,t1_c02cebw,t1_c02ches,False,politics,120101
126319,M0b1u5,us: we are fucking idiots.,t3_5zep2,t3_5zep2,t1_c02cheu,True,politics,0


## pyEnchant is used to check if this is a real word.

* An issue with this apporach is words that are not english, but are used heavily (e.g. 'reddit')

In [31]:
import enchant
d = enchant.Dict("en_US")

In [32]:
[(d.check(word), word) for word in df.body[1].split()]

[(True, 'much'),
 (True, 'smoother.'),
 (False, 'im'),
 (True, 'just'),
 (True, 'glad'),
 (False, 'reddit'),
 (True, 'is'),
 (False, 'back,'),
 (False, 'linkreddit'),
 (True, 'in'),
 (False, 'mirc'),
 (True, 'was'),
 (True, 'entertaining'),
 (True, 'but'),
 (True, 'i'),
 (True, 'had'),
 (True, 'no'),
 (True, 'idea'),
 (True, 'how'),
 (True, 'addicted'),
 (True, 'i'),
 (True, 'had'),
 (True, 'become.'),
 (True, 'thanks'),
 (True, 'for'),
 (True, 'making'),
 (True, 'the'),
 (True, 'detox'),
 (True, 'somewhat'),
 (True, 'short.')]

In [33]:
import language_check

ModuleNotFoundError: No module named 'language_check'

In [None]:
tool = language_check.LanguageTool('en_US')

In [None]:
matches = tool.check(df.body[1])

In [None]:
tool.disabled.add("UPPERCASE_SENTENCE_START")
tool.disabled.add('I_LOWERCASE')

In [None]:
matches = tool.check(df.body[1])

### Remove comments with more than n words

In [34]:
def remove_large_comments(n, df):
    print("Length before:", df['body'].size)
    df = df[df['body'].map(lambda s: len(s.split(' '))) < n].reset_index(drop=True)
    show_len_update(df)
    return df

### Using the tokenizer from io_utils


In [35]:
import re
_WORD_SPLIT = re.compile("([.,!?\"':;)(])")
_DIGIT_RE   = re.compile(r"\d")

def basic_tokenizer(sentence):
    """Very basic tokenizer: split the sentence into a list of tokens."""
    words = []
    for space_separated_fragment in sentence.strip().split():
        words.extend(_WORD_SPLIT.split(space_separated_fragment))
    return [w for w in words if w]

In [36]:
# 
df['body'] = [basic_tokenizer(sentence) for sentence in df['body']]

In [37]:
df['body']

0                                                    [test]
1         [much, smoother, ., im, just, glad, reddit, is...
2         [can, we, please, deprecate, the, word, ", aja...
3         [oh, ,, i, see, ., fancy, schmancy, ", submitt...
4                                              [testing, .]
5                      [i, like, it, ., one, more, time, .]
6         [try, refreshing, yor, cache, ,, that, worked,...
7                    [k, ., i, lied, ., just, one, more, .]
8         [i, also, wonder, what, the, differences, are, .]
9                                        [so, addictive, .]
10        [i, can, ', t, post, a, story, to, proggit, -,...
11                              [alright, i, ', m, done, .]
12        [is, anyone, else, ', s, ", recommended, ", pa...
13        [ok, ,, i, guess, we, need, to, submit, commen...
14        [i, can, ', t, submit, any, stories, --, even,...
15                [working, fine, with, normal, adblock, .]
16        [can, ', t, see, beta, ., redd

In [38]:
a= [word for sentence in df['body'].values for word in sentence]
a

['test',
 'much',
 'smoother',
 '.',
 'im',
 'just',
 'glad',
 'reddit',
 'is',
 'back',
 ',',
 'linkreddit',
 'in',
 'mirc',
 'was',
 'entertaining',
 'but',
 'i',
 'had',
 'no',
 'idea',
 'how',
 'addicted',
 'i',
 'had',
 'become',
 '.',
 'thanks',
 'for',
 'making',
 'the',
 'detox',
 'somewhat',
 'short',
 '.',
 'can',
 'we',
 'please',
 'deprecate',
 'the',
 'word',
 '"',
 'ajax',
 '"',
 'now',
 '?',
 '(',
 'but',
 'yeah',
 ',',
 'this',
 '_is_',
 'much',
 'nicer',
 ')',
 'oh',
 ',',
 'i',
 'see',
 '.',
 'fancy',
 'schmancy',
 '"',
 'submitting',
 '.',
 '.',
 '"',
 'testing',
 '.',
 'i',
 'like',
 'it',
 '.',
 'one',
 'more',
 'time',
 '.',
 'try',
 'refreshing',
 'yor',
 'cache',
 ',',
 'that',
 'worked',
 'for',
 'me',
 'edit',
 ':',
 'trying',
 'to',
 'edit',
 'k',
 '.',
 'i',
 'lied',
 '.',
 'just',
 'one',
 'more',
 '.',
 'i',
 'also',
 'wonder',
 'what',
 'the',
 'differences',
 'are',
 '.',
 'so',
 'addictive',
 '.',
 'i',
 'can',
 "'",
 't',
 'post',
 'a',
 'story',
 'to'

In [39]:
#source : http://stackoverflow.com/questions/33093809/count-the-frequency-of-elements-in-list-of-lists-in-python/33093930
from itertools import chain
from collections import Counter
a= [word for sentence in df['body'].values for word in sentence]
word_freq = Counter(chain(a))
sorted_word_freq = sorted(word_freq.items(), key=lambda x: -x[1])

### Number of words used

In [40]:
sum([value for key, value in word_freq.items()])

5898318

### Number of 'valid' words

In [41]:
sum([value for key, value in word_freq.items() if d.check(key)])

5020458

In [42]:
show_len_update(df)

Now there are 126320 rows.


### We want to know how many sentences we have if we remove all senteces with invalid words.

In [45]:
def invalid_word(df):
    '''Goes through the content and determines whether an invalid word is 
    present.
    
    The data frame should provide a body field which will be inspected.
    '''
    d = enchant.Dict("en_US") 
    valid_sentences = [True] * len(df)
    misspelled_words = {}
     
    for idx, sentence in enumerate(df['body'].values):
        for word in sentence:
            if not d.check(word):
                if word in misspelled_words:
                    misspelled_words[word] += 1
                else:
                    misspelled_words[word] = 1
                valid_sentences[idx] = False
    print("There are %i valid sentences out of %i." % (sum(valid_sentences), len(valid_sentences)))
    print("There are %i misspelled words." % len(misspelled_words))
    return valid_sentences, misspelled_words

In [46]:
valid_sent, misspelled = invalid_word(df)

There are 11249 valid sentences out of 126320.
There are 63912 misspelled words.


In [47]:
print('Total number of valid sentences using basic english dictionary:')
print(sum(valid_sent))


Total number of valid sentences using basic english dictionary:
11249


In [48]:
df['spell'] = valid_sent

In [49]:
# We use this list to append some words to our dictionary.
sorted_mispelled_words = sorted(misspelled.items(), key=lambda x: -x[1])

### We write to a text-file the 1000 most common misspelled words

In [50]:
MISSPELLED_WORDS = '/Users/ivan/Desktop/mywords.txt'
f = open(MISSPELLED_WORDS, 'w')
for word in sorted_mispelled_words[:1000]:
    f.write(word[0] + "\n")
f.close()

In [51]:
sorted_mispelled_words[:5000]

[(',', 207292),
 ("'", 146616),
 ('"', 59729),
 ('?', 45454),
 ('(', 31024),
 (')', 28141),
 (':', 22230),
 ('!', 22036),
 ('-', 9009),
 (';', 6892),
 ('ve', 6387),
 ('doesn', 6003),
 ('isn', 4228),
 ('didn', 4103),
 ('reddit', 4077),
 ('paul', 3858),
 ('NUMBER%', 2897),
 ('ron', 2669),
 ('$NUMBER', 2139),
 ('wouldn', 2132),
 ('american', 1976),
 ('--', 1975),
 ('aren', 1961),
 ('america', 1762),
 ('iran', 1735),
 ('=', 1652),
 ('wasn', 1614),
 ('iraq', 1536),
 ('internet', 1409),
 ('ok', 1269),
 ('NUMBER/NUMBER', 1217),
 ('google', 1090),
 ('americans', 1080),
 ('israel', 1063),
 ('java', 1028),
 ('linux', 962),
 ('shouldn', 909),
 ('couldn', 850),
 ('english', 833),
 ('&amp', 822),
 ('c++', 819),
 ('NUMBER-NUMBER', 789),
 ('online', 770),
 ('clinton', 707),
 ('kucinich', 700),
 ('christian', 688),
 ('jews', 678),
 ('NUMBERs', 673),
 ('tv', 667),
 ('usa', 650),
 ('hillary', 634),
 ('canada', 630),
 ('dont', 604),
 ('spam', 604),
 ('NUMBERth', 602),
 ('lol', 574),
 ('europe', 570),
 ('

### Try and replace contractions

In [52]:
def reset(df):
    df = load_data()
    df = initial_clean(df)
    df,total_mods = clean_with_tracking(df)
    df = remove_large_comments(60, df)
    df = contraction_replacer(df)
    return df

In [53]:
contractions = { 
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he had",
        "he'd've": "he would have",
        "he'll": "he shall",
        "he'll've": "he shall have",
        "he's": "he has",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how has",
        "I'd": "I had",
        "I'd've": "I would have",
        "I'll": "I shall",
        "I'll've": "I shall have",
        "I'm": "I am",
        "I've": "I have",
        "isn't": "is not",
        "it'd": "it had",
        "it'd've": "it would have",
        "it'll": "it shall",
        "it'll've": "it shall have",
        "it's": "it has",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she had",
        "she'd've": "she would have",
        "she'll": "she shall",
        "she'll've": "she shall have",
        "she's": "she has",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so as",
        "that'd": "that would",
        "that'd've": "that would have",
        "that's": "that has",
        "there'd": "there had",
        "there'd've": "there would have",
        "there's": "there has",
        "they'd": "they had",
        "they'd've": "they would have",
        "they'll": "they shall",
        "they'll've": "they shall have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we had",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what shall",
        "what'll've": "what shall have",
        "what're": "what are",
        "what's": "what has",
        "what've": "what have",
        "when's": "when has",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where has",
        "where've": "where have",
        "who'll": "who shall",
        "who'll've": "who shall have",
        "who's": "who has",
        "who've": "who have",
        "why's": "why has",
        "why've": "why have",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you'd": "you had",
        "you'd've": "you would have",
        "you'll": "you shall",
        "you'll've": "you shall have",
        "you're": "you are",
        "you've": "you have"
        }

In [54]:
def contraction_replacer(df):
    for patrn in contractions.items():
        df['body'].replace({patrn[0]: patrn[1]}, regex=True, inplace=True)    
    return df

In [55]:
df = contraction_replacer(df)

In [56]:
df['body'] = [basic_tokenizer(sentence) for sentence in df['body']]

AttributeError: 'list' object has no attribute 'strip'

In [None]:
show_len_update(df)

In [None]:
valid_sentences, misspelled_words = invalid_word(df)

In [None]:
sorted_misspelled_words = sorted(misspelled_words.items(), key=lambda x: -x[1])

In [None]:
sorted_misspelled_words[]

In [None]:
MISSPELLED_WORDS = '/Users/ivan/Desktop/mywords.txt'
f = open(MISSPELLED_WORDS, 'w')
for word in sorted_mispelled_words[:10000]:
    f.write(word[0] + "\n")
f.close()

In [57]:
def invalid_word_modified(df):
    '''Goes through the content and determines whether an invalid word is 
    present.
    
    The data frame should provide a body field which will be inspected.
    '''
    d = enchant.DictWithPWL("en_US", MISSPELLED_WORDS)
    valid_sentences = [True] * len(df)
    misspelled_words = {}
     
    for idx, sentence in enumerate(df['body'].values):
        for word in sentence:
            if not d.check(word):
                if word in misspelled_words:
                    misspelled_words[word] += 1
                else:
                    misspelled_words[word] = 1
                valid_sentences[idx] = False
    print("There are %i valid sentences out of %i." % (sum(valid_sentences), len(valid_sentences)))
    print("There are %i misspelled words." % len(misspelled_words))
    return valid_sentences, misspelled_words

In [None]:
valid_sentences, misspelled_words = invalid_word_modified(df)
sorted_misspelled_words = sorted(misspelled_words.items(), key=lambda x: -x[1])

##### By adding 10000 extra words (not in the original dictionary) we only see 5000 more valid sentences

* Define a threshold which keeps sentences with many of the misspelled words
* For words that are in the original dictionary add 0 points. 
* Words that are not in the original dictionary add the inverse of the number of occurences in the corpus
* We then normalize to the length of the sentence.

In [58]:
df = reset(df)

Number of lines in raw data file 150429
'Column names from raw data file:'
Index(['archived', 'author', 'author_flair_css_class', 'author_flair_text',
       'body', 'controversiality', 'created_utc', 'distinguished', 'downs',
       'edited', 'gilded', 'id', 'link_id', 'name', 'parent_id',
       'retrieved_on', 'score', 'score_hidden', 'subreddit', 'subreddit_id',
       'ups'],
      dtype='object')
Now there are 150429 rows.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Length before: 126320
Now there are 102363 rows.


In [59]:
sentences = [basic_tokenizer(sentence) for sentence in df['body']]
words= [word for sentence in sentences for word in sentence]

In [60]:
word_freq = Counter(chain(words))
sorted_word_freq = sorted(word_freq.items(), key=lambda x: -x[1])

In [67]:
def sentence_score(sentence):
    d = enchant.Dict('en_US')
    word_count = len(sentence)
    score = 0
    for word in sentence:
        if not d.check(word):
            try: 
                score = score + 1.0/word_freq[word]
            except ZeroDivisionError:
                score = score + 1.0
    try:
        return score / word_count
    except ZeroDivisionError:
        return 1

In [74]:
def add_sentence_scores(df):
    scores = []
    pbar = ProgressBar()
    for sentence in pbar(df.body):
        scores.append(sentence_score(basic_tokenizer(sentence)))
    df['score'] = scores

In [73]:
from progressbar import ProgressBar

In [None]:
add_sentence_scores(df)

 49% (285253 of 580784) |########          | Elapsed Time: 0:20:42 ETA: 0:21:07

In [None]:
# A plot which displays the distribution of "penalty score" of a sentence. 
plt.hist(df.score.values, bins=500)
plt.xlim(0, 0.2)

In [None]:
len(df.loc[df.score < 0.005])

In [61]:
def load_data():
    df = pd.read_json(RAW_DATA_FILES[0], lines=True)
    df2 = pd.read_json(RAW_DATA_FILES[1], lines=True)
    df3 = pd.read_json(RAW_DATA_FILES[2], lines=True)
    df = df.append(df2, ignore_index=True)
    df = df.append(df3, ignore_index=True)
    df = df.reset_index(drop=True)
    init_num_rows = len(df)
    print("Number of lines in raw data file", init_num_rows)
    pprint("Column names from raw data file:")
    pprint(df.columns)
    return df

In [62]:
df = reset(df)

Number of lines in raw data file 886802
'Column names from raw data file:'
Index(['archived', 'author', 'author_flair_css_class', 'author_flair_text',
       'body', 'controversiality', 'created_utc', 'distinguished', 'downs',
       'edited', 'gilded', 'id', 'link_id', 'name', 'parent_id',
       'retrieved_on', 'score', 'score_hidden', 'subreddit', 'subreddit_id',
       'ups'],
      dtype='object')
Now there are 886802

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


 rows.
Length before: 707894
Now there are 580784 rows.


In [63]:
len(df)

580784

In [64]:
sentences = [basic_tokenizer(sentence) for sentence in df['body']]
words= [word for sentence in sentences for word in sentence]

In [65]:
word_freq = Counter(chain(words))
sorted_word_freq = sorted(word_freq.items(), key=lambda x: -x[1])

In [70]:
sorted_word_freq[:100]

[('.', 839911),
 ('the', 465182),
 (',', 431223),
 ('to', 273903),
 ('i', 261061),
 ('a', 259704),
 ('it', 200564),
 ('of', 199422),
 ('is', 189873),
 ('that', 187720),
 ('you', 185582),
 ('not', 185270),
 ('and', 183542),
 ('?', 156249),
 ('"', 140273),
 ('in', 134641),
 ('has', 123191),
 ("'", 115135),
 ('are', 108203),
 ('for', 104891),
 ('!', 100336),
 ('this', 84086),
 ('do', 80716),
 ('have', 75754),
 ('on', 73410),
 ('be', 73168),
 ('they', 72579),
 ('NUMBER', 70331),
 ('but', 63086),
 ('(', 62003),
 ('with', 61780),
 ('was', 61727),
 ('link', 55899),
 ('if', 55680),
 ('what', 53426),
 ('as', 52669),
 ('he', 50438),
 (')', 49593),
 ('just', 48642),
 (':', 48137),
 ('like', 46909),
 ('would', 44452),
 ('or', 43766),
 ('your', 43547),
 ('so', 43248),
 ('about', 42833),
 ('all', 42601),
 ('my', 40120),
 ('people', 39823),
 ('we', 38380),
 ('no', 38339),
 ('at', 38061),
 ('there', 37931),
 ('an', 36721),
 ('can', 34837),
 ('s', 34312),
 ('one', 34204),
 ('from', 32842),
 ('more', 32

In [69]:
add_sentence_scores(df)

KeyboardInterrupt: 

In [None]:
df.head()

In [None]:
plt.hist(df.score.values, bins=500)
plt.xlim(0, 0.2)

In [None]:
len(df.loc[df.score < 0.005])

In [None]:
## Returns a dictionary with keys being the root comments and values being their immediate children.
## Assumes to have a 'root' column already

## Go through all comments, if it is a root skip it since they wont have a parent_id corresponding
## to a comment.
## 
def children_dict(df):
    children = {}
    for row in df.itertuples():
        if row.root == False:
            if row.parent_id in children.keys():
                children[row.parent_id].append(row.name)
            else:
                children[row.parent_id] = [row.name]
    return children

In [None]:
## Return a dictionary with name being the key and body being the value. 
values_dict = pd.Series(df.body.values, index=df.name).to_dict()

In [None]:
children = children_dict(df)

In [None]:
## Generates two files, [from_file_path] and [to_file_path] of one-to-one comments. 
def generate_files(from_file_path, to_file_path):
    ## Open the files and clear them. 
    from_file = open(from_file_path, 'w')
    to_file = open(to_file_path, 'w')
    from_file.write("")
    to_file.write("")
    from_file.close()
    to_file.close()

    for key in children.keys():
        from_file = open(from_file_path, 'a')
        to_file = open(to_file_path, 'a')

        ## Since we have deleted comments, some comments parents might not exist anymore so we must catch that error.
        for child in children[key]:
            try: 
                from_file.write(values_dict[key].replace('\n', '').replace('\r', ' ').replace('&gt', '') + "\n")
                to_file.write(values_dict[child].replace('\n', '').replace('\r', ' ').replace('&gt', '') + "\n")
            except KeyError:    
                pass
    from_file.close()
    to_file.close()

In [None]:
generate_files("from_file.txt", "to_file.txt")