In [1]:
import numpy as np 
import pandas as pd
import string

import nltk
# Fetch the 'punkt' resource for nltk (presumably the model behind
# nltk.tokenize.sent_tokenize, which is used further down); the call
# reports "already up-to-date" when the package is present locally.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aayushmarishi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the blog corpus and lower-case the text column.
# The vectorised `.str.lower()` replaces a row-wise `apply(..., axis=1)`:
# it avoids a Python-level call per row (~123k rows) and propagates
# missing values instead of raising AttributeError on them.
data = pd.read_csv('../Data/blog_over_30.csv')
data['text'] = data['text'].str.lower()
data

Unnamed: 0,text
0,thanks to yahoo!'s toolbar i can ...
1,i had an interesting conversation...
2,somehow coca-cola has a way of su...
3,"if anything, korea is a country o..."
4,take a read of this news article ...
...,...
123317,i ran around cleaning all mor...
123318,we just got back from six fla...
123319,urllink i felt kind of ...
123320,"wow, i love fruit. i read th..."


In [3]:
# Keep only posts whose text contains both "the" and "and" as substrings
# (presumably a cheap English-language heuristic -- note that it also
# matches inside longer words such as "other" or "candy").
# Both masks are built on the full frame and combined once, so no
# boolean mask is ever applied to an already-filtered frame.
# regex=False treats the patterns as plain substrings; na=False makes rows
# with missing text count as non-matching instead of breaking the mask.
has_the = data['text'].str.contains("the", regex=False, na=False)
has_and = data['text'].str.contains("and", regex=False, na=False)
filter_english = data.loc[has_the & has_and].reset_index(drop=True)

In [4]:
# Split every blog post into cleaned sentences

# Exclusive length bounds (in characters); both a lower and an upper limit
# are imposed to make the data more consistent.
MIN_SENTENCE_LEN = 50
MAX_SENTENCE_LEN = 150


def clean_sentences(blog, min_len=MIN_SENTENCE_LEN, max_len=MAX_SENTENCE_LEN):
    """Split one blog post into sentences and keep only the usable ones.

    A sentence is kept when it does not contain 'urllink' and its length,
    measured on the tokenized sentence *before* any trimming, is strictly
    between ``min_len`` and ``max_len``. Kept sentences have leading and
    trailing punctuation and whitespace stripped; punctuation inside the
    sentence is left untouched.

    Args:
        blog: Text of a single blog post.
        min_len: Exclusive lower bound on the raw sentence length.
        max_len: Exclusive upper bound on the raw sentence length.

    Returns:
        A list of cleaned, non-empty sentences in their original order.
    """
    cleaned = []
    # nltk.tokenize.sent_tokenize can split paragraphs into sentences
    # according to common sentence-ending punctuations
    for raw in nltk.tokenize.sent_tokenize(blog):
        if 'urllink' in raw:
            continue
        if not (min_len < len(raw) < max_len):
            continue
        trimmed = raw.strip(string.punctuation).strip()
        # A sentence made only of punctuation/whitespace trims down to ''
        if trimmed:
            cleaned.append(trimmed)
    return cleaned


blogs = filter_english['text'].tolist()
sentence_list = []
for b in blogs:
    sentence_list.extend(clean_sentences(b))

In [5]:
# Report how many sentences survived the filtering
total_sentences = len(sentence_list)
print("There are %d sentences in total" % total_sentences)

There are 762373 sentences in total


### Example: the loop above applied to a single blog post

In [6]:
# Show one raw blog post (index 5) to walk through the steps by hand
example_post = blogs[5]
display(example_post)

"             i surf the english news sites a lot looking for tidbits on korea and how foreigners (like me) view the 'hermit kingdom' but also as a way to keep up with this fast-moving place.  sometimes, though, one needs to check the veracity of the figures put in the papers...especially the local ones.  here are two examples of how the english version of the korea times and that of the joongang ilbo (daily).  the first is pretty straightforward.   urllink korea times  said that 249 people were arrested for forging korean passports, but  urllink joongang ilbo  says just 114 were accused.  huh?  another one:  urllink joongang ilbo  said that s&p is positive on korean banks (a good thing), while the  urllink korea times  said that s&p was a tad worried about the bad loans that banks extended to small and medium-sized firms.  i have no idea why the simple facts seem to be presented so differently...it can't simply be translation, can it?         "

In [7]:
# Break the example post into sentences with nltk's sentence tokenizer
example_text = blogs[5]
sentences = nltk.tokenize.sent_tokenize(example_text)
display(sentences)

["             i surf the english news sites a lot looking for tidbits on korea and how foreigners (like me) view the 'hermit kingdom' but also as a way to keep up with this fast-moving place.",
 'sometimes, though, one needs to check the veracity of the figures put in the papers...especially the local ones.',
 'here are two examples of how the english version of the korea times and that of the joongang ilbo (daily).',
 'the first is pretty straightforward.',
 'urllink korea times  said that 249 people were arrested for forging korean passports, but  urllink joongang ilbo  says just 114 were accused.',
 'huh?',
 'another one:  urllink joongang ilbo  said that s&p is positive on korean banks (a good thing), while the  urllink korea times  said that s&p was a tad worried about the bad loans that banks extended to small and medium-sized firms.',
 "i have no idea why the simple facts seem to be presented so differently...it can't simply be translation, can it?"]

In [8]:
# Length filter + trimming for the example sentences.
# A sentence whose raw length is strictly between 50 and 150 characters has
# its leading/trailing punctuation and whitespace stripped; every other
# sentence is replaced by an empty-string placeholder.
# Note: this walkthrough does not apply the 'urllink' filter, so sentences
# containing 'urllink' can still appear in the result.
def _trim_or_blank(sentence):
    if 50 < len(sentence) < 150:
        return sentence.strip(string.punctuation).strip()
    return ''

sentences = list(map(_trim_or_blank, sentences))
sentences

['',
 'sometimes, though, one needs to check the veracity of the figures put in the papers...especially the local ones',
 'here are two examples of how the english version of the korea times and that of the joongang ilbo (daily',
 '',
 'urllink korea times  said that 249 people were arrested for forging korean passports, but  urllink joongang ilbo  says just 114 were accused',
 '',
 '',
 "i have no idea why the simple facts seem to be presented so differently...it can't simply be translation, can it"]

In [9]:
# Drop the empty-string placeholders, keeping the remaining sentences in order
sentences = [s for s in sentences if s]
sentences

['sometimes, though, one needs to check the veracity of the figures put in the papers...especially the local ones',
 'here are two examples of how the english version of the korea times and that of the joongang ilbo (daily',
 'urllink korea times  said that 249 people were arrested for forging korean passports, but  urllink joongang ilbo  says just 114 were accused',
 "i have no idea why the simple facts seem to be presented so differently...it can't simply be translation, can it"]

### Output

In [10]:
# Wrap the cleaned sentences in a single-column DataFrame (the column keeps
# its default integer label 0) and display it
df = pd.DataFrame(data=sentence_list)
df

Unnamed: 0,0
0,i had an interesting conversation with my dad ...
1,we were talking about where koreans put their ...
2,"invariably, they have a lot of real estate and..."
3,cash would include short term investments unde...
4,"also, all of it is denominated in won which, d..."
...,...
762368,i ended the night with one glass of zinfandel ...
762369,i am quitting this vile habit on my 35th birth...
762370,most days are like this - last week was worse ...
762371,i have a bit of a hard time with peer pressure...


In [13]:
# Draw 10,000 sentences at random and write them to CSV.
# A fixed seed makes the sample reproducible: without `random_state`,
# every run of this cell would write a different file.
SAMPLE_SEED = 42
sample = df.sample(n=10000, random_state=SAMPLE_SEED).reset_index(drop=True)
sample.to_csv('../Data/sample_above30.csv', index=False)

In [14]:
# Draw a second random sample of 5,000 sentences and write it to CSV.
# A fixed seed specific to this cell makes the draw reproducible: without
# `random_state`, every run of this cell would write a different file.
SAMPLE2_SEED = 43
sample3 = df.sample(n=5000, random_state=SAMPLE2_SEED).reset_index(drop=True)
sample3.to_csv('../Data/sample2_above30.csv', index=False)