In [1]:
import numpy as np 
import pandas as pd
import string

import nltk
# Fetch the 'punkt' resource; the sentence splitting done later with
# nltk.tokenize.sent_tokenize depends on it. The call is a no-op when the
# package is already up to date locally.
# NOTE(review): newer NLTK releases may require 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aayushmarishi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the raw blog corpus from CSV.
data = pd.read_csv('../Data/blog_2030.csv')
# Lower-case the text with the vectorized string accessor instead of a
# row-wise apply(axis=1): this avoids one Python-level call per row and
# leaves missing values as NaN rather than raising AttributeError on them.
data['text'] = data['text'].str.lower()
data

Unnamed: 0,text
0,"urllink im new to this, ..."
1,election time has rolled aro...
2,"well, i hate to start off no such a sou..."
3,i was a weedy child. this wasn't so muc...
4,i have developed a pain in my chest tha...
...,...
319967,"dear susan, i could write some really ..."
319968,"dear susan, 'i have the second yeast i..."
319969,"dear susan, your 'boyfriend' is fuckin..."
319970,"dear susan: just to clarify, i am as..."


In [3]:
# Rough English-language filter: keep only posts whose text contains both
# "the" and "and". This is plain substring matching, so "the" also matches
# inside words such as "other".
# Both masks are built on the same frame and combined with `&`, instead of
# chaining a second .loc whose mask is indexed by the unfiltered frame.
# regex=False treats the patterns as literals; na=False makes rows with
# missing text count as non-matching rather than producing NaN in the mask.
has_the = data['text'].str.contains("the", regex=False, na=False)
has_and = data['text'].str.contains("and", regex=False, na=False)
filter_english = data.loc[has_the & has_and].reset_index(drop=True)

In [4]:
# Split into sentences

# Length window applied to each raw sentence (both bounds are exclusive).
# A lower and an upper limit are imposed to make the data more consistent.
MIN_SENTENCE_LEN = 50
MAX_SENTENCE_LEN = 150


def clean_sentences(sentences, min_len=MIN_SENTENCE_LEN, max_len=MAX_SENTENCE_LEN):
    """Filter and tidy the sentences of a single blog post.

    A sentence is kept only if it does not contain the word 'urllink' and
    its raw length satisfies ``min_len < len(sentence) < max_len``. Kept
    sentences have leading/trailing punctuation and then surrounding
    whitespace stripped; punctuation inside the sentence is left untouched.
    Sentences that end up empty after stripping are dropped.

    Parameters
    ----------
    sentences : iterable of str
        Sentences as produced by the sentence tokenizer.
    min_len, max_len : int
        Exclusive bounds on the raw (pre-strip) sentence length.

    Returns
    -------
    list of str
        The cleaned sentences, in their original order.
    """
    cleaned = []
    for sentence in sentences:
        if 'urllink' in sentence:
            continue
        # The length check is made on the raw sentence, before stripping
        if not (min_len < len(sentence) < max_len):
            continue
        sentence = sentence.strip(string.punctuation).strip()
        if sentence:
            cleaned.append(sentence)
    return cleaned


sentence_list = []
blogs = filter_english['text'].tolist()
for b in blogs:
    # nltk.tokenize.sent_tokenize can split paragraphs into sentences
    # according to common sentence-ending punctuations
    sentences = clean_sentences(nltk.tokenize.sent_tokenize(b))
    sentence_list += sentences

In [5]:
# Report how many cleaned sentences were collected across all blogs
print("There are %d sentences in total" % len(sentence_list))

There are 1908292 sentences in total


### Example: the loop above applied to a single blog post

In [6]:
# Show one raw blog post (index 5) as the running example
example_blog = blogs[5]
display(example_blog)

"       (it is never good to start on a digression. however)  a digression: you know those conversations that go something along these lines?  -you're in a weird mood. -no i'm not. -yes you are, what's wrong? -i'm not in a weird mood! -don't shout at me!   i hate those.   so, to the point: patriotism. in specific reference to the england football team being knocked out of [whatever tournament it is that's on at the moment] this evening. it was an exciting match, but i'm glad that england lost. note i didn't say 'we'. and why? because those eleven men playing football out there aren't doing it for me; they don't know me, i don't know them, i didn't ask them to play football for me. in no way do they represent me, and certainly not as a result of them being from the same country as me.  ergh. but i mustn't go on so. calm down.   blogger food of the day: uncle ben's express rice. genius! it comes in a packet that stands up in your microwave, it takes two minutes, you don't have to refrige

In [7]:
# Break the example post (index 5) into sentences with NLTK's sentence tokenizer
example_text = blogs[5]
sentences = nltk.tokenize.sent_tokenize(example_text)
display(sentences)

['       (it is never good to start on a digression.',
 'however)  a digression: you know those conversations that go something along these lines?',
 "-you're in a weird mood.",
 "-no i'm not.",
 "-yes you are, what's wrong?",
 "-i'm not in a weird mood!",
 "-don't shout at me!",
 'i hate those.',
 'so, to the point: patriotism.',
 "in specific reference to the england football team being knocked out of [whatever tournament it is that's on at the moment] this evening.",
 "it was an exciting match, but i'm glad that england lost.",
 "note i didn't say 'we'.",
 'and why?',
 "because those eleven men playing football out there aren't doing it for me; they don't know me, i don't know them, i didn't ask them to play football for me.",
 'in no way do they represent me, and certainly not as a result of them being from the same country as me.',
 'ergh.',
 "but i mustn't go on so.",
 'calm down.',
 "blogger food of the day: uncle ben's express rice.",
 'genius!',
 "it comes in a packet that sta

In [8]:
# Blank out every sentence whose length is not strictly between 50 and 150;
# for the rest, trim punctuation and then whitespace from both ends
trimmed = []
for sentence in sentences:
    if 50 < len(sentence) < 150:
        trimmed.append(sentence.strip(string.punctuation).strip())
    else:
        trimmed.append('')
sentences = trimmed
sentences

['',
 'however)  a digression: you know those conversations that go something along these lines',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 "in specific reference to the england football team being knocked out of [whatever tournament it is that's on at the moment] this evening",
 "it was an exciting match, but i'm glad that england lost",
 '',
 '',
 '',
 'in no way do they represent me, and certainly not as a result of them being from the same country as me',
 '',
 '',
 '',
 '',
 '',
 "it comes in a packet that stands up in your microwave, it takes two minutes, you don't have to refrigerate it, and its quite tasty",
 "especially at one in the morning when you're all toasted out",
 "hugs: alex and her talk of bra shopping uncle ben's express rice staplers (a bit random, i know, but they are cool, aren't they",
 '',
 '',
 "the manchester cow parade   slaps: indigestion from eating toast and rice too quickly   (sheesh...)  and no, i'm *not* in a mood...  n"]

In [9]:
# Keep only the non-empty strings, i.e. the sentences that survived the filter
sentences = [s for s in sentences if s]
sentences

['however)  a digression: you know those conversations that go something along these lines',
 "in specific reference to the england football team being knocked out of [whatever tournament it is that's on at the moment] this evening",
 "it was an exciting match, but i'm glad that england lost",
 'in no way do they represent me, and certainly not as a result of them being from the same country as me',
 "it comes in a packet that stands up in your microwave, it takes two minutes, you don't have to refrigerate it, and its quite tasty",
 "especially at one in the morning when you're all toasted out",
 "hugs: alex and her talk of bra shopping uncle ben's express rice staplers (a bit random, i know, but they are cool, aren't they",
 "the manchester cow parade   slaps: indigestion from eating toast and rice too quickly   (sheesh...)  and no, i'm *not* in a mood...  n"]

### Output

In [10]:
# Collect every cleaned sentence into a one-column DataFrame
# (the column keeps pandas' default label, 0)
df = pd.DataFrame(data=sentence_list)
df

Unnamed: 0,0
0,election time has rolled around again
1,and everyone is spitting their venom at each o...
2,instead of pointing your greedy little fingers...
3,"im sick of hearing what he did, how badly they..."
4,i guess it isnt like anyone is bound to believ...
...,...
1908287,i have been getting tired of writing my wacky ...
1908288,then i discovered this wonderful site called '...
1908289,so now i've decided to devote my sardonic song...
1908290,i'll be posting bits and pieces of works in pr...


In [12]:
# Draw 10,000 sentences at random and write them to CSV.
# random_state pins the draw so that re-running the notebook regenerates
# exactly the same file instead of a new random sample each time.
sample = df.sample(n=10000, random_state=2030).reset_index(drop=True)
sample.to_csv('../Data/sample_2030.csv', index=False)

In [13]:
# Draw a separate sample of 5,000 sentences and write it to CSV.
# random_state pins the draw so the file is reproducible across runs. Keep
# this seed distinct from the seed of any other sample taken from `df`:
# with an identical seed, a smaller sample can end up being a subset of a
# larger one rather than an independent draw.
# NOTE(review): the variable is `sample3` but the file is 'sample2_2030.csv'
# - confirm the naming is intentional.
sample3 = df.sample(n=5000, random_state=2031).reset_index(drop=True)
sample3.to_csv('../Data/sample2_2030.csv', index=False)