In [41]:
import pandas as pd
import regex as re
from cleantext import clean
import pyarrow.feather as feather
from multiprocessing import Pool
import gc
import nltk
import itertools
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import *
import time

In [42]:
pattern = re.compile(r"([\d]{1,2}[\/|\-][\d]{1,2}(?:[\/|\-][\d]{2,4})?|[\d]{2,4}[\/|\-][\d]{1,2}[\/|\-][\d]{1,2}|(?:january|february|march|april|may|june|july|august|september|october|november|december)[\s][\d]{1,2}[a-z][a-z](?:\s[\d]{2,4})|[\d][\d]\w?\w?\sof\s(?:january|february|march|april|may|june|july|august|september|october|november|december)(?:\s[\d]{2,4})?|(?:january|february|march|april|may|june|july|august|september|october|november|december)\s\d\d?\w?\w?,?(?:\s\d{2,4})?)")
def clean_string(st):
    s1 = pattern.sub("date",st)
    return clean(s1, lower=True,
                    no_line_breaks=True,
                    no_emails=True,
                    no_urls=True,
                    no_numbers=True,
                    no_punct=True,
                    lang="en",
                    replace_with_number="num",
                    replace_with_email="email",
                    replace_with_url="url")
def clean_dataframe(dataframe):
    start = time.time()
    dataframe['content'] = dataframe['content'].apply(clean_string)
    end = time.time()
    print("cleaning took " + str(end - start) + " seconds")

In [43]:
def remove_english_stopwords(stopwords):
    def remove_stopwords(tokenlist):
        return filter(lambda x : x not in stopwords, tokenlist)
    return remove_stopwords

def stem_tokens():
    stemmer = PorterStemmer()
    def stem_tokenlist(tokenlist):
        return map(stemmer.stem, tokenlist)
    return stem_tokenlist

def tokenize():
    def tokenize_text(s):
        return list((map(nltk.word_tokenize, s)))
    return tokenize_text

def to_list():
    def turn_to_list(it):
        return list(it)
    return turn_to_list

def preprocess(dataframe):
    # tokenize content column
    #print("Tokenizing...")
    start = time.time()
    dataframe['content'] = dataframe['content'].apply(nltk.word_tokenize)
    end = time.time()
    print("tokenizing took " + str(end - start) + " seconds")
    #tokens = list(itertools.chain.from_iterable(dataframe['content']))
    #vocabulary = set(tokens)
    # remove stopwords
    start = time.time()
    dataframe['content'] = dataframe['content'].apply(remove_english_stopwords(stopwords.words('english')))
    end = time.time()
    print("removing stopwords took " + str(end - start) + " seconds")
    #tokens_no_stopwords = list(itertools.chain.from_iterable(dataframe['content']))
    #vocabulary_no_stopwords = set(tokens_no_stopwords)
    #print("Reduction rate of removing stopwords: " + str(1 - len(vocabulary_no_stopwords) / len(vocabulary)))
    # stem tokens
    start = time.time()
    dataframe['content'] = dataframe['content'].apply(stem_tokens())
    end = time.time()
    print("stemming took " + str(end - start) + " seconds")

    start = time.time()
    dataframe['content'] = dataframe['content'].apply(to_list())
    end = time.time()
    print("converting to list took" + str(end - start) + " seconds")

    #tokens_stem = list(itertools.chain.from_iterable(dataframe['content']))
    #print("Stemmed tokens = " + str(tokens_stem))
    #vocabulary_stem = set(tokens_stem)
    #print("Reduction rate of stemming: " + str(1 - len(vocabulary_stem)/len(vocabulary_no_stopwords)))

In [44]:
stem_tokens()(["running", "runs", "running"])

<map at 0x7ff1afd729e0>

In [45]:
pattern = re.compile(r"([\d]{1,2}[\/|\-][\d]{1,2}(?:[\/|\-][\d]{2,4})?|[\d]{2,4}[\/|\-][\d]{1,2}[\/|\-][\d]{1,2}|(?:january|february|march|april|may|june|july|august|september|october|november|december)[\s][\d]{1,2}[a-z][a-z](?:\s[\d]{2,4})|[\d][\d]\w?\w?\sof\s(?:january|february|march|april|may|june|july|august|september|october|november|december)(?:\s[\d]{2,4})?|(?:january|february|march|april|may|june|july|august|september|october|november|december)\s\d\d?\w?\w?,?(?:\s\d{2,4})?)")
stemmer = PorterStemmer()
def process_string(s):
    s1 = pattern.sub("date",s)
    cleaned_string = clean(s1, lower=True,
                no_line_breaks=True,
                no_emails=True,
                no_urls=True,
                no_numbers=True,
                lang="en",
                replace_with_number="num",
                replace_with_email="email",
                replace_with_url="url")
    #print("tokenizing...")
    tokens = nltk.word_tokenize(cleaned_string)
    #print("removing stopwords...")
    tokens_no_stopwords = filter(lambda x : x not in stopwords.words('english'), tokens)
    #print("stemming...")
    stem_tokens = list(map(stemmer.stem, tokens_no_stopwords))
    return stem_tokens

In [48]:
sample = pd.read_csv("sample_preprocessed_no_punct.csv")

In [49]:
sample.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,0.0,11467794,8027,9787368,nytimes.com,reliable,https://query.nytimes.com/gst/fullpage.html?re...,"['kashaaudrey', 'sission', 'lifelong', 'new', ...",2018-02-11 00:48:58.787555,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,"Paid Notice: Deaths KASHA, AUDREY SISSION",,,['KASHA AUDREY SISSION'],"KASHA--Audrey Sission, a lifelong New Yorker, ...",,,nytimes
1,1.0,4591569,8760,3009581,thinkprogress.org,political,https://thinkprogress.org/into-the-valley-of-d...,"['forward', 'light', 'brigadewa', 'man', 'dism...",2017-11-18T20:01:27.400599,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,"Into The Valley Of Death Rode The 600, Into Th...",,,[''],,"#Climate Change, #Climate",,
2,2.0,6162754,2521,4251195,truthandaction.org,bias,http://www.truthandaction.org/woman-thrown-off...,"['woman', 'thrown', 'plane', 'said', 'hillari'...",2017-11-27T01:15:32.269834,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,Woman Thrown Off Plane When She Said Hillary i...,,,[''],,,,
3,3.0,2542246,2019,1769246,ecowatch.com,political,https://www.ecowatch.com/cuban-province-well-o...,"['cuban', 'provinc', 'well', 'way', 'num', 're...",2017-11-10T11:18:44.524042,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,Cuban Province Well on Its Way to 100% Renewab...,"Guest Contributor, Sierra Club, Common Dreams,...",,"['featured', 'renewables', 'business', 'cuba']",President Obama’s recent announcement that he ...,,,
4,4.0,3753783,4806,2429386,weeklystandard.com,political,http://www.weeklystandard.com/print/the-times-...,"['new', 'york', 'time', 'greet', 'deleg', 'fro...",2017-11-13T18:09:27.760857,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,The Times Repeats Itself,To The Scrapbook,,['The Scrapbook'],The New York Times greeted delegates with a fr...,,,


In [53]:
len(sample[sample['type'] == 'reliable'])

191276

In [50]:
file_name = "data/sample_preprocessed.csv"
with open(file_name, "w") as file:
    pass
for chunck in pd.read_csv("data/sample_STRUCTURED.csv", chunksize=10000):
    clean_dataframe(chunck)
    preprocess(chunck)
    start = time.time()
    chunck.to_csv(file_name, mode='a')
    end = time.time()
    print("writing to csv took " + str(end - start) + " seconds")

KeyboardInterrupt: 

In [None]:
sample = pd.read_csv("data/sample_preprocessed.csv", index_col=0)

FileNotFoundError: [Errno 2] No such file or directory: 'data/sample_preprocessed.csv'

In [None]:
sample.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
0,0.0,11467794,8027,9787368,nytimes.com,reliable,https://query.nytimes.com/gst/fullpage.html?re...,"['kasha', '--', 'audrey', 'sission', ',', 'lif...",2018-02-11 00:48:58.787555,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,"Paid Notice: Deaths KASHA, AUDREY SISSION",,,['KASHA AUDREY SISSION'],"KASHA--Audrey Sission, a lifelong New Yorker, ...",,,nytimes
1,1.0,4591569,8760,3009581,thinkprogress.org,political,https://thinkprogress.org/into-the-valley-of-d...,"['``', 'forward', ',', 'light', 'brigad', '!',...",2017-11-18T20:01:27.400599,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,"Into The Valley Of Death Rode The 600, Into Th...",,,[''],,"#Climate Change, #Climate",,
2,2.0,6162754,2521,4251195,truthandaction.org,bias,http://www.truthandaction.org/woman-thrown-off...,"['woman', 'thrown', 'plane', 'said', 'hillari'...",2017-11-27T01:15:32.269834,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,Woman Thrown Off Plane When She Said Hillary i...,,,[''],,,,
3,3.0,2542246,2019,1769246,ecowatch.com,political,https://www.ecowatch.com/cuban-province-well-o...,"['cuban', 'provinc', 'well', 'way', 'num', '%'...",2017-11-10T11:18:44.524042,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,Cuban Province Well on Its Way to 100% Renewab...,"Guest Contributor, Sierra Club, Common Dreams,...",,"['featured', 'renewables', 'business', 'cuba']",President Obama’s recent announcement that he ...,,,
4,4.0,3753783,4806,2429386,weeklystandard.com,political,http://www.weeklystandard.com/print/the-times-...,"['new', 'york', 'time', 'greet', 'deleg', 'fro...",2017-11-13T18:09:27.760857,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,The Times Repeats Itself,To The Scrapbook,,['The Scrapbook'],The New York Times greeted delegates with a fr...,,,


In [None]:
sample.tail()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
929394,929376.0,1658197,7865,849316,dailykos.com,political,https://www.dailykos.com/stories/2017/01/25/16...,"['u.s.', 'naval', 'base', 'guantanamo', 'bay',...",2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Spokesman denies reports that Trump regime pla...,"Backgroundurl Avatar_Large, Nickname, Joined, ...",,[''],,,,
929395,929377.0,880814,6934,690264,ecowatch.com,political,https://www.ecowatch.com/rooftop-solar-provide...,"['cowri', 'collect', 'member', 'particip', 'ti...",2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Rooftop Solar Provides Net Benefits to All Nev...,"Natural Resources Defense Council, Yes, The Co...",,"['energy', 'renewables', 'featured']",The Natural Resources Defense Council (NRDC) a...,,,
929396,929378.0,11467044,7277,9786618,nytimes.com,reliable,https://www.nytimes.com/2016/06/13/sports/hock...,"['murray', ',', 'num', ',', 'ad', ':', '``', '...",2018-02-11 00:48:58.399133,2018-02-11 00:14:20.346838,2018-02-11 00:14:20.346871,Penguins Finish Off Sharks to Win Stanley Cup,David Pollak,,"['Hockey Ice', 'Stanley Cup', 'Playoff Games'...",Pittsburgh won its second Stanley Cup in eight...,,,nytimes
929397,929379.0,6510986,753,4640342,express.co.uk,rumor,https://www.express.co.uk/showbiz/tv-radio/726...,"['num-year-old', 'actor', ',', 'former', 'east...",2017-11-27T01:14:33.570665,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,I’m A Celebrity 2016: Is Larry Lamb joining th...,Rory O'Connor,,[''],LARRY LAMB is the latest celebrity to have agr...,,,
929398,929380.0,6794053,459,4934743,dailykos.com,political,https://www.dailykos.com/news/SameSexBinationa...,"['leandra', 'english', 'file', 'suit', 'seek',...",2017-11-27T01:14:21.395055,2018-02-07 23:39:33.852671,2018-02-07 23:39:33.852696,Daily Kos: SameSexBinationalCouples,"Happy Cog Studios - Http, Www.Happycog.Com, Da...",,[''],,Next,,
