In [1]:
import pandas as pd
import regex as re
from cleantext import clean
import pyarrow.feather as feather
from multiprocessing import Pool
import gc

In [2]:
pattern = re.compile(r"([\d]{1,2}[\/|\-][\d]{1,2}(?:[\/|\-][\d]{2,4})?|[\d]{2,4}[\/|\-][\d]{1,2}[\/|\-][\d]{1,2}|(?:january|february|march|april|may|june|july|august|september|october|november|december)[\s][\d]{1,2}[a-z][a-z](?:\s[\d]{2,4})|[\d][\d]\w?\w?\sof\s(?:january|february|march|april|may|june|july|august|september|october|november|december)(?:\s[\d]{2,4})?|(?:january|february|march|april|may|june|july|august|september|october|november|december)\s\d\d?\w?\w?,?(?:\s\d{2,4})?)")
def clean_string(s):
    s1 = pattern.sub("date",s)
    s2 = clean(s1, lower=True,
                no_line_breaks=True,
                no_emails=True,
                no_urls=True,
                no_numbers=True,
                lang="en",
                replace_with_number="num",
                replace_with_email="email",
                replace_with_url="url")
    return s2
def clean_dataframe(dataframe):
    dataframe['content'] = dataframe['content'].apply(clean_string)

In [3]:
import nltk
import itertools
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import *

def remove_english_stopwords(stopwords):
    def remove_stopwords(tokenlist):
        return list(filter(lambda x : x not in stopwords, tokenlist))
    return remove_stopwords

def stem_tokens():
    stemmer = PorterStemmer()
    def stem_tokenlist(tokenlist):
        return list(map(stemmer.stem, tokenlist))
    return stem_tokenlist


def preprocess(dataframe):
    # tokenize content column
    print("Tokenizing...")
    dataframe['content'] = dataframe['content'].apply(nltk.word_tokenize)
    tokens = list(itertools.chain.from_iterable(dataframe['content']))
    vocabulary = set(tokens)
    # remove stopwords
    print("Removing stopwords...")
    dataframe['content'] = dataframe['content'].apply(remove_english_stopwords(stopwords.words('english')))
    tokens_no_stopwords = list(itertools.chain.from_iterable(dataframe['content']))
    vocabulary_no_stopwords = set(tokens_no_stopwords)
    print("Reduction rate of removing stopwords: " + str(1 - len(vocabulary_no_stopwords) / len(vocabulary)))
    # stem tokens
    print("Stemming...")
    dataframe['content'] = dataframe['content'].apply(stem_tokens())
    tokens_stem = list(itertools.chain.from_iterable(dataframe['content']))
    print("Stemmed tokens = " + str(tokens_stem))
    vocabulary_stem = set(tokens_stem)
    print("Reduction rate of stemming: " + str(1 - len(vocabulary_stem)/len(vocabulary_no_stopwords)))

In [4]:
stem_tokens()(["running", "runs", "running"])

['run', 'run', 'run']

In [5]:
for chunck in pd.read_csv("data/sample_STRUCTURED.csv", chunksize=1000):
    print("cleaning...")
    clean_dataframe(chunck)
    print("preprocessing...")
    preprocess(chunck)
    chunck.to_csv("data/sample_preprocessed.csv", mode='w')
    break

cleaning...
preprocessing...
Tokenizing...
Removing stopwords...
Reduction rate of removing stopwords: 0.004279555681424796
Stemming...
Stemmed tokens = ['kasha', '--', 'audrey', 'sission', ',', 'lifelong', 'new', 'yorker', ',', 'born', 'decemb', 'num', ',', 'num', ',', 'die', 'june', 'num', ',', 'num', '.', 'daughter', 'late', 'helen', 'sisson-cerussi', 'theodor', 'r.', 'sisson', ';', 'mother', 'late', 'matthew', 'p.', 'kasha', ';', 'sister', 'b.', 'peter', 'cerussi', 'maxin', 'l.', 'sisson', 'gloria', 'm.', 'cerussi', '.', 'gravesid', 'servic', 'friday', ',', 'june', 'num', ',', '11am', ',', 'warwick', 'cemeteri', ',', 'warwick', ',', 'ny', '.', 'contribut', 'memori', 'anim', 'welfar', 'group', 'choic', 'would', 'appreci', '.', 'date', '``', 'forward', ',', 'light', 'brigad', '!', '``', 'man', 'dismay', "'d", '?', 'tho', "'", 'soldier', 'knewsom', 'one', 'blunder', "'d", ':', 'make', 'repli', ',', 'reason', ',', '&', 'die', ',', 'valley', 'deathrod', 'six', 'hundr', '.', '--', '``', 