In [3]:
import os
import pandas as pd
import re

def read_data(f_name):
    """Load a CSV from the ``Dataset`` folder under the current working directory.

    Parameters
    ----------
    f_name : str
        File name of the CSV inside ``<cwd>/Dataset``.

    Returns
    -------
    tuple
        ``(sentences, categories)`` where ``sentences`` is a list holding the
        ``clean_str``-normalised values of the "Sentence" column and
        ``categories`` is the "Category" column as returned by pandas.
    """
    csv_path = os.path.join(os.getcwd(), "Dataset", f_name)
    frame = pd.read_csv(csv_path)
    cleaned = list(map(clean_str, frame["Sentence"]))
    return cleaned, frame["Category"]

def clean_str(string):
    """
    Normalise a raw sentence into a lower-cased, space-separated form.

    Characters other than letters, digits and , ! ? ' ` . become spaces,
    an ellipsis and common English clitics ('s, 've, n't, 're, 'd, 'll) are
    split off with a leading space, and , ! ? are padded with spaces.
    Runs of whitespace are then collapsed and the result is stripped and
    lower-cased.
    """
    # Order matters: later rules operate on the output of earlier ones.
    substitutions = (
        (r"[^A-Za-z0-9,!?\'\`\.]", " "),
        (r"\.{3}", " ..."),
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
    text = string
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip().lower()

def Tokenizer(tokenizer, clean_string=True): # whitespace, regex, spacy, nltk
    """Build a tokenizing function for the requested backend.

    Parameters
    ----------
    tokenizer : str
        Backend name (case-insensitive): 'whitespace', 'regex', 'spacy'
        or 'nltk'.
    clean_string : bool, default True
        When true, every input string is passed through ``clean_str``
        before it is tokenized.

    Returns
    -------
    callable
        A function that takes one string and returns its tokens.

    Raises
    ------
    ValueError
        If ``tokenizer`` is not one of the supported backend names.
    """
    backend = tokenizer.lower()

    if backend == 'whitespace':
        # Split on runs of whitespace.
        def tokenize(string):
            return string.strip().split()

    elif backend == 'regex':
        # Alternatives, in order: runs of 2+ capitals not followed by a
        # lowercase letter, a Capitalised word directly followed by another
        # capital, or any run of word characters, apostrophes and hyphens.
        pattern = r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+"
        compiled = re.compile(pattern)

        def tokenize(string):
            return compiled.findall(string)

    elif backend == 'spacy':
        # Imported lazily so spaCy is only required when this backend is used.
        import spacy
        try:
            nlp = spacy.load('en')
        except OSError:
            # NOTE(review): spaCy v3 dropped the 'en' shortcut; fall back to
            # the full package name. Confirm this is the intended model.
            nlp = spacy.load('en_core_web_sm')

        def tokenize(string):
            return [token.text for token in nlp(string)]

    elif backend == 'nltk':
        # Imported lazily so NLTK is only required when this backend is used.
        from nltk import word_tokenize
        tokenize = word_tokenize

    else:
        raise ValueError(
            "Unknown tokenizer %r; expected one of "
            "'whitespace', 'regex', 'spacy', 'nltk'" % (tokenizer,)
        )

    if not clean_string:
        return tokenize

    def clean_and_tokenize(string):
        # Normalise first so every backend sees the same cleaned text.
        return tokenize(clean_str(string))

    return clean_and_tokenize

In [4]:
import preprocessing as prep
from preprocessing import Tokenizer

# Load the training split: sentences and their category labels.
sents, cates = read_data("train_final.csv")
# NOTE(review): `Tokenizer` here is the name imported from `preprocessing`
# just above, which shadows any same-named function defined earlier in the
# notebook — confirm that is the intended implementation.
tokenizer = Tokenizer("regex")

# Sanity check: print the tokens produced for the first five sentences.
for sent in sents[:5]:
    print(tokenizer(sent))


Loading regex tokenizer
['lrb', 'the', 'film', 'rrb', 'tackles', 'the', 'topic', 'of', 'relationships', 'in', 'such', 'a', 'straightforward', 'emotionally', 'honest', 'manner', 'that', 'by', 'the', 'end', 'it', "'s", 'impossible', 'to', 'ascertain', 'whether', 'the', 'film', 'is', 'at', 'its', 'core', 'deeply', 'pessimistic', 'or', 'quietly', 'hopeful']
['lavishly', 'exhilaratingly', 'tasteless']
['it', 'is', 'also', 'beautifully', 'acted']
['but', 'like', 'silence', 'it', "'s", 'a', 'movie', 'that', 'gets', 'under', 'your', 'skin']
['it', "'s", 'been', 'made', 'with', 'an', 'innocent', 'yet', 'fervid', 'conviction', 'that', 'our', 'hollywood', 'has', 'all', 'but', 'lost']
