In [33]:
import re
import nltk
import numpy as np
import torch
from nltk.corpus import stopwords
from processor import create_edited_sentences, lookup_glove
import codecs
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK stopwords corpus; remove_stopwords() reads it through
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kapilan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [34]:
def hyphen_tokenizer(data):
    """Lower-case each sentence and split it on single spaces and hyphens.

    Args:
        data: Iterable of sentence strings.

    Returns:
        A ``(vocabulary, tokenized_corpus)`` tuple. ``tokenized_corpus`` is a
        list holding one token list per input sentence, and ``vocabulary`` is
        the list of distinct tokens in order of first appearance.
    """
    tokenized_corpus = []
    for sentence in data:
        tokenized_sentence = []
        # Splitting on a literal ' ' (not arbitrary whitespace) means that
        # consecutive spaces produce empty-string tokens, which are kept.
        for token in sentence.lower().split(' '):
            tokenized_sentence.extend(token.split('-'))
        tokenized_corpus.append(tokenized_sentence)

    # dict.fromkeys de-duplicates in first-seen order in linear time, instead
    # of scanning a growing list for every token.
    vocabulary = list(dict.fromkeys(
        token for sentence in tokenized_corpus for token in sentence
    ))
    return vocabulary, tokenized_corpus

In [35]:
def basic_tokenizer(data):
    """Lower-case each sentence and split it on single spaces.

    Args:
        data: Iterable of sentence strings.

    Returns:
        A ``(vocabulary, tokenized_corpus)`` tuple. ``tokenized_corpus`` is a
        list holding one token list per input sentence, and ``vocabulary`` is
        the list of distinct tokens in order of first appearance.
    """
    # Splitting on a literal ' ' (not arbitrary whitespace) means that
    # consecutive spaces produce empty-string tokens, which are kept.
    tokenized_corpus = [sentence.lower().split(' ') for sentence in data]

    # dict.fromkeys de-duplicates in first-seen order in linear time, instead
    # of scanning a growing list for every token.
    vocabulary = list(dict.fromkeys(
        token for sentence in tokenized_corpus for token in sentence
    ))
    return vocabulary, tokenized_corpus

In [36]:
def hyphen_tokenizer_no_numbers(data):
    """Lower-case, split on single spaces and hyphens, and strip digits.

    Digit runs are removed from every token, so a purely numeric token
    becomes an empty string; such empty tokens are kept.

    Args:
        data: Iterable of sentence strings.

    Returns:
        A ``(vocabulary, tokenized_corpus)`` tuple. ``tokenized_corpus`` is a
        list holding one token list per input sentence, and ``vocabulary`` is
        the list of distinct tokens in order of first appearance.
    """
    tokenized_corpus = []
    for sentence in data:
        tokenized_sentence = []
        for token in sentence.lower().split(' '):
            for tok in token.split('-'):
                # Raw string: a plain "\d" literal is an invalid escape
                # sequence and triggers a warning on current Pythons.
                tokenized_sentence.append(re.sub(r"\d+", "", tok))
        tokenized_corpus.append(tokenized_sentence)

    # dict.fromkeys de-duplicates in first-seen order in linear time, instead
    # of scanning a growing list for every token.
    vocabulary = list(dict.fromkeys(
        token for sentence in tokenized_corpus for token in sentence
    ))
    return vocabulary, tokenized_corpus


In [37]:
# Load the task-1 training split and the dev split; the dev split is what the
# rest of this notebook refers to as the "test" data.
train_df = pd.read_csv('data/task-1/train.csv')
test_df = pd.read_csv('data/task-1/dev.csv')

# Pull out the 'original' and 'edit' columns of each split.
training_data = train_df['original']
training_edits = train_df['edit']
test_data = test_df['original']
test_edits = test_df['edit']

# create_edited_sentences comes from the project-local `processor` module;
# presumably it applies each edit to its original sentence -- TODO confirm.
edited_training = pd.Series(create_edited_sentences(training_data, training_edits))
edited_test = pd.Series(create_edited_sentences(test_data, test_edits))

In [43]:
def build_glove_dictionary(embedding_dim):
    """Load the GloVe Twitter vectors of the given dimensionality into a dict.

    The vectors are read from
    ``model-downloads/glove.twitter.27B/glove.twitter.27B.<dim>d.txt``
    relative to the current working directory.

    Args:
        embedding_dim: Dimensionality used to select the GloVe file.

    Returns:
        Dict mapping each word (first whitespace-separated field of a line) to
        a NumPy float array built from the remaining fields. Lines with three
        or fewer fields are skipped; if a word occurs more than once, the last
        occurrence wins.

    Raises:
        ValueError: If a retained line has a non-numeric value field.
    """
    word2embedding = {}
    with codecs.open('model-downloads/glove.twitter.27B/glove.twitter.27B.{}d.txt'.format(embedding_dim), 'r', 'utf-8') as f:
        # Stream the file line by line; readlines() would hold the entire
        # embedding file in memory on top of the dictionary being built.
        for line in f:
            # Split once and reuse the fields for the length check, the word
            # and the vector.
            fields = line.strip().split()
            if len(fields) > 3:
                word2embedding[fields[0]] = np.array([float(value) for value in fields[1:]])

    return word2embedding

def build_embedding_tensor(vocab, embedding_dim=50):
    """Assemble a float32 embedding matrix for ``vocab`` from GloVe vectors.

    Args:
        vocab: Sequence of vocabulary words; word ``i`` is stored in row
            ``i + 1`` of the matrix, and row 0 is left as zeros.
        embedding_dim: Dimensionality of the GloVe vectors to load.

    Returns:
        A ``(tensor, count)`` tuple: the ``(len(vocab) + 1, embedding_dim)``
        float32 torch tensor, and the sum of the second value returned by
        ``lookup_glove`` for each word (reported as the number of words not
        found in GloVe).
    """
    embedding_matrix = np.zeros((len(vocab) + 1, embedding_dim))
    glove_lookup = build_glove_dictionary(embedding_dim)

    missing_total = 0
    # Start at row 1 so that row 0 stays all-zero.
    for row, word in enumerate(vocab, start=1):
        vector, miss_flag = lookup_glove(glove_lookup, word, embedding_dim)
        embedding_matrix[row] = vector
        missing_total += miss_flag

    print("Number of words not in GloVe: {}".format(missing_total), flush=True)

    embedding_tensor = torch.from_numpy(embedding_matrix).type(torch.float32)
    return embedding_tensor, missing_total

In [44]:
# Shared NLTK instances: `porter` is used by stemming() and
# `wordnet_lemmatizer` by lemmatize(), both defined below.
porter = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

def stemming(data):
    """Tokenize sentences, strip digits and Porter-stem every token.

    Each sentence is lower-cased and split on single spaces and hyphens;
    digit runs are removed from each token before it is passed to the
    module-level ``porter`` stemmer. Tokens that end up empty are kept.

    Args:
        data: Iterable of sentence strings.

    Returns:
        A ``(vocabulary, tokenized_corpus)`` tuple. ``tokenized_corpus`` is a
        list holding one list of stemmed tokens per input sentence, and
        ``vocabulary`` is the list of distinct stemmed tokens in order of
        first appearance.
    """
    tokenized_corpus = []
    for sentence in data:
        tokenized_sentence = []
        for token in sentence.lower().split(' '):
            for tok in token.split('-'):
                # Raw string: a plain "\d" literal is an invalid escape
                # sequence and triggers a warning on current Pythons.
                tok = re.sub(r"\d+", "", tok)
                tokenized_sentence.append(porter.stem(tok))
        tokenized_corpus.append(tokenized_sentence)

    # dict.fromkeys de-duplicates in first-seen order in linear time, instead
    # of scanning a growing list for every token.
    vocabulary = list(dict.fromkeys(
        token for sentence in tokenized_corpus for token in sentence
    ))
    return vocabulary, tokenized_corpus

def lemmatize(data):
    """Tokenize sentences, strip digits and lemmatize every token.

    Each sentence is lower-cased and split on single spaces and hyphens;
    digit runs are removed from each token before it is passed to the
    module-level ``wordnet_lemmatizer``. Tokens that end up empty are kept.

    NOTE(review): WordNetLemmatizer needs the NLTK 'wordnet' corpus, while
    the visible setup cell only downloads 'stopwords' -- confirm it is
    fetched elsewhere.

    Args:
        data: Iterable of sentence strings.

    Returns:
        A ``(vocabulary, tokenized_corpus)`` tuple. ``tokenized_corpus`` is a
        list holding one list of lemmatized tokens per input sentence, and
        ``vocabulary`` is the list of distinct lemmatized tokens in order of
        first appearance.
    """
    tokenized_corpus = []
    for sentence in data:
        tokenized_sentence = []
        for token in sentence.lower().split(' '):
            for tok in token.split('-'):
                # Raw string: a plain "\d" literal is an invalid escape
                # sequence and triggers a warning on current Pythons.
                tok = re.sub(r"\d+", "", tok)
                tokenized_sentence.append(wordnet_lemmatizer.lemmatize(tok))
        tokenized_corpus.append(tokenized_sentence)

    # dict.fromkeys de-duplicates in first-seen order in linear time, instead
    # of scanning a growing list for every token.
    vocabulary = list(dict.fromkeys(
        token for sentence in tokenized_corpus for token in sentence
    ))
    return vocabulary, tokenized_corpus

def remove_stopwords(data):
    """Tokenize sentences, drop English stopwords and strip digits.

    Each sentence is lower-cased and split on single spaces and hyphens.
    Tokens found in NLTK's English stopword list are dropped; digit runs are
    then removed from the surviving tokens. Tokens that end up empty are
    kept.

    Args:
        data: Iterable of sentence strings.

    Returns:
        A ``(vocabulary, tokenized_corpus)`` tuple. ``tokenized_corpus`` is a
        list holding one token list per input sentence, and ``vocabulary`` is
        the list of distinct tokens in order of first appearance.
    """
    # Load the stopword list once into a set. Calling stopwords.words() for
    # every token re-reads the corpus and scans a list each time.
    english_stopwords = set(stopwords.words('english'))

    tokenized_corpus = []
    for sentence in data:
        tokenized_sentence = []
        for token in sentence.lower().split(' '):
            for tok in token.split('-'):
                # The stopword check runs before digit stripping, so a token
                # such as "the1" is not treated as a stopword.
                if tok in english_stopwords:
                    continue
                # Raw string: a plain "\d" literal is an invalid escape
                # sequence and triggers a warning on current Pythons.
                tokenized_sentence.append(re.sub(r"\d+", "", tok))
        tokenized_corpus.append(tokenized_sentence)

    # dict.fromkeys de-duplicates in first-seen order in linear time, instead
    # of scanning a growing list for every token.
    vocabulary = list(dict.fromkeys(
        token for sentence in tokenized_corpus for token in sentence
    ))
    return vocabulary, tokenized_corpus

In [45]:
# Build a single vocabulary over the edited train and dev sentences, with
# English stopwords removed.
joint_vocab, joint_tokenized_corpus = remove_stopwords(pd.concat([edited_training, edited_test]))

# Look the vocabulary up in the 100-dimensional GloVe file; only the returned
# count is kept here, the embedding tensor itself is discarded.
_, words_not_in_glove = build_embedding_tensor(joint_vocab, 100)


Word 
Word chibok
Word nagorno
Word manafort
Word mnuchin
Word analytica
Word microsecond
Word tillerson
Word manassian
Word hartzler
Word nattering
Word nabobs
Word doddering
Word dotards
Word deplorableness
Word jives
Word tipline
Word gorsuch
Word scalise
Word ukraines
Word recuses
Word halfhearted
Word iannucci
Word kushners
Word brexit
Word secessionist
Word gravediggers
Word pallbearer
Word delingpole
Word orangeness
Word zinke
Word matzos
Word navajos
Word pocohontas
Word aecon
Word consummated
Word hichilema
Word equivocation
Word canoodles
Word #war
Word mcenany
Word peskov
Word nondisclosure
Word reenactors
Word graveling
Word cyberweapons
Word nightgowns
Word roiled
Word warmbier
Word careening
Word #metoo
Word guantรกnamo
Word scaramucci
Word arbaeen
Word lavishes
Word infowars.com
Word straggled
Word myeshia
Word falzone
Word rescinding
Word reanimating
Word nakedly
Word sympathising
Word scrutinising
Word deripaska
Word npr/ipsos
Word rosenstein
Word higbie
Word roboticis