In [1]:
## Import python library
import pandas as pd
import nltk

## Read file
file_name = "data/train.csv"
## Read file using pandas
df = pd.read_csv(file_name)

In [2]:
# load nltk's English stopwords as variable called 'stop' and don't find synonym of those words.
stop = nltk.corpus.stopwords.words('english')

In [4]:
## Tokenizing sentence into token for finding synonym.
def make_tokenizer(texts):
    from keras.preprocessing.text import Tokenizer
    t = Tokenizer()
    t.fit_on_texts(texts)
    return t

tokenizer = make_tokenizer(df['description'])    ## Message is column name

X = tokenizer.texts_to_sequences(df['description'])

from keras.preprocessing.sequence import pad_sequences
X = pad_sequences(X, 70)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
## Dictionary of word index
index_word = {}
for word in tokenizer.word_index.keys():
    index_word[tokenizer.word_index[word]] = word

In [6]:
## word list
words = [value for key, value in index_word.items()]

In [7]:
## Function to find synonym of words 
import spacy
nlp = spacy.load('en', parser=False)
def check_lemma(t,w) :
    r = [d for d in t if (nlp(d.text)[0].lemma_ != nlp(w.text)[0].lemma_)]
    return r

def get_word_synonym(word):
  filtered_words = [w for w in word.vocab if (not w.lower_ in stop) and w.is_lower == word.is_lower and w.prob >= -15] ## (not w.lower_ in stop) and
  similarity = sorted(filtered_words, key=lambda w: word.similarity(w), reverse=True)
  filtered_similarity = check_lemma(similarity[:30], word)
  return filtered_similarity[:3]

In [8]:
## Synonym dictionary
synonym_dict = {}

for word in words:
    if (not check_oos(word)) :
        synonym_dict.update({word : tuple([w.lower_ for w in get_word_synonym(nlp.vocab[word])])})
        #print(word, " : ", [w.lower_ for w in get_word_synonym(nlp.vocab[word])])

NameError: name 'check_oos' is not defined

In [12]:
## Only consider filtered synonym
import collections
value_occurrences = collections.Counter(synonym_dict.values())

filtered_synonym = {key: value for key, value in synonym_dict.items() if value_occurrences[value] == 1}

In [13]:
## Function for augmenting data by replacing words with synonym using spaCy
## This might not be best best method compared to data augmentation using language translation

import re
import random
sr = random.SystemRandom()
split_pattern = re.compile(r'\s+')
def data_augmentation(message, aug_range=1) :
    augmented_messages = []
    for j in range(0,aug_range) :
        new_message = ""
        for i in filter(None, split_pattern.split(message)) :
            new_message = new_message + " " + sr.choice(filtered_synonym.get(i,[i]))
        augmented_messages.append(new_message)
    return augmented_messages

In [16]:
## Dictionary for intent count
## Intent is column name
intent_count = df.jobflag.value_counts().to_dict()

In [17]:
## Get max intent count to match other minority classes through data augmentation
import operator
max_intent_count = max(intent_count.items(), key=operator.itemgetter(1))[1]

In [20]:
## Loop to interate all messages
import numpy as np
import math
import tqdm
newdf = pd.DataFrame()
for intent, count in intent_count.items() :
    count_diff = max_intent_count - count    ## Difference to fill
    multiplication_count = math.ceil((count_diff)/count)  ## Multiplying a minority classes for multiplication_count times
    if (multiplication_count) :
        old_message_df = pd.DataFrame()
        new_message_df = pd.DataFrame()
        for message in tqdm.tqdm(df[df["jobflag"] == intent]["description"]) :
            ## Extracting existing minority class batch
            dummy1 = pd.DataFrame([message], columns=['description'])
            dummy1["jobflag"] = intent
            old_message_df = old_message_df.append(dummy1)
            
            ## Creating new augmented batch from existing minority class
            new_messages = data_augmentation(message, multiplication_count)
            dummy2 = pd.DataFrame(new_messages, columns=['description'])
            dummy2["jobflag"] = intent
            new_message_df = new_message_df.append(dummy2)
        
        ## Select random data points from augmented data
        new_message_df=new_message_df.take(np.random.permutation(len(new_message_df))[:count_diff])
        
        ## Merge existing and augmented data points
        newdf = newdf.append([old_message_df,new_message_df])
    else :
        newdf = newdf.append(df[df["jobflag"] == intent])

100%|██████████| 624/624 [00:01<00:00, 468.46it/s]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,
100%|██████████| 583/583 [00:01<00:00, 486.29it/s]
100%|██████████| 348/348 [00:00<00:00, 463.68it/s]


In [22]:
## Print count of all new data points
newdf.jobflag.value_counts()

3    1376
2    1376
1    1376
4    1376
Name: jobflag, dtype: int64

In [25]:
## Save newdf back to file
newdf.to_csv("data/augumented_train.csv", index=False)