In [247]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import re
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from torch.nn.utils.rnn import pad_sequence, pack_sequence, pad_packed_sequence, pack_padded_sequence

### Import Data

In [215]:
#test = pd.read_csv("s3://advancedml-koch-mathur-hinkson/test.csv")

In [216]:
#test.head()

In [217]:
#train.head()

In [219]:
# train['label'] = train.target.apply(lambda x: assign_label(x))

In [220]:
#train.head()

In [221]:
#train = pd.read_csv("s3://advancedml-koch-mathur-hinkson/train.csv")

In [257]:
mini_preprocessed = pd.read_pickle('mini_preprocessed.pkl')

mini_preprocessed.head().style

In [12]:
assign_label = lambda x: 0 if x < 0.5 else 1


In [259]:
mini_preprocessed['label'] = mini_preprocessed.target.apply(lambda x: assign_label(x))


In [14]:
TOXIC_LABEL = 'toxic'
NOT_TOXIC_LABEL = 'not_toxic'  
VOCAB_SIZE = 7500

In [15]:
import torch
import torch.utils.data as tud

from collections import Counter, defaultdict

class TextData:
    def __init__(self, df, text_col='cleaned_no_stem'):
        # pull relevant data from df
        self.preprocessed_text = [word_list for word_list in df[text_col] ]
        
        # gather vocabulary corpus to store all words in training data
        self.vocab = Counter([word for comment in self.preprocessed_text 
                              for word in comment]
                            ).most_common(VOCAB_SIZE-1)

        # word to index mapping
        self.word_to_idx = {k[0]: v+1 for v, k in 
                            enumerate(self.vocab)}
        # all the unknown words will be mapped to index 0
        self.word_to_idx["UNK"] = 0 
        self.idx_to_word = {v:k for k, v in self.word_to_idx.items()}
        self.label_to_idx = {TOXIC_LABEL: 1, NOT_TOXIC_LABEL: 0}
        self.idx_to_label = [NOT_TOXIC_LABEL, TOXIC_LABEL]
        self.vocab = set(self.word_to_idx.keys())


In [16]:
class TextClassificationDataset(tud.Dataset):
    '''
    Get batched data for training and evaluation. 
    '''
    def __init__(self, word_to_idx, data):
        
        self.data = data
        self.word_to_idx = word_to_idx # dictionary {str: int}
        self.label_to_idx = {TOXIC_LABEL: 1, NOT_TOXIC_LABEL: 0} # dictionary
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = np.zeros(self.vocab_size)
        
        item = torch.from_numpy(item)
        # use both the document and label for training/ tuning (have comment and label)
        if len(self.data[idx]) == 2: 
            for word in word_tokenize(self.data[idx][0]):
                item[self.word_to_idx.get(word, 0)] += 1
            label = self.label_to_idx[self.data[idx][1]]
            return item, label
        
        else: # use the document without label for testing (have only comment)
            for word in word_tokenize(self.data[idx]):
                item[self.word_to_idx.get(word, 0)] += 1
            return item
        
        
     ##  Override single items' getter
    def __getitem__(self, idx):
        if idx + self.seq_length > self.__len__():
            if self.transforms is not None:
                item = torch.zeros(self.seq_length, self.dataset[0].__len__())
                item[:self.__len__()-idx] = self.transforms(self.dataset[idx:])
                return item, item
            else:
                item = []
                item[:self.__len__()-idx] = self.dataset[idx:]
                return item, item
        else:
            if self.transforms is not None:
                return self.transforms(self.dataset[idx:idx+self.seq_length]), self.transforms(self.dataset[idx:idx+self.seq_length])
            else:
                return self.dataset[idx:idx+self.seq_length], self.dataset[idx:idx+self.seq_length]


In [17]:
# comment_data = [tuple(x) for x in mini_preprocessed[['cleaned_no_stem', 'label']].values]

In [18]:
# comment_data[0]

## Create Data Loader

In [19]:
from fastai.text import *

In [275]:
CLASSES = [0, 1]
def get_texts(df, text_field="comment_text", label_field="label"):
    texts,labels = [],[]
    for idx,label in enumerate(CLASSES):
         for i in df.index:
            text = df.loc[i, text_field]
            label = df.loc[i, label_field]
#             print(text)
#             print(label)
#             print(fields)
#             examples.append(Example.fromlist([text, label], fields))
#         for fname in (path/label).glob('*.*'):
            texts.append(text)
            labels.append(label)
    return np.array(texts),np.array(labels), [label_field, text_field]

### Split Data into Train/ Test

In [260]:
from sklearn.model_selection import train_test_split

X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(mini_preprocessed.drop('label', axis=1), 
                                                                        mini_preprocessed['label'], 
                                                    test_size = 0.20, 
                                                    random_state = 4812)

print(X_train_data.shape, y_train_data.shape)
print(X_test_data.shape, y_test_data.shape)


(8000, 60) (8000,)
(2000, 60) (2000,)


In [276]:
trn_texts,trn_labels, col_names = get_texts(mini_preprocessed.loc[X_train_data.index])
val_texts,val_labels, _ = get_texts(mini_preprocessed.loc[X_test_data.index])
len(trn_texts),len(val_texts)

(16000, 4000)

In [21]:
col_names = ['labels','text']

### Shuffle indices in train and validation sets

In [277]:
np.random.seed(42)
trn_idx = np.random.permutation(len(trn_texts))
val_idx = np.random.permutation(len(val_texts))

In [278]:
trn_texts = trn_texts[trn_idx]
val_texts = val_texts[val_idx]
trn_labels = trn_labels[trn_idx]
val_labels = val_labels[val_idx]

In [288]:
# df_trn = pd.DataFrame({'text':trn_texts, 'labels':trn_labels}, 
#                       columns=col_names)
# df_val = pd.DataFrame({'text':val_texts, 'labels':val_labels},
#                       columns=col_names)
# df_trn[df_trn['labels']!=2].to_csv(data/'train.csv',
#                                    header=False, index=False)
# df_val.to_csv(CLAS_PATH/'test.csv', header=False, index=False)
with open('data/classes.txt', 'w') as f:
    f.writelines(f'{o}\n' for o in CLASSES)

In [289]:
with open('data/classes.txt', 'r') as r:
    print(r.readlines())

['0\n', '1\n']


In [291]:
trn_texts,val_texts = train_test_split(
    np.concatenate([trn_texts,val_texts]), test_size=0.1)
len(trn_texts), len(val_texts)

(18000, 2000)

In [None]:
chunksize=24000

In [297]:
re1 = re.compile(r'  +')

def fixup(x):
    '''
    H/T to the fastai.text team!
    '''
    x = x.replace('#39;', "'")\
        .replace('amp;', '&')\
        .replace('#146;', "'")\
        .replace('nbsp;', ' ')\
        .replace('#36;', '$')\
        .replace('\\n', "\n")\
        .replace('quot;', "'")\
        .replace('<br />', "\n")\
        .replace('\\"', '"')\
        .replace('<unk>', 'u_n')\
        .replace(' @.@ ', '.')\
        .replace(' @-@ ', '-')\
        .replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

In [304]:
cores = !getconf _NPROCESSORS_ONLN
cores

['8']

In [298]:
def get_texts(df, n_lbls=1, fieldflag):
    '''
    H/T to the fastai.text team!
    '''
    labels = df.iloc[:,range(n_lbls)].values.astype(np.int64)
    texts = f'\n{BOS} {FLD} 1 ' + df[n_lbls].astype(str)
    for i in range(n_lbls+1, len(df.columns)): 
        texts += f' {FLD} {i-n_lbls} ' + df[i].astype(str)
    texts = texts.apply(fixup).values.astype(str)
    
    tok = Tokenizer().proc_all_mp(partition_by_cores(texts))
    return tok, list(labels)

In [28]:
import re
import time
import pandas as pd



def clean_text(text, stop_ws=True, stemmer='Porter', str_output=False):
    
    # intialize stemmers
    from nltk.stem import PorterStemmer, LancasterStemmer
    ps = PorterStemmer() 
    ls = LancasterStemmer()
    
    t = text.replace("-", " ").split(" ")
    t = [w.strip(string.punctuation) for w in t]
    
    if stop_ws:
        from nltk.corpus import stopwords
        # define stopwords
        stops = set(stopwords.words('english'))
        stops.add('')

        approved_stop_words = {"not", "get", "against", "haven", "haven't","aren't", 
                               "aren", "should", "shouldn", "shouldn't", "themselves", 
                               "them", "under", "over", 'won', "won't", "wouldn'", 
                               "wouldn't"}
        stops = stops - approved_stop_words
        t = [w.lower() for w in t if w not in stop_ws]
    
    if stemmer:
        if stemmer == 'Porter':
            t = [ps.stem(w) for w in t]
        elif stemmer == 'Lancaster':
            t = [ls.stem(w) for w in t]
    
    if str_output:
        return ' '.join(t)
    else:
        return t