In [1]:
%matplotlib inline
from keras.preprocessing.text import text_to_word_sequence
from utils import *
from __future__ import division, print_function
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

Using TensorFlow backend.


## Load the data

In [2]:
path = "data/"
batch_size=64
max_features = 20000
maxlen = 100

In [3]:
_trainData = pd.read_csv(path+'train.csv')

In [4]:
list_sentences_train = _trainData["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
labels_train = _trainData[list_classes].values

In [5]:
_testData = pd.read_csv(path+'test.csv')

In [6]:
list_sentences_test = _testData["comment_text"].fillna("_na_").values

## Process sentences

In [7]:
special_character_removal=re.compile(r'[^a-z\d ]',re.IGNORECASE)
replace_numbers=re.compile(r'\d+',re.IGNORECASE)

In [8]:
def text_to_wordlist(text, remove_stopwords=True, stem_words=True):
    #Remove Special Characters
    text=special_character_removal.sub('',text)
    
    #Replace Numbers
    text=replace_numbers.sub('n',text)
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [9]:
def get_glove_dataset(dataset):
    """Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('data/glove/results')
    %mkdir -p $glove_path
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset, None),
                    untar=True)

In [10]:
def load_vectors(loc):
    return (load_array(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb'),encoding='latin1'),
        pickle.load(open(loc+'_idx.pkl','rb'),encoding='latin1'))

In [11]:
vecs, words, wordidx = load_vectors(get_glove_dataset('6B.50d'))


A subdirectory or file -p already exists.
Error occurred while processing: -p.
A subdirectory or file c:\Dev\learning\data\glove\results already exists.
Error occurred while processing: c:\Dev\learning\data\glove\results.


In [12]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [13]:
comments = []
for text in list_sentences_train:
    comments.append(text_to_wordlist(text))

In [16]:
print(comments[0])

nonsens kiss geek said true ill account termin


In [17]:
test_comments=[]
for text in list_sentences_test:
    test_comments.append(text_to_wordlist(text))

## Vectorize words

In [18]:
tokenizer = Tokenizer(num_words=max_features,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)
# tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(comments + test_comments))
comments_sequence = tokenizer.texts_to_sequences(comments)

In [19]:
test_comments_sequence = tokenizer.texts_to_sequences(test_comments)    
X_train = sequence.pad_sequences(comments_sequence , maxlen=maxlen)
Test_train = sequence.pad_sequences(test_comments_sequence, maxlen=maxlen)

In [20]:
emb_mean,emb_std = vecs.mean(), vecs.std()
emb_mean,emb_std


(0.020940498, 0.6441043)

In [21]:
word_index = tokenizer.word_index
nb_words = vecs.shape[0]
embed_size= vecs.shape[1]
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    vec_idx = wordidx.get(word)
    if vec_idx is not None:
            embedding_vector = vecs[vec_idx]
            if embedding_vector is not None: embedding_matrix[i] = embedding_vector
   

In [None]:
## model

In [23]:
model = Sequential([
   Embedding(nb_words, embed_size, input_length=maxlen, dropout=0.2, 
              weights=[embedding_matrix]),
    Convolution1D(128, 3, border_mode='same', activation='relu'),
    Dropout(0.25),
    MaxPooling1D(),
    Convolution1D(128, 4, border_mode='same', activation='relu'),
    Dropout(0.25),
    MaxPooling1D(),
    Convolution1D(128, 5, border_mode='same', activation='relu'),
    Dropout(0.25),
    MaxPooling1D(),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(6, activation='softmax')])

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  import sys
  # Remove the CWD from sys.path while we load stuff.


In [24]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 50)           20000000  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 128)          19328     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 50, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 50, 128)           65664     
_________________________________________________________________
dropout_2 (Dropout)          (None, 50, 128)           0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 25, 128)           0         
__________

In [None]:
model.fit(X_train, labels_train,nb_epoch=2, batch_size=64)



Epoch 1/2
19776/95851 [=====>........................] - ETA: 8:35 - loss: 0.3246 - acc: 0.8454

In [None]:
preds = model.predict(Test_train)

In [None]:
print(preds.shape)

In [None]:
sample_submission = pd.read_csv(f'{path}sample_submission.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv('submission_textcnn.csv', index=False)