In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from mlxtend.classifier import StackingClassifier
import numpy as np
import pandas as pd
import pickle

In [50]:
class NeuralNetwork(object):
    def __init__(self,features):
        '''Constructor of the class
            Args:
                features(matrix) - feature matrix where the columns describe the features and the rows the samples
        '''
        self.features=features
        
    def set_labels(self,labels):
        '''Method which sets the labels, i.e. the targets we want to predict based on the features
            Args:
                labels(vector) - vector where the i-th element is the label which corresponds to the i-th row of the features
                matrix
        '''
        self.labels=labels
        # the code below implements one-hot encoding of the labels
        lb=LabelBinarizer()
        lb.fit([label[0] for label in labels])
        self.labels_one_hot=lb.transform(labels)

    def get_train_test_split(self,test_size=0.2):
        '''Method which splits the features and labels in a training set and a testing set
            Args:
                test_size(number) - defines the size of the test set
        '''
        #print(self.features,self.labels_one_hot)
        X_train,X_test,y_train,y_test= train_test_split(self.features,self.labels_one_hot,test_size=test_size)
        return (X_train,X_test,y_train,y_test)
        
    def get_batches(self,batch_size):
        '''Helper method to devide the features and labels in mini-batches
            Args:
                batch_size(number) - size of the batches e.g. 256 (should fit in memory of the machine)
        '''
        assert len(self.features) == len(self.labels)
        output_batches = []
    
        sample_size = len(self.features)
        for start_i in range(0, sample_size, batch_size):
            end_i = start_i + batch_size
            batch = [self.features[start_i:end_i], self.labels_one_hot[start_i:end_i]]
            output_batches.append(batch)
        
        return output_batches
    
    def build_neural_net(self,n_hidden_nodes):
        '''Method which builds the Neuronal Network in TensorFlow
            Args:
                n_hidden_nodes(number) - number of hidden notes of the network
            
        '''
        self.n_hidden_nodes=n_hidden_nodes
        self.n_features=40000
        self.n_labels=len(self.labels_one_hot[0])
        
        self.x=tf.placeholder(dtype=tf.float32,shape=[None,self.n_features])
        self.y=tf.placeholder(dtype=tf.float32,shape=[None,self.n_labels])
        
        self.w1=tf.Variable(tf.truncated_normal([self.n_features,self.n_hidden_nodes]))
        self.w2=tf.Variable(tf.truncated_normal([self.n_hidden_nodes,self.n_labels]))
        self.b1=tf.Variable(tf.zeros([self.n_hidden_nodes]))
        self.b2=tf.Variable(tf.zeros([self.n_labels]))
        
        h1=tf.matmul(self.x,self.w1)+self.b1
        a1=tf.nn.relu(h1)
        
        h2=tf.matmul(a1,self.w2)+self.b2
        a2=tf.nn.relu(h2)
        
        self.output=tf.nn.softmax(a2)
        self.prediction = tf.argmax(self.output,1)
        self.correct_prediction = tf.equal(self.prediction, tf.argmax(self.y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32))
        
        self.cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.output,labels=self.y))
        
    def train_model(self,learning_rate,epochs,batch_size,test_size=0.2):
        '''Method which trains the model
            Args:
                learning_rate(number) - learning rate used in gradient descent
                epochs(number) - number of iterations (epochs) used in gradient descent
                batch_size(number) - size of the mini batches used in training
                test_size(number) - size of the test set
        '''
        self.lr=learning_rate
        self.optimizer=tf.train.GradientDescentOptimizer(learning_rate=self.lr).minimize(self.cost)
        X_train,X_test,y_train,y_test=self.get_train_test_split(test_size)
        batches=self.get_batches(batch_size)

        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            
            for epoch in range(epochs):
                for X_batch,y_batch in batches:
                    sess.run(self.optimizer,feed_dict={self.x:X_batch,self.y:y_batch})
                cost_train=sess.run(self.cost,feed_dict={self.x:X_train,self.y:y_train})
                accuracy_test=sess.run(self.accuracy,feed_dict={self.x:X_test,self.y:y_test})
                accuracy_train=sess.run(self.accuracy,feed_dict={self.x:X_train,self.y:y_train})
                print("In epoch {} is the cost equals {}".format(epoch,cost_train))
                print("In epoch {} is the accuracy on the training set equals {}".format(epoch,accuracy_train))
                print("In epoch {} is the accuracy on the test set equals {}".format(epoch,accuracy_test)) 

In [4]:
train = pickle.load(open('train.p','rb'))
test = pickle.load(open('test.p','rb'))

train_text = train['comment_text']
test_text = test
all_text = pd.concat([train_text, test_text])

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 4),
    max_features=40000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)
print('word TFIDF')

'''char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 4),
    max_features=40000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)
print('char TFIDF')'''


word TFIDF


"char_vectorizer = TfidfVectorizer(\n    sublinear_tf=True,\n    strip_accents='unicode',\n    analyzer='char',\n    ngram_range=(1, 4),\n    max_features=40000)\nchar_vectorizer.fit(all_text)\ntrain_char_features = char_vectorizer.transform(train_text)\ntest_char_features = char_vectorizer.transform(test_text)\nprint('char TFIDF')"

In [36]:
features=train_word_features
labels = ['toxic','severe_toxic','obscene','threat','insult','identity_hate','clean']
features

<159571x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 14083710 stored elements in Compressed Sparse Row format>

In [44]:
network=NeuralNetwork(features)

In [45]:
network.set_labels(labels)

In [46]:
network.build_neural_net(3)

In [49]:
network.train_model(learning_rate=0.9,epochs=7,batch_size=2)

  (0, 39810)	0.132426083394
  (0, 38674)	0.118646594328
  (0, 38606)	0.0720239554381
  (0, 37803)	0.115209148362
  (0, 37549)	0.0953624053108
  (0, 37453)	0.120503226774
  (0, 36974)	0.150316844476
  (0, 36963)	0.0916308549876
  (0, 34228)	0.129325790864
  (0, 34213)	0.108455589355
  (0, 34212)	0.106938262379
  (0, 33110)	0.105994943377
  (0, 32601)	0.0616612247364
  (0, 31879)	0.173571140895
  (0, 31872)	0.105219812606
  (0, 31730)	0.0780626377645
  (0, 31715)	0.0642369228001
  (0, 30328)	0.0665750210564
  (0, 30014)	0.126745521365
  (0, 30013)	0.104186004288
  (0, 30005)	0.0841346115241
  (0, 28690)	0.112377445164
  (0, 28666)	0.0866868354711
  (0, 28645)	0.153010188623
  (0, 28410)	0.100287767968
  :	:
  (159570, 15528)	0.151561627548
  (159570, 15324)	0.0662604535785
  (159570, 14868)	0.17887046985
  (159570, 14863)	0.120196039822
  (159570, 14859)	0.0728076336854
  (159570, 14770)	0.088156210304
  (159570, 14130)	0.0428512295895
  (159570, 13647)	0.167589119976
  (159570, 13610)	0

ValueError: Found input variables with inconsistent numbers of samples: [159571, 7]

In [34]:
network.features#,self.labels_one_hot

<159571x40000 sparse matrix of type '<class 'numpy.float64'>'
	with 14083710 stored elements in Compressed Sparse Row format>

In [35]:
network.labels_one_hot

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [53]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D,Bidirectional
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import matplotlib.pyplot as plt
%matplotlib inline
import gensim.models.keyedvectors as word2vec
import gc



In [57]:
train = pickle.load(open('train.p','rb'))
test = pickle.load(open('test.p','rb'))

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_train = train["comment_text"]
list_sentences_test = test


In [58]:
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

In [59]:
maxlen = 200
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [69]:
def loadEmbeddingMatrix(typeToLoad):
        #load different embedding file from Kaggle depending on which embedding 
        #matrix we are going to experiment with
        if(typeToLoad=="glove"):
            EMBEDDING_FILE='../input/glove-twitter/glove.twitter.27B.25d.txt'
            embed_size = 25
        elif(typeToLoad=="word2vec"):
            word2vecDict = word2vec.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)
            embed_size = 300
        elif(typeToLoad=="fasttext"):
            EMBEDDING_FILE='../input/fasttext/wiki.simple.vec'
            embed_size = 300

        if(typeToLoad=="glove" or typeToLoad=="fasttext" ):
            embeddings_index = dict()
            #Transfer the embedding weights into a dictionary by iterating through every line of the file.
            f = open(EMBEDDING_FILE)
            for line in f:
                #split up line into an indexed array
                values = line.split()
                #first index is word
                word = values[0]
                #store the rest of the values in the array as a new array
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs #50 dimensions
            f.close()
            print('Loaded %s word vectors.' % len(embeddings_index))
        else:
            embeddings_index = dict()
            for word in word2vecDict.wv.vocab:
                embeddings_index[word] = word2vecDict.word_vec(word)
            print('Loaded %s word vectors.' % len(embeddings_index))
            
        gc.collect()
        #We get the mean and standard deviation of the embedding weights so that we could maintain the 
        #same statistics for the rest of our own random generated weights. 
        all_embs = np.stack(list(embeddings_index.values()))
        emb_mean,emb_std = all_embs.mean(), all_embs.std()
        
        nb_words = len(tokenizer.word_index)
        #We are going to set the embedding size to the pretrained dimension as we are replicating it.
        #the size will be Number of Words in Vocab X Embedding Size
        embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
        gc.collect()

        #With the newly created embedding matrix, we'll fill it up with the words that we have in both 
        #our own dictionary and loaded pretrained embedding. 
        embeddedCount = 0
        for word, i in tokenizer.word_index.items():
            i-=1
            #then we see if this word is in glove's dictionary, if yes, get the corresponding weights
            embedding_vector = embeddings_index.get(word)
            #and store inside the embedding matrix that we will train later on.
            if embedding_vector is not None: 
                embedding_matrix[i] = embedding_vector
                embeddedCount+=1
        print('total embedded:',embeddedCount,'common words')
        
        del(embeddings_index)
        gc.collect()
        
        #finally, return the embedding matrix
        return embedding_matrix

In [70]:
embedding_matrix = loadEmbeddingMatrix('word2vec')

ValueError: not enough values to unpack (expected 2, got 0)