# Tweet hate speech classifier

In [1]:
import numpy  as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, matthews_corrcoef

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, BatchNormalization, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


## Load the data

In [2]:
# Labelled training tweets; the preview below shows id, label and tweet columns.
train = pd.read_csv( "train_E6oV3lV.csv" )
# Unlabelled test tweets; this same file is read again just before inference.
test = pd.read_csv( "test_tweets_anuFYb8.csv" )

In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


## Define some useful functions

In [4]:
def uniqueWords( data ):
    """Collect every distinct whitespace-separated token found in data.

    data is an iterable of strings; the result is a set of tokens.
    """

    return { token for text in data for token in text.split() }

def uniqueChars( data ):
    """Collect every distinct character found in the strings of data."""

    seen = set()

    for text in data:
        # set.update iterates the string, adding one character at a time.
        seen.update( text )

    return seen

def tweetLens( data ):
    """Return a list holding the length of each tweet in data, in order."""

    return [ len(text) for text in data ]

def genDicts( chars ):
    """Generate dictionaries used to encode/decode the tweets.

    Characters are numbered from 1 in sorted order, so index 0 is never
    assigned to a character.

    chars may be any iterable of characters; it is consumed exactly once,
    so one-shot iterators are handled correctly.

    Returns the pair (c2i, i2c): character -> index and index -> character.
    """

    # Sort a single time and fill both tables from the same ordering.
    ordered = sorted( chars )

    c2i = {}
    i2c = {}

    for idx, ch in enumerate( ordered, start = 1 ):
        c2i[ch]  = idx
        i2c[idx] = ch

    return c2i, i2c

def encode( tweet, c2i ):
    """Translate tweet into a list of integers, one per character.

    Each character is looked up in c2i; a character missing from c2i
    raises KeyError.
    """

    return [ c2i[ch] for ch in tweet ]

def decode( code, i2c ):
    """Decode a list of integers into a tweet.

    Index 0 is skipped when i2c has no entry for it, so sequences that
    were padded with 0 decode back to the unpadded text.  Any other
    index missing from i2c raises KeyError.
    """

    sentence = []

    for i in code:
        # padList/genData pad with 0, which genDicts never maps to a character.
        if i == 0 and 0 not in i2c:
            continue

        sentence.append( i2c[i] )

    return "".join(sentence)

def padList( l, p, n ):
    """Extend l in place with copies of p until it holds at least n items.

    A list that already has n or more items is left untouched.  The same
    list object that was passed in is returned.
    """

    missing = n - len(l)

    if missing > 0:
        l.extend( [p] * missing )

    return l

def genData( data, c2i, maxLen = 300 ):
    """Encode/pad the data set.

    Parameters
    ----------
    data   : frame exposing a `tweet` attribute (iterable of strings) and
             a "label" column.
    c2i    : character -> integer mapping passed to encode.
    maxLen : number of entries in every encoded row.  Shorter tweets are
             right-padded with 0 and longer ones are truncated.  Defaults
             to 300, the value that used to be hard-coded.

    Returns
    -------
    (X, y) : X is an array with one encoded row per tweet, y is an array
             built from data["label"].
    """

    X = []

    for t in data.tweet:
        # Truncate before padding so every row has exactly maxLen entries;
        # rows of unequal length cannot form a rectangular array.
        code = encode( t, c2i )[:maxLen]
        code = padList( code, 0, maxLen )

        X.append(code)

    return np.array(X), np.array( data["label"] )

## Enumerate the unique characters present in the data set

In [5]:
# Distinct whitespace-separated tokens of each split.
# NOTE(review): w1/w2 are not referenced again in the cells shown here.
w1 = uniqueWords( test.tweet )
w2 = uniqueWords( train.tweet )

# Distinct characters of each split; their union becomes the encoding alphabet.
c1 = uniqueChars( test.tweet )
c2 = uniqueChars( train.tweet )

# Per-tweet character counts.
# NOTE(review): l1/l2 are not referenced again in the cells shown here;
# genData pads to a fixed length of 300 instead of deriving it from these.
l1 = tweetLens( test.tweet )
l2 = tweetLens( train.tweet )

In [6]:
# Alphabet covering both splits, so every test character has an encoding.
chars = c1 | c2

In [7]:
# Lookup tables: character -> index (c2i) and index -> character (i2c).
c2i, i2c = genDicts( chars )

## Train the model

In [8]:
# Encode every training tweet into a fixed-length row of integers.
X, y = genData( train, c2i )

# Hold out 10% of the rows for validation; random_state is fixed so the split is repeatable.
trainX, valX, trainY, valY = train_test_split( X, y, test_size = 0.1, random_state = 1 )

In [9]:
def genModel( embed = 32, nl = 1, nh = 32, do = 0.5, rdo = 0.5, vocab = 167, maxLen = 300 ):
    """Build and compile the character-level bidirectional LSTM classifier.

    Parameters
    ----------
    embed  : size of each character embedding vector.
    nl     : number of stacked bidirectional LSTM layers.  The final layer
             is always added, so values below 1 still give one layer.
    nh     : units per LSTM direction.
    do     : `dropout` rate passed to every LSTM.
    rdo    : `recurrent_dropout` rate passed to every LSTM.
    vocab  : `input_dim` of the embedding.  It has to exceed the largest
             integer code fed to the model (e.g. len(c2i) + 1 when codes
             start at 1 and 0 is the padding value).  Defaults to 167,
             the value that used to be hard-coded.
    maxLen : length of the padded input rows.  Defaults to 300, the value
             that used to be hard-coded.

    Returns
    -------
    A compiled Sequential model with one sigmoid output, using binary
    cross-entropy loss, the adam optimizer and the accuracy metric.
    """

    model = Sequential()
    # mask_zero = True makes the embedding treat index 0 (the padding value) as masked.
    model.add( Embedding( input_dim = vocab, output_dim = embed, input_length = maxLen, mask_zero = True ) )
    model.add( BatchNormalization() )

    # Every layer except the last returns the full sequence so the next LSTM can consume it.
    for _ in range(nl - 1):
        model.add( Bidirectional( LSTM( nh, dropout = do, recurrent_dropout = rdo, return_sequences = True ) ) )

    model.add( Bidirectional( LSTM( nh, dropout = do, recurrent_dropout = rdo ) ) )
    model.add( BatchNormalization() )
    model.add( Dense(1, activation = "sigmoid") )

    model.compile( loss = "binary_crossentropy", optimizer = "adam", metrics = ['acc'] )

    return model

In [10]:
# Earlier configuration, kept for reference. The trailing number is the author's
# recorded result for it (presumably a validation loss -- confirm).
#model = genModel( embed = 64, nl = 2, nh = 128 ) #0.07193
model = genModel( embed = 64, nl = 3, nh = 64 )

model.summary()

# Stop training once validation loss has gone 10 epochs without improving.
earlyStoper  = EarlyStopping( patience = 10, verbose = 1 )
# Overwrite best.hdf5 only when validation loss improves.
checkPointer = ModelCheckpoint( filepath = "best.hdf5", save_best_only = True, verbose = 1 )

# epochs is only an upper bound; early stopping is what ends the run.
hist = model.fit( trainX, trainY,
                  validation_data = (valX, valY),
                  epochs = 5000,
                  #verbose = 0,
                  batch_size = 512,
                  callbacks = [earlyStoper, checkPointer] )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 64)           10688     
_________________________________________________________________
batch_normalization_1 (Batch (None, 300, 64)           256       
_________________________________________________________________
bidirectional_1 (Bidirection (None, 300, 128)          66048     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 300, 128)          98816     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               98816     
_________________________________________________________________
batch_normalization_2 (Batch (None, 128)               512       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total para

Epoch 25/5000
Epoch 00025: val_loss did not improve
Epoch 26/5000
Epoch 00026: val_loss did not improve
Epoch 27/5000
Epoch 00027: val_loss improved from 0.10311 to 0.10232, saving model to best.hdf5
Epoch 28/5000
Epoch 00028: val_loss improved from 0.10232 to 0.10044, saving model to best.hdf5
Epoch 29/5000
Epoch 00029: val_loss improved from 0.10044 to 0.09717, saving model to best.hdf5
Epoch 30/5000
Epoch 00030: val_loss improved from 0.09717 to 0.09700, saving model to best.hdf5
Epoch 31/5000
Epoch 00031: val_loss did not improve
Epoch 32/5000
Epoch 00032: val_loss did not improve
Epoch 33/5000
Epoch 00033: val_loss did not improve
Epoch 34/5000
Epoch 00034: val_loss improved from 0.09700 to 0.09675, saving model to best.hdf5
Epoch 35/5000
Epoch 00035: val_loss did not improve
Epoch 36/5000
Epoch 00036: val_loss improved from 0.09675 to 0.09651, saving model to best.hdf5
Epoch 37/5000
Epoch 00037: val_loss improved from 0.09651 to 0.09212, saving model to best.hdf5
Epoch 38/5000
Ep

Epoch 84/5000
Epoch 00084: val_loss did not improve
Epoch 85/5000
Epoch 00085: val_loss did not improve
Epoch 86/5000
Epoch 00086: val_loss did not improve
Epoch 87/5000
Epoch 00087: val_loss did not improve
Epoch 88/5000
Epoch 00088: val_loss did not improve
Epoch 89/5000
Epoch 00089: val_loss did not improve
Epoch 90/5000
Epoch 00090: val_loss did not improve
Epoch 91/5000
Epoch 00091: val_loss improved from 0.07008 to 0.06990, saving model to best.hdf5
Epoch 92/5000
Epoch 00092: val_loss did not improve
Epoch 93/5000
Epoch 00093: val_loss did not improve
Epoch 94/5000
Epoch 00094: val_loss did not improve
Epoch 95/5000
Epoch 00095: val_loss did not improve
Epoch 96/5000
Epoch 00096: val_loss did not improve
Epoch 97/5000
Epoch 00097: val_loss did not improve
Epoch 98/5000
Epoch 00098: val_loss did not improve
Epoch 99/5000
Epoch 00099: val_loss improved from 0.06990 to 0.06952, saving model to best.hdf5
Epoch 100/5000
Epoch 00100: val_loss did not improve
Epoch 101/5000
Epoch 00101:

## Evaluate the model on the validation set

In [11]:
# Restore the checkpointed weights saved during training instead of the last-epoch weights.
model.load_weights( "best.hdf5" )
# Sigmoid outputs for the validation rows, one probability per row.
pred = model.predict( valX )

In [12]:
# Threshold the sigmoid outputs once (np.round maps them to 0/1) and
# reuse the result for every metric instead of recomputing it each time.
valPred = np.round( pred.flatten() )

f1   = f1_score( valY, valPred )
acc  = accuracy_score( valY, valPred )
rec  = recall_score( valY, valPred )
prec = precision_score( valY, valPred )
mcc  = matthews_corrcoef( valY, valPred )

# The format indices are deliberately out of order: arguments are (acc, f1, rec, prec, mcc).
print( "valAcc:    {0:1.4f}\nRecall:    {2:1.4f}\nPrecision: {3:1.4f}\nF1:        {1:1.4f}\nMCC:       {4:1.4f}".format( acc, f1, rec, prec, mcc) )

valAcc:    0.9790
Recall:    0.8541
Precision: 0.8578
F1:        0.8559
MCC:       0.8446


## Make inferences on the test data

In [13]:
# NOTE(review): test was already loaded in the first data cell; this re-read
# looks redundant unless the frame was modified in between -- confirm.
test = pd.read_csv( "test_tweets_anuFYb8.csv" )
# genData reads data["label"], so add a placeholder column; the real labels are unknown.
test["label"] = 0

# Only the encoded tweets are needed; the placeholder labels are discarded.
testData, _ = genData( test, c2i )

In [14]:
# NOTE(review): this reuses the name `pred`, replacing the validation
# predictions; re-run the validation cells before re-computing metrics.
pred = model.predict( testData )
# Threshold the sigmoid outputs and store integer 0/1 labels, matching the
# integer labels of the training file (np.round alone would leave floats).
test["label"] = np.round( pred.flatten() ).astype( int )
test[ ["id", "label"] ].to_csv( "submission.csv", index = False )

According to the [checker](https://datahack.analyticsvidhya.com/contest/practice-problem-twitter-sentiment-analysis/), this model achieves the following $F_1$ score on the test set:

$$F_1 = 0.8311688312.$$