In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.preprocessing.text import Tokenizer,  text_to_word_sequence
from keras.engine.topology import Layer
from keras import initializers as initializers, regularizers, constraints
from keras.callbacks import Callback, ModelCheckpoint
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding, Input, Dense, LSTM, GRU, Bidirectional, TimeDistributed, Dropout
from keras import backend as K
from keras import optimizers
from keras.models import Model
import nltk
import re
import matplotlib.pyplot as plt
import sys
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from nltk import tokenize
import seaborn as sns

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
class AttentionLayer(Layer):
    """
    Hierarchial Attention Layer as described by Hierarchical Attention Networks for Document Classification(2016)
    - Yang et. al.
    Source: https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf
    Theano backend
    """
    def __init__(self,attention_dim=100,return_coefficients=False,**kwargs):
        # Initializer 
        self.supports_masking = True
        self.return_coefficients = return_coefficients
        self.init = initializers.get('glorot_uniform') # initializes values with uniform distribution
        self.attention_dim = attention_dim
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # Builds all weights
        # W = Weight matrix, b = bias vector, u = context vector
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)),name='W')
        self.b = K.variable(self.init((self.attention_dim, )),name='b')
        self.u = K.variable(self.init((self.attention_dim, 1)),name='u')
        self.trainable_weights = [self.W, self.b, self.u]

        super(AttentionLayer, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, hit, mask=None):
        # Here, the actual calculation is done
        uit = K.bias_add(K.dot(hit, self.W),self.b)
        uit = K.tanh(uit)
        
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)
        ait = K.exp(ait)
        
        if mask is not None:
            ait *= K.cast(mask, K.floatx())

        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = hit * ait
        
        if self.return_coefficients:
            return [K.sum(weighted_input, axis=1), ait]
        else:
            return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        if self.return_coefficients:
            return [(input_shape[0], input_shape[-1]), (input_shape[0], input_shape[-1], 1)]
        else:
            return input_shape[0], input_shape[-1]

    

In [3]:
MAX_SENTENCE_NUM = 95
MAX_WORD_NUM = 55
MAX_FEATURES = 200000 
EMBED_SIZE = 100

In [4]:
df=pd.read_csv("fakebr.csv",header=None)

In [5]:
df.columns = ['text', 'category']

In [6]:
df = df[['text', 'category']]

In [7]:
categories = df['category']
text = df['text']

In [8]:
import re
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub("\\n", " ", string)
    string = re.sub('\+\xa0', " ", string)
    string = re.sub("\xa0", " ", string)
    #string = re.sub(r"\u", " ", string)
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)  

    return string.strip().lower()

In [9]:
paras = []
labels = []
texts = []

In [10]:
labels = pd.get_dummies(categories)

In [11]:
sent_lens = []
sent_nums = []
for idx in range(df.text.shape[0]):
    text = clean_str(df.text[idx])
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    sent_nums.append(len(sentences))
    for sent in sentences:
        sent_lens.append(len(text_to_word_sequence(sent)))
    paras.append(sentences)

In [12]:
tokenizer = Tokenizer(num_words=MAX_FEATURES, oov_token=True)
tokenizer.fit_on_texts(texts)

In [13]:
data = np.zeros((len(texts), MAX_SENTENCE_NUM, MAX_WORD_NUM), dtype='int64')
for i, sentences in enumerate(paras):
    for j, sent in enumerate(sentences):
        if j< MAX_SENTENCE_NUM:
            wordTokens = text_to_word_sequence(sent)
            k=0
            for _, word in enumerate(wordTokens):
                try:
                    if k<MAX_WORD_NUM and tokenizer.word_index[word]<MAX_FEATURES:
                        data[i,j,k] = tokenizer.word_index[word]
                        k=k+1
                except:
                    print(word)
                    pass

In [14]:
import numpy as np
print(np.mean(sent_lens),2*np.std(sent_lens))
print(np.mean(sent_nums),2*np.std(sent_nums))

21.197674524355197 29.838632511480565
30.543333333333333 67.88308773177603


In [15]:
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

Total 98358 unique tokens.


In [16]:
print('Shape of data tensor:', data.shape)
print('Shape of labels tensor:', labels.shape)

Shape of data tensor: (7200, 95, 55)
Shape of labels tensor: (7200, 2)


In [17]:
GLOVE_DIR = "../embeddings/glove_s100.txt"
embeddings_index = {}
f = open(GLOVE_DIR)
for line in f:
    try:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        print(word)
        pass
f.close()
print('Total %s word vectors.' % len(embeddings_index))

r$
00
三藏法師玄奘奉
r$
Total 929594 word vectors.


In [18]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
absent_words = 0
for word, i in word_index.items():
        
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None and len(embedding_vector) == 100:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        absent_words += 1
print('Total absent words are', absent_words, 'which is', "%0.2f" % (absent_words * 100 / len(word_index)), '% of total words')

Total absent words are 21097 which is 21.45 % of total words


In [19]:
def han():
    """
    Create Keras functional model for hierarchical attention network
    """
    embedding_layer = Embedding(len(word_index) + 1,EMBED_SIZE,weights=[embedding_matrix], input_length=MAX_WORD_NUM, trainable=False,name='word_embedding')

    # Words level attention model
    word_input = Input(shape=(MAX_WORD_NUM,), dtype='int32',name='word_input')
    word_sequences = embedding_layer(word_input)
    word_gru = Bidirectional(GRU(50, return_sequences=True),name='word_gru')(word_sequences)
    word_dense = Dense(100, activation='relu', name='word_dense')(word_gru) 
    word_att,word_coeffs = AttentionLayer(EMBED_SIZE,True,name='word_attention')(word_dense)
    wordEncoder = Model(inputs = word_input,outputs = word_att)

    # Sentence level attention model
    sent_input = Input(shape=(MAX_SENTENCE_NUM,MAX_WORD_NUM), dtype='int32',name='sent_input')
    sent_encoder = TimeDistributed(wordEncoder,name='sent_linking')(sent_input)
    sent_gru = Bidirectional(GRU(50, return_sequences=True),name='sent_gru')(sent_encoder)
    sent_dense = Dense(100, activation='relu', name='sent_dense')(sent_gru) 
    sent_att,sent_coeffs = AttentionLayer(EMBED_SIZE,return_coefficients=True,name='sent_attention')(sent_dense)
    sent_drop = Dropout(0.5,name='sent_dropout')(sent_att)
    preds = Dense(2, activation='softmax',name='output')(sent_drop)

    # Model compile
    model = Model(sent_input, preds)
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['categorical_accuracy'])
    print(wordEncoder.summary())
    print(model.summary())
    
    return model

In [20]:
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state= 0)

es = EarlyStopping(monitor = 'loss', min_delta = 1e-3, patience = 10) # Monitora a funcao de loss e se em 10 epocas ela nao melhorar em pelo menos 1e-3 o treinamento termina
rlr = ReduceLROnPlateau(monitor = 'loss', factor = 0.2, patience = 5) # Diminui a taxa de aprendizagem se ficar 5 epocas sem melhorar o loss
mcp = ModelCheckpoint(filepath = 'pesos.h5', monitor = 'loss', save_best_only = True, verbose = 1) # Salva o melhor modelo que encontrar

classificador = KerasClassifier(build_fn = han, epochs = 2, batch_size = 200)



In [21]:
#from sklearn.model_selection import cross_val_score


#resultados = cross_val_score(estimator = classificador,
#                             X = data, y = categories.values,
#                             cv =5, scoring = 'accuracy', 
#                             fit_params= {'callbacks': [es, rlr]}
#                            )

In [22]:
from keras.utils import to_categorical
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
labels_dummy = to_categorical(categories)

In [23]:
folds = list(skf.split(data, categories))
checkpoint = ModelCheckpoint('best_model.h5', verbose=0, monitor='val_loss',save_best_only=True, mode='auto')

In [24]:
res = []
acc = []
model = han()
model.save_weights('model.h5')
for j, (train_idx, val_idx) in enumerate(folds):
    
    model.load_weights('model.h5')
    
    print('\nFold ',j)
    X_train_cv = data[train_idx]
    y_train_cv = labels_dummy[train_idx]
    X_valid_cv = data[val_idx]
    y_valid_cv = labels_dummy[val_idx]
    
    history = model.fit(X_train_cv, y_train_cv, validation_data=(X_valid_cv, y_valid_cv), 
                        epochs = 100 , batch_size=400, callbacks=[checkpoint,es])
    
    y_pred = model.predict(X_valid_cv)
    
    res.append(precision_recall_fscore_support(y_valid_cv.argmax(axis=1), y_pred.argmax(axis=1)))
    acc.append(accuracy_score(y_valid_cv.argmax(axis=1), y_pred.argmax(axis=1)))
    
 






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
word_input (InputLayer)      (None, 55)                0         
_________________________________________________________________
word_embedding (Embedding)   (None, 55, 100)           9835900   
_________________________________________________________________
word_gru (Bidirectional)     (None, 55, 100)           45300     
_________________________________________________________________
word_dense (Dense)           (None, 55, 100)           10100     
_________________________________________________________________
word_attention (AttentionLay [(None, 100), (None, 100, 10200     
Total params: 9,901,500
Trainable params: 65,600
Non-trainable params: 9,835,900
_______________________________________________________

Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100

Fold  1
Train on 5760 samples, validate on 1440 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100


Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100

Fold  2
Train on 5760 samples, validate on 1440 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100


Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 81/100
Epoch 82/100


Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100

Fold  3
Train on 5760 samples, validate on 1440 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100


Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100

Fold  4
Train on 5760 samples, validate on 1440 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100


Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100


In [25]:
r = model.predict(X_valid_cv)

In [26]:
categories.values

array([0, 0, 0, ..., 1, 1, 1])

In [27]:
type(labels_dummy)

numpy.ndarray

In [28]:
resultados

NameError: name 'resultados' is not defined

In [None]:
for i in r.argmax(axis=1):
    print(i)

In [29]:
res

[(array([0.97075209, 0.96814404]),
  array([0.96805556, 0.97083333]),
  array([0.96940195, 0.96948682]),
  array([720, 720])),
 (array([0.95225102, 0.9688826 ]),
  array([0.96944444, 0.95138889]),
  array([0.96077082, 0.96005606]),
  array([720, 720])),
 (array([0.98041958, 0.9737931 ]),
  array([0.97361111, 0.98055556]),
  array([0.97700348, 0.97716263]),
  array([720, 720])),
 (array([0.9762901 , 0.97233748]),
  array([0.97222222, 0.97638889]),
  array([0.97425191, 0.97435897]),
  array([720, 720])),
 (array([0.96027397, 0.97323944]),
  array([0.97361111, 0.95972222]),
  array([0.96689655, 0.96643357]),
  array([720, 720]))]

In [30]:
resultados = pd.DataFrame(columns=['Precision', 'PrecisionT', 'Recall', 'RecallT', 'Fscore', 'FscoreT', 'Acc'])
for i in range(5):
    resultados = resultados.append({'Precision':res[i][0][0], 'PrecisionT':res[i][0][1], 
                                    'Recall':res[i][1][0], 'RecallT':res[i][1][1], 'Fscore':res[i][2][0], 
                                    'FscoreT':res[i][2][1], 'Acc':acc[i]},ignore_index=True)
    print(resultados)

   Precision  PrecisionT    Recall   RecallT    Fscore   FscoreT       Acc
0   0.970752    0.968144  0.968056  0.970833  0.969402  0.969487  0.969444
   Precision  PrecisionT    Recall   RecallT    Fscore   FscoreT       Acc
0   0.970752    0.968144  0.968056  0.970833  0.969402  0.969487  0.969444
1   0.952251    0.968883  0.969444  0.951389  0.960771  0.960056  0.960417
   Precision  PrecisionT    Recall   RecallT    Fscore   FscoreT       Acc
0   0.970752    0.968144  0.968056  0.970833  0.969402  0.969487  0.969444
1   0.952251    0.968883  0.969444  0.951389  0.960771  0.960056  0.960417
2   0.980420    0.973793  0.973611  0.980556  0.977003  0.977163  0.977083
   Precision  PrecisionT    Recall   RecallT    Fscore   FscoreT       Acc
0   0.970752    0.968144  0.968056  0.970833  0.969402  0.969487  0.969444
1   0.952251    0.968883  0.969444  0.951389  0.960771  0.960056  0.960417
2   0.980420    0.973793  0.973611  0.980556  0.977003  0.977163  0.977083
3   0.976290    0.972337 

In [31]:
resultados.to_csv("res.csv")