# Set constant

In [1]:
max_len = 20 # max sentence size

# Load libaries

In [2]:
from freq_utils import *

import regex as re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score

from tensorflow.keras.utils import to_categorical

from joblib import dump

pd.options.display.max_colwidth = 200

# Load dataset

In [3]:
df0 = pd.read_csv('data/TrueOrganized.csv')
df1 = pd.read_csv('data/FakeOrganized.csv')
df0['label'] = 0
df1['label'] = 1

# Make dictionaries

### Pretrained word embeddings
- Word to index
- Word to vector

In [4]:
word_to_index, index_to_word, word_to_vector = get_pretrained_embedding()

### PoS tag encodings
- PoS word to index

In [5]:
df = pd.concat([df0.cleaned_pos, df1.cleaned_pos])

pos_set = set()
for x in list(df.str.lower().str.split()):
    pos_set.update(x)

pos_list = list(pos_set)
pos_to_index = { pos_list[i]: i for i in range(len(pos_list)) }

print(pos_to_index)

{'wdt': 0, 'jj': 1, 'jjs': 2, "''": 3, 'rp': 4, 'fw': 5, ':': 6, 'vbg': 7, 'jjr': 8, 'sym': 9, 'prp$': 10, 'wp': 11, 'dt': 12, 'pos': 13, 'nnp': 14, 'nns': 15, 'ex': 16, 'in': 17, 'vbp': 18, 'pdt': 19, 'md': 20, 'vbn': 21, 'wrb': 22, 'vbd': 23, 'cc': 24, 'nnps': 25, 'rbr': 26, 'prp': 27, 'uh': 28, 'vb': 29, 'nn': 30, 'wp$': 31, 'rb': 32, 'rbs': 33, '$': 34, '.': 35, 'vbz': 36, 'cd': 37, 'to': 38}


In [6]:
len(pos_to_index)

39

# Train/dev/test split

In [7]:
train, dev, test = train_dev_test_split([df0, df1], m=10000, class_column='label', 
                                    class_balance=True, r_dev=0.2, r_test=0.2, rand_state=42)

# Define models

In [8]:
def simple_Manual(test):
    
    y_true = test.label
    y_pred = []
    
    # title size > 20?
    y_pred.append( test.apply(lambda row: 1 if len(row['lower_title'].split())> 20 else 0, axis=1) )
    # noise > 3 
    y_pred.append( test.apply(lambda row: 1 if len(re.findall(re.compile('[^\s\w]'), row['lower_title'])) > 5 else 0, axis=1) )
    # clickbait, slang, first names
    trigger_word = ['_mytag_parentheses_', '_mytag_slang_',
                    'donald','obama','hillary','bernie']

    for i in range(len(trigger_word)):
        y_pred.append( test.minimal_words.str.contains(trigger_word[i])*1 )

    cut_name = ['too_long','noisy','clickbait','slang'] + trigger_word[-4:]

    return cut_name, y_true, y_pred

In [9]:
def simple_NB(train,dev,test,Xname='title',Yname='label'):
    
    train = pd.concat([train,dev])
    
    X_train = train[Xname].tolist()
    Y_train = train[Yname].tolist()

    X_test = test[Xname].tolist()
    Y_test = test[Yname]#.tolist() -> commented out to keep dataframe index
    
    counter = CountVectorizer()

    counter.fit(X_train+X_test)

    train_counts = counter.transform(X_train)
    test_counts = counter.transform(X_test)

    #print(counter.vocabulary_)

    classifier = MultinomialNB()
    classifier.fit(train_counts,Y_train)
    
    predict = classifier.predict(test_counts)
    
    proba = classifier.predict_proba(test_counts)
    
    model_name = 'Naive Bayes - '+Xname
    
    
    return model_name, predict, Y_test, proba, classifier, counter

In [10]:
def model_FNN(input_shape, word_to_index, word_to_vector, n_class=2, trainable=False):
    '''
    input_shape: (max_len,)
    word_to_index: word to index dictionary
    word_to_vector: word to embedding vector dictionary

    return model

    then
    X: Indices of a sentence (m, max_len)
    Y: Class probability, one hot vector (m, # classes)
    '''

    # Input layer
    # X_indices (m, max_len)
    X_input = tfl.Input(shape=input_shape, dtype='int32')
    
    X = X_input
    if bool(word_to_vector):
        # Word embeding
        # Output: (m, max_len, emb_dim)
        # Embedding layer
        embedding_layer = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)
        X = embedding_layer(X)      
    else:
        # One hot encoding (=input)
        # Output: (m, max_len, num_cat)
        X = tf.dtypes.cast(X, tf.float32)    

    # Masking layer
    # skip zero vector words
    X = tfl.Masking(mask_value=0.)(X)
    
    # Take average
    max_len = X.shape[1]
    # Make a list from slice
    X_avg = [ X[:,i,:] for i in range(max_len) ]
    # Take average of a sentence
    X = tf.keras.layers.Average()(X_avg)    
        
    # Linear+ReLu layer
    X = tfl.Dense(units = 128, activation='relu')(X)
    X = tfl.Dropout(rate = 0.4)(X)  

    # Linear+ReLu layer
    X = tfl.Dense(units = 64, activation='relu')(X)
    X = tfl.Dropout(rate = 0.4)(X) 

    # Linear+ReLu layer
    X = tfl.Dense(units = 32, activation='relu')(X)
    X = tfl.Dropout(rate = 0.2)(X) 

    # Linear+Softmax layer
    # Output: y (m, # classes), probability of each class
    X = tfl.Dense(units = n_class, activation='softmax')(X)

    # Model
    model = tf.keras.models.Model(inputs=X_input, outputs=X)

    return model

In [11]:
def model_LSTM(input_shape, word_to_index, word_to_vector=False, n_class=2, trainable=False):
    '''
    input_shape: (max_len,) or (max_len, num_cat)
    word_to_index: word to index dictionary, False for one hot encoding
    word_to_vector: word to embedding vector dictionary, False for one hot encoding

    return model

    then
    X: Indices of a sentence (m, max_len)
    Y: Class probability, one hot vector (m, # classes)
    '''

    # Input layer
    # X_oh (m, max_len, num_cat)
    # X_indices (m, max_len)
    X_input = tfl.Input(shape=input_shape, dtype='int32')

    # Embedding layer
    #embedding_layer = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)
    #X = embedding_layer(X_indices)   

    # By default, assumes one hot vector input
    # If word_to_index, word_to_vector is provided, add an embedding layer
    X = X_input
    if bool(word_to_vector):
        # Word embeding
        # Output: (m, max_len, emb_dim)
        # Embedding layer
        embedding_layer = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)
        X = embedding_layer(X)      
        # Masking layer
        # skip zero vector words
        X = tfl.Masking(mask_value=0.)(X)
        
    else:
        X = tf.dtypes.cast(X, tf.float32)

    # LSTM layer
    # Output: a[1] (m, max_len, 128 hidden unit), batch of sequences
    X = tfl.LSTM(units = 128, return_sequences= True)(X)
    X = tfl.Dropout(rate = 0.5 )(X) 

    # LSTM layer
    # Output: a[2]<max_len> (m, 128 hidden unit)
    X = tfl.LSTM(units = 128)(X)
    X = tfl.Dropout(rate = 0.5)(X)  

    # Linear layer
    # Output: a[3] (m, # classes)
    X = tfl.Dense(units = n_class)(X)

    # Softmax layer
    # Output: y (m, # classes), probability of each class
    X = tfl.Activation('softmax')(X)

    # Model
    model = tf.keras.models.Model(inputs=X_input, outputs=X)

    return model

# Model wrapper functions

In [12]:
def run_multiple_NB(model_vars,train,dev,test,Yname='label'):

    results = []
    imodel = 0
    
    for var in model_vars:
        
        Xname = var
        
        model_name, y_pred, y_true, proba, classifier, counter = \
            simple_NB(train,dev,test,Xname=Xname,Yname=Yname)
        
        x = test[Xname].to_numpy()
        
        model_name = model_name+' '+Xname
        
        
        save_name = 'data/nb'+str(imodel)
        dump(classifier,save_name)
        save_name = 'data/nb_counter'+str(imodel)
        dump(counter,save_name)
        
        
        results.append([model_name, x, y_pred, y_true, proba, classifier, counter])
        
        imodel+=1
        
    return results

In [13]:
def run_multiple_NN(model_vars, train, dev, test, pos_to_index, word_to_index=False, word_to_vector=False, 
                        Yname='label', max_len=20, n_class=2, 
                        epochs = 20, batch_size = 32, patience=2, trainable=False):
    
    
    results = []

    for i in range(len(model_vars)):
            
        model_name = model_vars[i][0]
        func_model = model_vars[i][1]
        Xname = model_vars[i][2]
        use_embeddings = model_vars[i][3]
        
        model_name = model_name+' '+Xname
        
        w2i = False
        w2v = False
        X_shape = False
        
        if use_embeddings:
            w2i = word_to_index        
            w2v = word_to_vector
            X_shape = (max_len, )
        else:
            w2i = pos_to_index
            X_shape = (max_len, len(pos_to_index))

        _, _, X_train_indices, _, Y_train_oh = dataframe_to_arrays(train, w2i, max_len, Xname=Xname)
        _, _, X_dev_indices,   _, Y_dev_oh   = dataframe_to_arrays(dev, w2i, max_len, Xname=Xname)
        index, _, X_test_indices, Y_test, _  = dataframe_to_arrays(test, w2i, max_len, Xname=Xname)

        
        X = False
        if use_embeddings:
            X = [X_train_indices, X_dev_indices, X_test_indices]
        else:
            X = [to_categorical(X_train_indices, num_classes=len(pos_to_index)), 
                 to_categorical(X_dev_indices, num_classes=len(pos_to_index)), 
                 to_categorical(X_test_indices, num_classes=len(pos_to_index))]            

        
        Y = [Y_train_oh, Y_dev_oh, Y_test]

        
    
        print('should match all: ', X_shape, X[0].shape, X[1].shape, X[2].shape)
    
        model = func_model(X_shape, w2i, w2v, n_class, trainable)
        model.summary()
                    
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience)  
        
        history = False
        
        if patience :
            history = model.fit(X[0], Y[0], 
                                epochs = epochs, batch_size = batch_size, shuffle=True, 
                                validation_data=(X[1], Y[1]),
                                callbacks=[es])
        else:
            history = model.fit(X[0], Y[0], 
                                epochs = epochs, batch_size = batch_size, shuffle=True, 
                                validation_data=(X[1], Y[1]))
            
        proba = model.predict(X[2])
        y_pred = [np.argmax(proba[i]) for i in range(len(proba))]
        y_true = Y[2]
        x = X[2]
        
        save_name = 'data/nn'+str(i)
        model.save(save_name)
        save_name = 'data/nn_history'+str(i)
        dump(history,save_name)
        
        results.append([model_name, x, y_pred, y_true, proba, model, history])
    
    return results

# Run models

In [14]:
seq_type = ['Original','Lower','CleanedWords','PoS','MinimalWords']
ml_type = ['NaiveBayes','FNN','LSTM']
seq_type_abb = ['og','lo','cw','ps','mw']
ml_type_abb = ['nb','fnn','lstm']

n_ml = len(ml_type)
n_seq = len(seq_type)

title_vars = ['org_title','lower_title','cleaned_words','cleaned_pos','minimal_words' ]


nn_vars = []
for i in range(n_seq):
    title = title_vars[i]
    oh = True
    if title=='cleaned_pos':
        oh=False
    nn_vars.append(['FNN',model_FNN,title,oh])
        
for i in range(n_seq):
    title = title_vars[i]
    oh = True
    if title=='cleaned_pos':
        oh=False
    nn_vars.append(['LSTM',model_LSTM,title,oh])
    



In [15]:
# Naive Bayes
res_nb =  run_multiple_NB(title_vars,train,dev,test,Yname='label')
    

# Neural Networks
for i in range(len(nn_vars)):
    print(nn_vars[i])

res_nn = run_multiple_NN(nn_vars, train, dev, test,
                         pos_to_index, word_to_index, word_to_vector, 
                         Yname='label', max_len=max_len, n_class=2,
                         epochs = 50, batch_size = 32, patience=4, trainable=True)

# Add NB and NN
results = res_nb + res_nn

['FNN', <function model_FNN at 0x7fca4c4e3a60>, 'org_title', True]
['FNN', <function model_FNN at 0x7fca4c4e3a60>, 'lower_title', True]
['FNN', <function model_FNN at 0x7fca4c4e3a60>, 'cleaned_words', True]
['FNN', <function model_FNN at 0x7fca4c4e3a60>, 'cleaned_pos', False]
['FNN', <function model_FNN at 0x7fca4c4e3a60>, 'minimal_words', True]
['LSTM', <function model_LSTM at 0x7fca34427160>, 'org_title', True]
['LSTM', <function model_LSTM at 0x7fca34427160>, 'lower_title', True]
['LSTM', <function model_LSTM at 0x7fca34427160>, 'cleaned_words', True]
['LSTM', <function model_LSTM at 0x7fca34427160>, 'cleaned_pos', False]
['LSTM', <function model_LSTM at 0x7fca34427160>, 'minimal_words', True]
should match all:  (20,) (6000, 20) (2000, 20) (2000, 20)


2022-03-08 16:47:39.220851: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 20, 50)       20000050    ['input_1[0][0]']                
                                                                                                  
 masking (Masking)              (None, 20, 50)       0           ['embedding[0][0]']              
                                                                                                  
 tf.__operators__.getitem (Slic  (None, 50)          0           ['masking[0][0]']                
 ingOpLambda)                                                                                 

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 00008: early stopping


2022-03-08 16:53:09.386231: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: data/nn0/assets


  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


INFO:tensorflow:Assets written to: ram://3b43630d-a3d5-4b71-86e8-2fce7b64a95c/assets


  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


should match all:  (20,) (6000, 20) (2000, 20) (2000, 20)
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 20, 50)       20000050    ['input_2[0][0]']                
                                                                                                  
 masking_1 (Masking)            (None, 20, 50)       0           ['embedding_1[0][0]']            
                                                                                                  
 tf.__operators__.getitem_20 (S  (None, 50)          0           ['masking_1[0][0]']              
 licingOpLambda)                  

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 00008: early stopping
INFO:tensorflow:Assets written to: data/nn1/assets


  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


INFO:tensorflow:Assets written to: ram://b9581d9b-c79c-47c0-ad2e-7ff8637a8c36/assets


  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


should match all:  (20,) (6000, 20) (2000, 20) (2000, 20)
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 20, 50)       20000050    ['input_3[0][0]']                
                                                                                                  
 masking_2 (Masking)            (None, 20, 50)       0           ['embedding_2[0][0]']            
                                                                                                  
 tf.__operators__.getitem_40 (S  (None, 50)          0           ['masking_2[0][0]']              
 licingOpLambda)                  

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 00008: early stopping
INFO:tensorflow:Assets written to: data/nn2/assets


  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


INFO:tensorflow:Assets written to: ram://ff40599f-c8b8-42cd-8d7c-c731d73b6334/assets


  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


should match all:  (20, 39) (6000, 20, 39) (2000, 20, 39) (2000, 20, 39)
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 20, 39)]     0           []                               
                                                                                                  
 tf.cast (TFOpLambda)           (None, 20, 39)       0           ['input_4[0][0]']                
                                                                                                  
 masking_3 (Masking)            (None, 20, 39)       0           ['tf.cast[0][0]']                
                                                                                                  
 tf.__operators__.getitem_60 (S  (None, 39)          0           ['masking_3[0][0]']              
 licingOpLambda)   

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 00020: early stopping
INFO:tensorflow:Assets written to: data/nn3/assets


  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


INFO:tensorflow:Assets written to: ram://daf97fd8-4bca-42d6-b0b0-8430ee8090fa/assets
should match all:  (20,) (6000, 20) (2000, 20) (2000, 20)


  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 20, 50)       20000050    ['input_5[0][0]']                
                                                                                                  
 masking_4 (Masking)            (None, 20, 50)       0           ['embedding_3[0][0]']            
                                                                                                  
 tf.__operators__.getitem_80 (S  (None, 50)          0           ['masking_4[0][0]']              
 licingOpLambda)                                                                            

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 00009: early stopping
INFO:tensorflow:Assets written to: data/nn4/assets


  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


INFO:tensorflow:Assets written to: ram://7fd33e07-abfd-4f21-8593-789509ac4ff6/assets


  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


should match all:  (20,) (6000, 20) (2000, 20) (2000, 20)
Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 20)]              0         
                                                                 
 embedding_4 (Embedding)     (None, 20, 50)            20000050  
                                                                 
 masking_5 (Masking)         (None, 20, 50)            0         
                                                                 
 lstm (LSTM)                 (None, 20, 128)           91648     
                                                                 
 dropout_15 (Dropout)        (None, 20, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 



INFO:tensorflow:Assets written to: data/nn5/assets


INFO:tensorflow:Assets written to: data/nn5/assets


INFO:tensorflow:Assets written to: ram://962e7c9f-872a-4683-9b9d-4d1a53cfb828/assets


INFO:tensorflow:Assets written to: ram://962e7c9f-872a-4683-9b9d-4d1a53cfb828/assets


should match all:  (20,) (6000, 20) (2000, 20) (2000, 20)
Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 20)]              0         
                                                                 
 embedding_5 (Embedding)     (None, 20, 50)            20000050  
                                                                 
 masking_6 (Masking)         (None, 20, 50)            0         
                                                                 
 lstm_2 (LSTM)               (None, 20, 128)           91648     
                                                                 
 dropout_17 (Dropout)        (None, 20, 128)           0         
                                                                 
 lstm_3 (LSTM)               (None, 128)               131584    
                                                                 
 



INFO:tensorflow:Assets written to: data/nn6/assets


INFO:tensorflow:Assets written to: data/nn6/assets


INFO:tensorflow:Assets written to: ram://ea051902-060e-4d72-82ee-de440c15c96f/assets


INFO:tensorflow:Assets written to: ram://ea051902-060e-4d72-82ee-de440c15c96f/assets


should match all:  (20,) (6000, 20) (2000, 20) (2000, 20)
Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 20)]              0         
                                                                 
 embedding_6 (Embedding)     (None, 20, 50)            20000050  
                                                                 
 masking_7 (Masking)         (None, 20, 50)            0         
                                                                 
 lstm_4 (LSTM)               (None, 20, 128)           91648     
                                                                 
 dropout_19 (Dropout)        (None, 20, 128)           0         
                                                                 
 lstm_5 (LSTM)               (None, 128)               131584    
                                                                 
 



INFO:tensorflow:Assets written to: data/nn7/assets


INFO:tensorflow:Assets written to: data/nn7/assets


INFO:tensorflow:Assets written to: ram://3b1dad08-05e3-451e-9aa7-a453859fe073/assets


INFO:tensorflow:Assets written to: ram://3b1dad08-05e3-451e-9aa7-a453859fe073/assets


should match all:  (20, 39) (6000, 20, 39) (2000, 20, 39) (2000, 20, 39)
Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 20, 39)]          0         
                                                                 
 tf.cast_1 (TFOpLambda)      (None, 20, 39)            0         
                                                                 
 lstm_6 (LSTM)               (None, 20, 128)           86016     
                                                                 
 dropout_21 (Dropout)        (None, 20, 128)           0         
                                                                 
 lstm_7 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_22 (Dropout)        (None, 128)               0         
                                                    



INFO:tensorflow:Assets written to: data/nn8/assets


INFO:tensorflow:Assets written to: data/nn8/assets


INFO:tensorflow:Assets written to: ram://76e2ef36-d2ce-490d-8920-d59e117aa2b4/assets


INFO:tensorflow:Assets written to: ram://76e2ef36-d2ce-490d-8920-d59e117aa2b4/assets


should match all:  (20,) (6000, 20) (2000, 20) (2000, 20)
Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 20)]              0         
                                                                 
 embedding_7 (Embedding)     (None, 20, 50)            20000050  
                                                                 
 masking_8 (Masking)         (None, 20, 50)            0         
                                                                 
 lstm_8 (LSTM)               (None, 20, 128)           91648     
                                                                 
 dropout_23 (Dropout)        (None, 20, 128)           0         
                                                                 
 lstm_9 (LSTM)               (None, 128)               131584    
                                                                 
 



INFO:tensorflow:Assets written to: data/nn9/assets


INFO:tensorflow:Assets written to: data/nn9/assets


INFO:tensorflow:Assets written to: ram://d6cbdd36-a7b3-4478-a6a4-0f6af00d2aad/assets


INFO:tensorflow:Assets written to: ram://d6cbdd36-a7b3-4478-a6a4-0f6af00d2aad/assets


# Print result

In [16]:
def print_result(results):
    
    for result in results:
        
        model_name, x, y_pred, y_true, proba = result[:5]
        
        y_true = y_true.to_numpy()
        
        print(model_name)
        print('accuracy: ',accuracy_score(y_true, y_pred))
        print('precision: ',precision_score(y_true, y_pred))
        print('recall: ',recall_score(y_true, y_pred))
        print('f1: ',f1_score(y_true, y_pred))
        print('\n')

# Organize results into a dataframe

In [17]:
df_result = []

for i in range(len(results)):

    df = pd.DataFrame({
        'x': results[i][1].tolist(),
        'y_true': results[i][3].to_numpy().tolist(),
        'y_pred': results[i][2],
        'proba0': [results[i][4][j][0] for j in range(len(results[i][1])) ],
        'proba1': [results[i][4][j][1] for j in range(len(results[i][1])) ]
    }, index = results[i][3].index)
    df_result.append(df)

In [18]:
ml_type = ['NaiveBayes','FNN','LSTM']
seq_type_abb = ['og','lo','cw','ps','mw']
ml_type_abb = ['nb','fnn','lstm']

n_ml = len(ml_type)
n_seq = len(seq_type)

title_vars = ['org_title','lower_title','cleaned_words','cleaned_pos','minimal_words']

dict_name = []

for i in range(n_ml):
    for j in range(n_seq):
        
        y_pred = ml_type_abb[i]+'_'+seq_type_abb[j]
        proba0 = y_pred+'_p0'
        proba1 = y_pred+'_p1'
        temp_dict = {'y_pred':y_pred,'proba0':proba0,'proba1':proba1}
        
        if i==0:
            x = title_vars[j]
            temp_dict['x'] = x
        
        dict_name.append(temp_dict)

df = []

df.append(df_result[0].rename(columns = dict_name[0]))
for i in range(1,n_seq):
    df.append(df_result[i].drop(['y_true'], axis=1, inplace=False).rename(columns = dict_name[i]))

for i in range(n_seq, n_ml*n_seq):
    df.append(df_result[i].drop(['x','y_true'], axis=1, inplace=False).rename(columns = dict_name[i]))


In [19]:
df_x = df[0][title_vars[0]].to_frame()
for i in range(1,n_seq):
    df_x = pd.merge(df_x, df[i][title_vars[i]].to_frame(), left_index=True, right_index=True)
    #print(i,len(df_x))

df_y = df[0].y_true.to_frame() 
print('y',len(df_y))
    
df[0].drop(['y_true'], axis=1, inplace=True)
for i in range(0,n_seq):
    df[i].drop([title_vars[i]], axis=1, inplace=True)

df_p = df[0]    
for i in range(1,n_seq*n_ml):
    df_p = pd.merge(df_p,df[i], left_index=True, right_index=True)
    
df_ml = pd.merge(df_x, df_y, left_index=True, right_index=True)
df_ml = pd.merge(df_ml, df_p, left_index=True, right_index=True)

y 2000


In [20]:
cut_name, y_true_cut, y_pred_cut = simple_Manual(test)

In [21]:
dict_manual = {cut_name[i]:y_pred_cut[i] for i in range(len(cut_name))}

df_manual = pd.DataFrame(dict_manual)

#display(df_manual)

In [22]:
df_final = pd.merge(df_ml, df_manual, left_index=True, right_index=True)


In [23]:
#display(df_ml)
#display(df_final)
display(df_final.sample(10))

print(df_final.columns)

df_final.to_csv('data/model_compare3.csv',index=False)

Unnamed: 0,org_title,lower_title,cleaned_words,cleaned_pos,minimal_words,y_true,nb_og,nb_og_p0,nb_og_p1,nb_lo,...,lstm_mw_p0,lstm_mw_p1,too_long,noisy,clickbait,slang,donald,obama,hillary,bernie
6174,Report: Black Women Have Become The Most Educated Group In The U.S. (VIDEO),report: black women have become the most educated group in the u.s. (video),report : black women have become the most educated group in the u . s . _mytag_parentheses_,NN : NNP NNP VBP VBN DT NNP NNP NNP IN DT NNP NN,report black woman become educated group _mytag_parentheses_,1,1,6.999231e-07,0.999999,1,...,0.975635,0.024365,0,0,1,0,0,0,0,0
7606,FL Pastor Targets Neighbor With Nazi Flag Display Over Boating Laws (IMAGE),fl pastor targets neighbor with nazi flag display over boating laws (image),fl pastor targets neighbor with nazi flag display over boating laws _mytag_parentheses_,NNP NNP NNPS NNP IN NNP NNP NNP IN NNP NNP NN,pastor target neighbor nazi flag display boating law _mytag_parentheses_,1,1,0.001800566,0.998199,1,...,0.019302,0.980698,0,0,1,0,0,0,0,0
3041,Individual travel scrapped under Trump's new Cuba policy,individual travel scrapped under trump's new cuba policy,individual travel scrapped under trump s new cuba policy,NNP NN VBD IN NNP POS JJ NNP NN,individual travel scrap trump new cuba policy,0,0,0.9975361,0.002464,0,...,0.999903,9.7e-05,0,0,0,0,0,0,0,0
8361,THE VIEW WOMEN Go Off The Rails: Trump ‘has to step down before the inauguration’ [Video],the view women go off the rails: trump ‘has to step down before the inauguration’ [video],the view women go off the rails : trump has to step down before the inauguration _mytag_parentheses_,DT NNP NNP NNP IN DT NNS : NNP VBZ TO VB RP IN DT NN NN,view woman rail trump step inauguration _mytag_parentheses_,1,1,8.696425e-08,1.0,1,...,0.981112,0.018888,0,0,1,0,0,0,0,0
1585,New contender emerges to become Wall Street's top cop: sources,new contender emerges to become wall street's top cop: sources,new contender emerges to become wall street s top cop : sources,NNP NN VBZ TO VB NNP NNP POS JJ NN : NNS,new contender emerge become wall street top cop source,0,0,0.9499795,0.05002,0,...,0.999203,0.000797,0,0,0,0,0,0,0,0
3199,"Clinton criticizes Trump for remarks on security briefing, Putin","clinton criticizes trump for remarks on security briefing, putin",clinton criticizes trump for remarks on security briefing putin,NNP VBZ NNP IN NNS IN NN NN NNP,clinton criticize trump remark security briefing putin,0,0,0.9849583,0.015042,0,...,0.999876,0.000124,0,0,0,0,0,0,0,0
3007,"Tunisian labor union says Jerusalem decision a 'declaration of war', calls for protests","tunisian labor union says jerusalem decision a 'declaration of war', calls for protests",tunisian labor union says jerusalem decision a declaration of war calls for protests,JJ NN NN VBZ NNP NN DT NN IN NN VBZ IN NNS,tunisian labor union say jerusalem decision declaration war call protest,0,0,0.9999747,2.5e-05,0,...,0.99999,1e-05,0,0,0,0,0,0,0,0
4310,French police find more explosives after raid near Paris,french police find more explosives after raid near paris,french police find more explosives after raid near paris,JJ NN VB JJR NNS IN NN IN NNP,french police find explosive raid near paris,0,0,0.9992492,0.000751,0,...,0.999989,1.1e-05,0,0,0,0,0,0,0,0
3649,"""Make Republicans Whole Again!"" A divided party struggles to rally behind Trump","""make republicans whole again!"" a divided party struggles to rally behind trump",make republicans whole again ! a divided party struggles to rally behind trump,VBP NNPS NNP NN . NNP VBD NN NNS TO VB IN NN,make republican whole divide party struggle rally behind trump,0,1,0.2139497,0.78605,1,...,0.999448,0.000552,0,0,0,0,0,0,0,0
178,Three suspected al Qaeda militants killed in Yemen drone strike,three suspected al qaeda militants killed in yemen drone strike,three suspected al qaeda militants killed in yemen drone strike,CD VBN RB NNP NNS VBN IN NNP NN NN,three suspect qaeda militant kill yemen drone strike,0,0,0.9999892,1.1e-05,0,...,0.999884,0.000116,0,0,0,0,0,0,0,0


Index(['org_title', 'lower_title', 'cleaned_words', 'cleaned_pos',
       'minimal_words', 'y_true', 'nb_og', 'nb_og_p0', 'nb_og_p1', 'nb_lo',
       'nb_lo_p0', 'nb_lo_p1', 'nb_cw', 'nb_cw_p0', 'nb_cw_p1', 'nb_ps',
       'nb_ps_p0', 'nb_ps_p1', 'nb_mw', 'nb_mw_p0', 'nb_mw_p1', 'fnn_og',
       'fnn_og_p0', 'fnn_og_p1', 'fnn_lo', 'fnn_lo_p0', 'fnn_lo_p1', 'fnn_cw',
       'fnn_cw_p0', 'fnn_cw_p1', 'fnn_ps', 'fnn_ps_p0', 'fnn_ps_p1', 'fnn_mw',
       'fnn_mw_p0', 'fnn_mw_p1', 'lstm_og', 'lstm_og_p0', 'lstm_og_p1',
       'lstm_lo', 'lstm_lo_p0', 'lstm_lo_p1', 'lstm_cw', 'lstm_cw_p0',
       'lstm_cw_p1', 'lstm_ps', 'lstm_ps_p0', 'lstm_ps_p1', 'lstm_mw',
       'lstm_mw_p0', 'lstm_mw_p1', 'too_long', 'noisy', 'clickbait', 'slang',
       'donald', 'obama', 'hillary', 'bernie'],
      dtype='object')
