# Set constant

In [1]:
max_len = 20 # max sentence size

# Load libaries

In [2]:
from freq_utils import *

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score

from tensorflow.keras.utils import to_categorical

pd.options.display.max_colwidth = 200

# Load dataset

In [3]:
df0 = pd.read_csv('data/TrueOrganized.csv')
df1 = pd.read_csv('data/FakeOrganized.csv')
df0['label'] = 0
df1['label'] = 1

In [4]:
df = df0.sample(2)
display(df)

Unnamed: 0.1,Unnamed: 0,title,pos,cleaned_words,cleaned_pos,minimal_words,org_title,lower_title,label
18075,18075,"Iranian president defends nuclear deal, says Trump can not undermine it","[('Iranian', 'JJ'), ('president', 'NN'), ('defends', 'NNS'), ('nuclear', 'JJ'), ('deal', 'NN'), (',', ','), ('says', 'VBZ'), ('Trump', 'NNP'), ('can', 'MD'), ('not', 'RB'), ('undermine', 'VB'), ('...",iranian president defends nuclear deal says trump can not undermine it,JJ NN NNS JJ NN VBZ NNP MD RB VB PRP,iranian president defends nuclear deal say trump undermine,"Iranian president defends nuclear deal, says Trump can not undermine it","iranian president defends nuclear deal, says trump can not undermine it",0
14608,14608,"Zimbabwe's Mugabe, coup chief meet with smiles and handshakes","[('Zimbabwe', 'NNP'), (""'s"", 'POS'), ('Mugabe', 'NNP'), (',', ','), ('coup', 'NN'), ('chief', 'JJ'), ('meet', 'NN'), ('with', 'IN'), ('smiles', 'NNS'), ('and', 'CC'), ('handshakes', 'NNS')]",zimbabwe s mugabe coup chief meet with smiles and handshakes,NNP POS NNP NN JJ NN IN NNS CC NNS,zimbabwe mugabe coup chief meet smile handshake,"Zimbabwe's Mugabe, coup chief meet with smiles and handshakes","zimbabwe's mugabe, coup chief meet with smiles and handshakes",0


# Make dictionaries

### Pretrained word embeddings
- Word to index
- Word to vector

In [5]:
word_to_index, index_to_word, word_to_vector = get_pretrained_embedding()

### PoS tag encodings
- PoS word to index

In [6]:
df = pd.concat([df0.cleaned_pos, df1.cleaned_pos])

pos_set = set()
for x in list(df.str.lower().str.split()):
    pos_set.update(x)

pos_list = list(pos_set)
pos_to_index = { pos_list[i]: i for i in range(len(pos_list)) }

print(pos_to_index)

{'jjs': 0, 'vbn': 1, 'fw': 2, 'nnps': 3, 'ex': 4, 'dt': 5, 'cd': 6, 'wrb': 7, 'in': 8, 'vbg': 9, 'cc': 10, 'vbp': 11, 'rb': 12, 'prp': 13, 'jj': 14, 'uh': 15, 'pos': 16, 'wdt': 17, 'to': 18, 'wp$': 19, 'vbd': 20, 'pdt': 21, 'vbz': 22, 'nn': 23, 'vb': 24, 'jjr': 25, "''": 26, '.': 27, 'rp': 28, 'rbs': 29, 'rbr': 30, ':': 31, '$': 32, 'sym': 33, 'nnp': 34, 'nns': 35, 'wp': 36, 'md': 37, 'prp$': 38}


In [7]:
len(pos_to_index)

39

# Train/dev/test split

In [8]:
train, dev, test = train_dev_test_split([df0, df1], m=40000, class_column='label', 
                                    class_balance=True, r_dev=0.2, r_test=0.2, rand_state=42)

# Define models

In [9]:
def simple_NB(train,dev,test,Xname='title',Yname='label'):
    
    train = pd.concat([train,dev])
    
    X_train = train[Xname].tolist()
    Y_train = train[Yname].tolist()

    X_test = test[Xname].tolist()
    Y_test = test[Yname].tolist()
    
    counter = CountVectorizer()

    counter.fit(X_train+X_test)

    train_counts = counter.transform(X_train)
    test_counts = counter.transform(X_test)

    #print(counter.vocabulary_)

    classifier = MultinomialNB()
    classifier.fit(train_counts,Y_train)
    
    predict = classifier.predict(test_counts)
    
    proba = classifier.predict_proba(test_counts)
    
    model_name = 'Naive Bayse - '+Xname
    
    
    return model_name, predict, Y_test, proba, classifier, counter

In [10]:
def model_FNN(input_shape, word_to_index, word_to_vector, n_class=2, trainable=False):
    '''
    input_shape: (max_len,)
    word_to_index: word to index dictionary
    word_to_vector: word to embedding vector dictionary

    return model

    then
    X: Indices of a sentence (m, max_len)
    Y: Class probability, one hot vector (m, # classes)
    '''

    # Input layer
    # X_indices (m, max_len)
    X_input = tfl.Input(shape=input_shape, dtype='int32')
    
    X = X_input
    if bool(word_to_vector):
        # Word embeding
        # Output: (m, max_len, emb_dim)
        # Embedding layer
        embedding_layer = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)
        X = embedding_layer(X)      
    else:
        # One hot encoding (=input)
        # Output: (m, max_len, num_cat)
        X = tf.dtypes.cast(X, tf.float32)    

    # Masking layer
    # skip zero vector words
    X = tfl.Masking(mask_value=0.)(X)
    
    # Take average
    max_len = X.shape[1]
    # Make a list from slice
    X_avg = [ X[:,i,:] for i in range(max_len) ]
    # Take average of a sentence
    X = tf.keras.layers.Average()(X_avg)    
        
    # Linear+ReLu layer
    X = tfl.Dense(units = 128, activation='relu')(X)
    X = tfl.Dropout(rate = 0.4)(X)  

    # Linear+ReLu layer
    X = tfl.Dense(units = 64, activation='relu')(X)
    X = tfl.Dropout(rate = 0.4)(X) 

    # Linear+ReLu layer
    X = tfl.Dense(units = 32, activation='relu')(X)
    X = tfl.Dropout(rate = 0.2)(X) 

    # Linear+Softmax layer
    # Output: y (m, # classes), probability of each class
    X = tfl.Dense(units = n_class, activation='softmax')(X)

    # Model
    model = tf.keras.models.Model(inputs=X_input, outputs=X)

    return model

In [11]:
def model_LSTM(input_shape, word_to_index, word_to_vector=False, n_class=2, trainable=False):
    '''
    input_shape: (max_len,) or (max_len, num_cat)
    word_to_index: word to index dictionary, False for one hot encoding
    word_to_vector: word to embedding vector dictionary, False for one hot encoding

    return model

    then
    X: Indices of a sentence (m, max_len)
    Y: Class probability, one hot vector (m, # classes)
    '''

    # Input layer
    # X_oh (m, max_len, num_cat)
    # X_indices (m, max_len)
    X_input = tfl.Input(shape=input_shape, dtype='int32')

    # Embedding layer
    #embedding_layer = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)
    #X = embedding_layer(X_indices)   

    # By default, assumes one hot vector input
    # If word_to_index, word_to_vector is provided, add an embedding layer
    X = X_input
    if bool(word_to_vector):
        # Word embeding
        # Output: (m, max_len, emb_dim)
        # Embedding layer
        embedding_layer = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)
        X = embedding_layer(X)      
        # Masking layer
        # skip zero vector words
        X = tfl.Masking(mask_value=0.)(X)
        
    else:
        X = tf.dtypes.cast(X, tf.float32)

    # LSTM layer
    # Output: a[1] (m, max_len, 128 hidden unit), batch of sequences
    X = tfl.LSTM(units = 128, return_sequences= True)(X)
    X = tfl.Dropout(rate = 0.5 )(X) 

    # LSTM layer
    # Output: a[2]<max_len> (m, 128 hidden unit)
    X = tfl.LSTM(units = 128)(X)
    X = tfl.Dropout(rate = 0.5)(X)  

    # Linear layer
    # Output: a[3] (m, # classes)
    X = tfl.Dense(units = n_class)(X)

    # Softmax layer
    # Output: y (m, # classes), probability of each class
    X = tfl.Activation('softmax')(X)

    # Model
    model = tf.keras.models.Model(inputs=X_input, outputs=X)

    return model

# Model wrapper functions

In [12]:
def run_multiple_NB(model_vars,train,dev,test,Yname='label'):

    results = []

    for var in model_vars:
        
        Xname = var
        
        model_name, y_pred, y_true, proba, classifier, counter = \
            simple_NB(train,dev,test,Xname=Xname,Yname=Yname)
        
        x = test[Xname].to_numpy()
        
        model_name = model_name+' '+Xname
        
        results.append([model_name, x, y_pred, y_true, proba, classifier, counter])
        
    return results

In [13]:
def run_multiple_NN(model_vars, train, dev, test, pos_to_index, word_to_index=False, word_to_vector=False, 
                        Yname='label', max_len=20, n_class=2, 
                        epochs = 20, batch_size = 32, patience=2, trainable=False):
    
    
    results = []

    for i in range(len(model_vars)):
            
        model_name = model_vars[i][0]
        func_model = model_vars[i][1]
        Xname = model_vars[i][2]
        use_embeddings = model_vars[i][3]
        
        model_name = model_name+' '+Xname
        
        w2i = False
        w2v = False
        X_shape = False
        
        if use_embeddings:
            w2i = word_to_index        
            w2v = word_to_vector
            X_shape = (max_len, )
        else:
            w2i = pos_to_index
            X_shape = (max_len, len(pos_to_index))

        _, _, X_train_indices, _, Y_train_oh = dataframe_to_arrays(train, w2i, max_len, Xname=Xname)
        _, _, X_dev_indices,   _, Y_dev_oh   = dataframe_to_arrays(dev, w2i, max_len, Xname=Xname)
        index, _, X_test_indices, Y_test, _  = dataframe_to_arrays(test, w2i, max_len, Xname=Xname)

        
        X = False
        if use_embeddings:
            X = [X_train_indices, X_dev_indices, X_test_indices]
        else:
            X = [to_categorical(X_train_indices, num_classes=len(pos_to_index)), 
                 to_categorical(X_dev_indices, num_classes=len(pos_to_index)), 
                 to_categorical(X_test_indices, num_classes=len(pos_to_index))]            

        
        Y = [Y_train_oh, Y_dev_oh, Y_test]

        
    
        print('should match all: ', X_shape, X[0].shape, X[1].shape, X[2].shape)
    
        model = func_model(X_shape, w2i, w2v, n_class, trainable)
        model.summary()
                    
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience)  
        
        history = False
        
        if patience :
            history = model.fit(X[0], Y[0], 
                                epochs = epochs, batch_size = batch_size, shuffle=True, 
                                validation_data=(X[1], Y[1]),
                                callbacks=[es])
        else:
            history = model.fit(X[0], Y[0], 
                                epochs = epochs, batch_size = batch_size, shuffle=True, 
                                validation_data=(X[1], Y[1]))
            
        proba = model.predict(X[2])
        y_pred = [np.argmax(proba[i]) for i in range(len(proba))]
        y_true = Y[2]
        x = X[2]
        
        results.append([model_name, x, y_pred, y_true, proba, model, history])
    
    return results

# Run models

In [14]:
seq_type = ['Original','Lower','CleanedWords','PoS','MinimalWords']
ml_type = ['NaiveBayse','FNN','LSTM']
seq_type_abb = ['og','lo','cw','ps','mw']
ml_type_abb = ['nb','fnn','lstm']

n_ml = len(ml_type)
n_seq = len(seq_type)

title_vars = ['org_title','lower_title','cleaned_words','cleaned_pos','minimal_words' ]


nn_vars = []
for i in range(n_seq):
    title = title_vars[i]
    oh = True
    if title=='cleaned_pos':
        oh=False
    nn_vars.append(['FNN',model_FNN,title,oh])
        
for i in range(n_seq):
    title = title_vars[i]
    oh = True
    if title=='cleaned_pos':
        oh=False
    nn_vars.append(['LSTM',model_LSTM,title,oh])
    



In [None]:
# Naive Bayes
res_nb =  run_multiple_NB(title_vars,train,dev,test,Yname='label')
    

# Neural Networks

res_nn = run_multiple_NN(nn_vars, train, dev, test,
                         pos_to_index, word_to_index, word_to_vector, 
                         Yname='label', max_len=max_len, n_class=2,
                         epochs = 50, batch_size = 32, patience=4, trainable=True)

# Add NB and NN
results = res_nb + res_nn

should match all:  (20,) (24000, 20) (8000, 20) (8000, 20)


2022-02-19 23:07:26.125316: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 20, 50)       20000050    ['input_1[0][0]']                
                                                                                                  
 masking (Masking)              (None, 20, 50)       0           ['embedding[0][0]']              
                                                                                                  
 tf.__operators__.getitem (Slic  (None, 50)          0           ['masking[0][0]']                
 ingOpLambda)                                                                                 

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 00007: early stopping
should match all:  (20,) (24000, 20) (8000, 20) (8000, 20)
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 20, 50)       20000050    ['input_2[0][0]']                
                                                                                                  
 masking_1 (Masking)            (None, 20, 50)       0           ['embedding_1[0][0]']            
                                                                                                  
 tf.__operators__.getitem_20 (S  (None

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 00007: early stopping
should match all:  (20,) (24000, 20) (8000, 20) (8000, 20)
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 20, 50)       20000050    ['input_3[0][0]']                
                                                                                                  
 masking_2 (Masking)            (None, 20, 50)       0           ['embedding_2[0][0]']            
                                                                                                  
 tf.__operators__.getitem_40 (S  (None

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 00007: early stopping
should match all:  (20, 39) (24000, 20, 39) (8000, 20, 39) (8000, 20, 39)
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 20, 39)]     0           []                               
                                                                                                  
 tf.cast (TFOpLambda)           (None, 20, 39)       0           ['input_4[0][0]']                
                                                                                                  
 masking_3 (Masking)            (None, 20, 39)       0           ['tf.cast[0][0]']                
                                                                                                  
 tf.__operators__.getit

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 00014: early stopping
should match all:  (20,) (24000, 20) (8000, 20) (8000, 20)
Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 20, 50)       20000050    ['input_5[0][0]']                
                                                                                                  
 masking_4 (Masking)            (None, 20, 50)       0           ['embedding_3[0][0]']            
                                                       

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 00007: early stopping
should match all:  (20,) (24000, 20) (8000, 20) (8000, 20)
Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 20)]              0         
                                                                 
 embedding_4 (Embedding)     (None, 20, 50)            20000050  
                                                                 
 masking_5 (Masking)         (None, 20, 50)            0         
                                                                 
 lstm (LSTM)                 (None, 20, 128)           91648     
                                                                 
 dropout_15 (Dropout)        (None, 20, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 12

# Print result

In [None]:
def print_result(results):
    
    for result in results:
        
        model_name, x, y_pred, y_true, proba = result[:5]
        
        print(model_name)
        print('accuracy: ',accuracy_score(y_true, y_pred))
        print('precision: ',precision_score(y_true, y_pred))
        print('recall: ',recall_score(y_true, y_pred))
        print('f1: ',f1_score(y_true, y_pred))
        print('\n')

In [None]:
print_result(results)

#print(len(res_nn))

# Organize results into a dataframe

In [None]:
df_result = []

for i in range(len(results)):

    df = pd.DataFrame({
        'x': results[i][1].tolist(),
        'y_true': results[i][3],
        'y_pred': results[i][2],
        'proba0': [results[i][4][j][0] for j in range(len(results[i][1])) ],
        'proba1': [results[i][4][j][1] for j in range(len(results[i][1])) ]
    })
    df_result.append(df)

In [None]:
ml_type = ['NaiveBayse','FNN','LSTM']
seq_type_abb = ['og','lo','cw','ps','mw']
ml_type_abb = ['nb','fnn','lstm']

n_ml = len(ml_type)
n_seq = len(seq_type)

title_vars = ['org_title','lower_title','cleaned_words','cleaned_pos','minimal_words']

dict_name = []

for i in range(n_ml):
    for j in range(n_seq):
        
        y_pred = ml_type_abb[i]+'_'+seq_type_abb[j]
        proba0 = y_pred+'_p0'
        proba1 = y_pred+'_p1'
        temp_dict = {'y_pred':y_pred,'proba0':proba0,'proba1':proba1}
        
        if i==0:
            x = title_vars[j]
            temp_dict['x'] = x
        
        dict_name.append(temp_dict)

df = []

df.append(df_result[0].rename(columns = dict_name[0]))
for i in range(1,n_seq):
    df.append(df_result[i].drop(['y_true'], axis=1, inplace=False).rename(columns = dict_name[i]))

for i in range(n_seq, n_ml*n_seq):
    df.append(df_result[i].drop(['x','y_true'], axis=1, inplace=False).rename(columns = dict_name[i]))


In [None]:
df_x = df[0][title_vars[0]].to_frame()
for i in range(1,n_seq):
    df_x = pd.merge(df_x, df[i][title_vars[i]].to_frame(), left_index=True, right_index=True)

df_y = df[0].y_true.to_frame()    
    
df[0].drop(['y_true'], axis=1, inplace=True)
for i in range(0,n_seq):
    df[i].drop([title_vars[i]], axis=1, inplace=True)

df_p = df[0]    
for i in range(1,n_seq*n_ml):
    df_p = pd.merge(df_p,df[i], left_index=True, right_index=True)
    
df_final = pd.merge(df_x, df_y, left_index=True, right_index=True)
df_final = pd.merge(df_final, df_p, left_index=True, right_index=True)


In [None]:
display(df_final.sample(2))

print(df_final.columns)

In [None]:
df_final.to_csv('data/model_compare2.csv',index=False)