# Set constant

In [1]:
max_len = 20 # max sentence size: sequence length passed to run_multiple_NN in the "Run models" cell

# Load libraries

In [2]:
from freq_utils import *

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score

from tensorflow.keras.utils import to_categorical

# Let pandas show up to 200 characters per column when rendering a DataFrame.
# NOTE(review): `pd` is not imported explicitly in this cell — presumably it is
# provided by the star import from freq_utils; confirm.
pd.options.display.max_colwidth = 200

# Load dataset

In [3]:
# Read the two headline tables and tag every row with its class id:
# 0 for rows from TrueOrganized.csv, 1 for rows from FakeOrganized.csv.
df0 = pd.read_csv('data/TrueOrganized.csv').assign(label=0)
df1 = pd.read_csv('data/FakeOrganized.csv').assign(label=1)

In [4]:
# Peek at two randomly chosen rows of the label-0 table.
# No random_state is passed, so the rows shown differ from run to run.
df = df0.sample(2)
display(df)

Unnamed: 0.1,Unnamed: 0,title,pos,cleaned_words,cleaned_pos,minimal_words,org_title,label
13007,13007,Greek top court to decide Dec 13 on Russia cyber suspect extradition,"[('Greek', 'JJ'), ('top', 'JJ'), ('court', 'NN'), ('to', 'TO'), ('decide', 'VB'), ('Dec', 'NNP'), ('13', 'CD'), ('on', 'IN'), ('Russia', 'NNP'), ('cyber', 'NN'), ('suspect', 'NN'), ('extradition',...",greek top court to decide dec 13 on russia cyber suspect extradition,JJ JJ NN TO VB NNP CD IN NNP NN NN NN,greek top court decide dec russia cyber suspect extradition,Greek top court to decide Dec. 13 on Russia cyber suspect extradition,0
4245,4245,Trump administration issues final rule on stricter Obamacare enrollment,"[('Trump', 'NNP'), ('administration', 'NN'), ('issues', 'NNS'), ('final', 'JJ'), ('rule', 'NN'), ('on', 'IN'), ('stricter', 'NN'), ('Obamacare', 'NNP'), ('enrollment', 'NN')]",trump administration issues final rule on stricter obamacare enrollment,NNP NN NNS JJ NN IN NN NNP NN,trump administration issue final rule stricter obamacare enrollment,Trump administration issues final rule on stricter Obamacare enrollment,0


# Make dictionaries

### Pretrained word embeddings
- Word to index
- Word to vector

In [6]:
# get_pretrained_embedding is not defined in this notebook (presumably from freq_utils — confirm).
# word_to_index and word_to_vector are later handed to run_multiple_NN.
word_to_index, index_to_word, word_to_vector = get_pretrained_embedding()

### PoS tag encodings
- PoS word to index

In [None]:
# All PoS tag strings from both datasets, one space-separated string per title.
df = pd.concat([df0.cleaned_pos, df1.cleaned_pos])

# Collect every distinct lower-cased tag; rows with a missing tag string are skipped.
pos_set = set()
for tags in df.dropna().str.lower().str.split():
    pos_set.update(tags)

# Sort before numbering: iteration order of a set of strings changes between
# interpreter runs (hash randomization), which would give the tags different
# indices -- and the one-hot inputs a different layout -- after every kernel restart.
pos_list = sorted(pos_set)
pos_to_index = {tag: i for i, tag in enumerate(pos_list)}

# NOTE(review): indices start at 0. If dataframe_to_arrays pads short sequences
# with 0, padding is indistinguishable from the first tag -- confirm.
print(pos_to_index)

In [None]:
# Number of distinct PoS tags; run_multiple_NN uses this as the one-hot width (num_classes).
len(pos_to_index)

# Train/dev/test split

In [None]:
# Split the two class tables into train / dev / test frames.
# train_dev_test_split is not defined in this notebook (presumably from freq_utils).
# NOTE(review): parameter meanings are assumed from their names -- m looks like a
# sample size, r_dev / r_test like the dev and test fractions, rand_state like a
# seed; confirm against the helper.
train, dev, test = train_dev_test_split([df0, df1], m=1000, class_column='label', 
                                    class_balance=True, r_dev=0.2, r_test=0.2, rand_state=42)

# Define models

In [None]:
def simple_NB(train,dev,test,Xname='title',Yname='label'):
    '''
    Bag-of-words Multinomial Naive Bayes baseline.

    train, dev, test: DataFrames with a text column Xname and a label column Yname
    Xname: name of the text column that is vectorized with CountVectorizer
    Yname: name of the label column

    dev is merged into the training data (it is not used for validation here).

    return model_name, predict, Y_test, proba, classifier, counter
        model_name: 'Naive Bayse - ' + Xname
        predict: predicted labels for test
        Y_test: true labels of test, as a list
        proba: class probabilities for test from predict_proba
        classifier: fitted MultinomialNB
        counter: fitted CountVectorizer
    '''

    fit_df = pd.concat([train, dev])

    X_train = fit_df[Xname].tolist()
    Y_train = fit_df[Yname].tolist()

    X_test = test[Xname].tolist()
    Y_test = test[Yname].tolist()

    # Learn the vocabulary from the training text only. Including the test text
    # would leak test information into the model: words seen only in test would
    # become features and change the smoothing denominators of every class.
    # Test words outside the vocabulary are simply ignored by transform().
    counter = CountVectorizer()
    train_counts = counter.fit_transform(X_train)
    test_counts = counter.transform(X_test)

    classifier = MultinomialNB()
    classifier.fit(train_counts, Y_train)

    predict = classifier.predict(test_counts)
    proba = classifier.predict_proba(test_counts)

    # Spelling of the label is kept unchanged because it is part of the returned value.
    model_name = 'Naive Bayse - '+Xname

    return model_name, predict, Y_test, proba, classifier, counter

In [None]:
def model_FNN(input_shape, word_to_index, word_to_vector, n_class=2, trainable=False):
    '''
    Feed-forward classifier on the mean word embedding of a sentence.

    input_shape: (max_len,)
    word_to_index: word to index dictionary
    word_to_vector: word to embedding vector dictionary
    n_class: number of output classes
    trainable: forwarded to pretrained_embedding_layer

    return model (not compiled)

    then
    X: Indices of a sentence (m, max_len)
    Y: Class probability, one hot vector (m, # classes)
    '''

    # Input layer
    # X_indices (m, max_len)
    X_input = tfl.Input(shape=input_shape, dtype='int32')

    # Embedding layer
    # Output: (m, max_len, emb_dim)
    embedding_layer = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)
    X = embedding_layer(X_input)

    # Sentence vector: mean of the word vectors, skipping zero-vector words.
    # The mean is taken over the word axis (axis=1) so the Dense stack receives
    # one (emb_dim,) vector per sentence; averaging over the embedding axis
    # would reduce every word to a single scalar.
    def _mean_word_vector(seq):
        # 1.0 where a position holds a non-zero embedding, 0.0 where it is all zeros
        is_word = tf.cast(tf.reduce_any(tf.not_equal(seq, 0.), axis=-1, keepdims=True), seq.dtype)
        # Clamp to 1 so a sentence with no non-zero vector gives zeros instead of NaN
        n_words = tf.maximum(tf.reduce_sum(is_word, axis=1), 1.)
        # Zero vectors add nothing to the sum, so only the divisor needs the mask
        return tf.reduce_sum(seq, axis=1) / n_words

    # Output: (m, emb_dim)
    X = tfl.Lambda(_mean_word_vector)(X)

    # Linear+ReLu layer
    X = tfl.Dense(units = 128, activation='relu')(X)
    X = tfl.Dropout(rate = 0.4)(X)

    # Linear+ReLu layer
    X = tfl.Dense(units = 64, activation='relu')(X)
    X = tfl.Dropout(rate = 0.4)(X)

    # Linear+ReLu layer
    X = tfl.Dense(units = 32, activation='relu')(X)
    X = tfl.Dropout(rate = 0.2)(X)

    # Linear+Softmax layer
    # Output: y (m, # classes), probability of each class
    X = tfl.Dense(units = n_class, activation='softmax')(X)

    # Model
    model = tf.keras.models.Model(inputs=X_input, outputs=X)

    return model

In [None]:
def model_LSTM(input_shape, word_to_index=False, word_to_vector=False, n_class=2, trainable=False):
    '''
    Two stacked LSTM layers followed by a softmax classifier.

    input_shape: (max_len,) for word indices, (max_len, num_cat) for one hot input
    word_to_index: word to index dictionary, False for one hot encoding
    word_to_vector: word to embedding vector dictionary, False for one hot encoding
    n_class: number of output classes
    trainable: forwarded to pretrained_embedding_layer

    return model (not compiled)

    then
    X: Indices of a sentence (m, max_len), or one hot vectors (m, max_len, num_cat)
    Y: Class probability, one hot vector (m, # classes)
    '''

    # Input: X_indices (m, max_len) or X_oh (m, max_len, num_cat)
    sequence_in = tfl.Input(shape=input_shape, dtype='int32')

    if word_to_index:
        # Word embedding path: indices -> vectors, (m, max_len, emb_dim)
        embed = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)
        embedded = embed(sequence_in)
        # Mask time steps whose embedding vector is all zeros
        features = tfl.Masking(mask_value=0.)(embedded)
    else:
        # One hot path: the input itself is the feature sequence, as float32
        features = tf.dtypes.cast(sequence_in, tf.float32)

    # First LSTM keeps the full sequence: (m, max_len, 128)
    hidden = tfl.LSTM(units=128, return_sequences=True)(features)
    hidden = tfl.Dropout(rate=0.5)(hidden)

    # Second LSTM keeps only its final output: (m, 128)
    hidden = tfl.LSTM(units=128)(hidden)
    hidden = tfl.Dropout(rate=0.5)(hidden)

    # Linear layer then softmax: (m, # classes) class probabilities
    logits = tfl.Dense(units=n_class)(hidden)
    class_proba = tfl.Activation('softmax')(logits)

    return tf.keras.models.Model(inputs=sequence_in, outputs=class_proba)

# Model wrapper functions

In [None]:
def run_multiple_NB(model_vars,train,dev,test,Yname='label'):
    '''
    Fit one simple_NB model per text column.

    model_vars: iterable of column names; each one is passed to simple_NB as Xname
    train, dev, test: DataFrames forwarded unchanged to simple_NB
    Yname: label column name forwarded to simple_NB

    return list with one [model_name, y_pred, y_true, proba, classifier, counter]
           entry per column, in the order of model_vars
    '''
    return [
        list(simple_NB(train, dev, test, Xname=column, Yname=Yname))
        for column in model_vars
    ]

In [None]:
def run_multiple_NN(model_vars, train, dev, test, pos_to_index, word_to_index=False, word_to_vector=False, 
                        Yname='label', max_len=20, n_class=2, 
                        epochs = 20, batch_size = 32, patience=2, trainable=False):
    '''
    Build, train and evaluate every neural model listed in model_vars.

    model_vars: list of [model_name, func_model, Xname, use_embeddings]
        func_model is called as func_model(X_shape, w2i, w2v, n_class, trainable)
        use_embeddings: truthy -> word index input plus the embedding dictionaries,
                        falsy  -> one hot PoS input and False for both dictionaries
    train, dev, test: DataFrames converted with dataframe_to_arrays
    pos_to_index: PoS tag to index dictionary; its length is the one hot width
    word_to_index, word_to_vector: embedding dictionaries
    max_len: sequence length passed to dataframe_to_arrays
    epochs, batch_size: passed to model.fit
    patience: EarlyStopping patience on val_loss; falsy disables early stopping

    NOTE(review): the Xname entry of model_vars and the Yname argument are never
    read by this function; which columns are used is decided inside
    dataframe_to_arrays -- confirm that is intended.

    return list with one [model_name, y_pred, y_true, proba, model, history] per model
    '''
    splits = (train, dev, test)

    # dataframe_to_arrays returns five values; item 2 is used as the index array,
    # item 3 as the test labels and item 4 as the one hot labels for train / dev.
    word_arrays = [dataframe_to_arrays(part, word_to_index, max_len) for part in splits]
    pos_arrays = [dataframe_to_arrays(part, pos_to_index, max_len) for part in splits]

    n_pos = len(pos_to_index)
    X_emb = [arrays[2] for arrays in word_arrays]
    X_ohv = [to_categorical(arrays[2], num_classes=n_pos) for arrays in pos_arrays]
    Y = [word_arrays[0][4], word_arrays[1][4], word_arrays[2][3]]

    results = []

    for spec in model_vars:
        model_name, func_model, _, use_embeddings = spec[:4]

        # Pick the inputs that match the model variant
        if use_embeddings:
            X, w2i, w2v, X_shape = X_emb, word_to_index, word_to_vector, (max_len, )
        else:
            X, w2i, w2v, X_shape = X_ohv, False, False, (max_len, n_pos)

        print('should match all: ', X_shape, X[0].shape, X[1].shape, X[2].shape)

        model = func_model(X_shape, w2i, w2v, n_class, trainable)
        model.summary()

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Train on the train split, validate on dev; early stopping only when patience is set
        fit_options = dict(epochs=epochs, batch_size=batch_size, shuffle=True,
                           validation_data=(X[1], Y[1]))
        if patience:
            fit_options['callbacks'] = [
                tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min',
                                                 verbose=1, patience=patience)
            ]
        history = model.fit(X[0], Y[0], **fit_options)

        # Evaluate on the test split: most probable class per row
        proba = model.predict(X[2])
        y_pred = [np.argmax(row) for row in proba]

        results.append([model_name, y_pred, Y[2], proba, model, history])

    return results

# Run models

In [None]:
# Naive Bayes: one model per text column.
# Kept in its own name so the column list is not confused with the NN specs below.
nb_vars = ['cleaned_words', 'minimal_words', 'cleaned_pos']
res_nb = run_multiple_NB(nb_vars, train, dev, test, Yname='label')

# NN: each entry is [model name, model builder, Xname, use word embeddings?]
nn_vars = [
    # ['LSTM clean words', model_LSTM, 'cleaned_words', True],  # currently disabled
    ['LSTM clean pos', model_LSTM, 'cleaned_pos', False],
    ['FNN clean words', model_FNN, 'cleaned_words', True],
]

res_nn = run_multiple_NN(nn_vars, train, dev, test,
                         pos_to_index, word_to_index, word_to_vector,
                         Yname='label', max_len=max_len, n_class=2,
                         epochs=20, batch_size=32, patience=2, trainable=False)

# Add NB and NN
results = res_nb + res_nn


# Print result

In [None]:
def print_result(results):
    '''
    Print accuracy, precision, recall and F1 for each entry of results.

    results: list of entries starting with model_name, y_pred, y_true, proba
             (proba is not used here)
    '''
    for entry in results:
        model_name, y_pred, y_true, _ = entry[:4]

        print(model_name)

        # (label, scoring function) pairs, printed in this order
        scorers = (
            ('accuracy: ', accuracy_score),
            ('precision: ', precision_score),
            ('recall: ', recall_score),
            ('f1: ', f1_score),
        )
        for label, scorer in scorers:
            print(label, scorer(y_true, y_pred))

        print('\n')

In [None]:
# Report accuracy / precision / recall / F1 for every model in `results`
# (the Naive Bayes entries come first, then the neural networks).
print_result(results)

#print(len(res_nn))