In [16]:
max_len = 20

In [20]:
from freq_utils import *

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score

pd.options.display.max_colwidth = 200

In [30]:
df0 = pd.read_csv('data/TrueOrganized.csv')
df1 = pd.read_csv('data/FakeOrganized.csv')
df0['label'] = 0
df1['label'] = 1

In [5]:
df = df0.sample(1)
display(df)

Unnamed: 0.1,Unnamed: 0,title,pos,cleaned_words,cleaned_pos,minimal_words,org_title,label
14553,14553,"Putin, in decree, says Russia's armed forces are 1.9 million-strong","[('Putin', 'NNP'), (',', ','), ('in', 'IN'), ('decree', 'NN'), (',', ','), ('says', 'VBZ'), ('Russia', 'NNP'), (""'s"", 'POS'), ('armed', 'JJ'), ('forces', 'NNS'), ('are', 'VBP'), ('1.9', 'CD'), ('m...",putin in decree says russia s armed forces are 1 . 9 million strong,NNP IN NN VBZ NNP POS JJ NNS VBP CD JJ,putin decree say russia armed force million strong,"Putin, in decree, says Russia's armed forces are 1.9 million-strong",True


In [39]:
train, dev, test = train_dev_test_split([df0, df1], m=1000, class_column='label', 
                                    class_balance=True, r_dev=0.2, r_test=0.2, rand_state=42)

In [32]:
def simple_NB(train,dev,test,Xname='title',Yname='label'):
    
    train = pd.concat([train,dev])
    
    X_train = train[Xname].tolist()
    Y_train = train[Yname].tolist()

    X_test = test[Xname].tolist()
    Y_test = test[Yname].tolist()
    
    counter = CountVectorizer()

    counter.fit(X_train+X_test)

    train_counts = counter.transform(X_train)
    test_counts = counter.transform(X_test)

    #print(counter.vocabulary_)

    classifier = MultinomialNB()
    classifier.fit(train_counts,Y_train)
    
    predict = classifier.predict(test_counts)
    
    proba = classifier.predict_proba(test_counts)
    
    return predict, Y_test, proba, classifier, counter

In [56]:
def model_FNN(input_shape, word_to_vec_map, word_to_index, n_class=2, trainable=False):
    '''
    input_shape: (max_len,)
    word_to_vec_map: word to embedding vector dictionary
    word_to_index: word to index dictionary

    return model

    then
    X: Indices of a sentence (m, max_len)
    Y: Class probability, one hot vector (m, # classes)
    '''

    # Input layer
    X_indices = tfl.Input(shape=input_shape, dtype='int32')

    # Embedding layer
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index, trainable=trainable)
    X = embedding_layer(X_indices)   

    # Take average
    # Get embedding vector dimension
    emb_dim = X.shape[2]
    # Make a list from slice
    X_avg = [ X[:,:,i] for i in range(emb_dim) ]
    # Take average of embedding vector
    X = tf.keras.layers.Average()(X_avg)

    # Masking layer
    # skip zero vector words
    X = tfl.Masking(mask_value=0.)(X)

    # Linear+ReLu layer
    X = tfl.Dense(units = 128, activation='relu')(X)
    X = tfl.Dropout(rate = 0.4)(X)  

    # Linear+ReLu layer
    X = tfl.Dense(units = 64, activation='relu')(X)
    X = tfl.Dropout(rate = 0.4)(X) 

    # Linear+ReLu layer
    X = tfl.Dense(units = 32, activation='relu')(X)
    X = tfl.Dropout(rate = 0.2)(X) 

    # Linear+Softmax layer
    # Output: y (m, # classes), probability of each class
    X = tfl.Dense(units = n_class, activation='softmax')(X)

    # Model
    model = tf.keras.models.Model(inputs=X_indices, outputs=X)

    return model

In [57]:
def model_LSTM(input_shape, word_to_vec_map, word_to_index, n_class=2, trainable=False):
    '''
    input_shape: (max_len,)
    word_to_vec_map: word to embedding vector dictionary
    word_to_index: word to index dictionary

    return model

    then
    X: Indices of a sentence (m, max_len)
    Y: Class probability, one hot vector (m, # classes)
    '''

    # Input layer
    X_indices = tfl.Input(shape=input_shape, dtype='int32')

    # Embedding layer
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index, trainable=trainable)
    X = embedding_layer(X_indices)   

    # Masking layer
    # skip zero vector words
    X = tfl.Masking(mask_value=0.)(X)

    # LSTM layer
    # Output: a[1] (m, max_len, 128 hidden unit), batch of sequences
    X = tfl.LSTM(units = 128, return_sequences= True)(X)
    X = tfl.Dropout(rate = 0.5 )(X) 

    # LSTM layer
    # Output: a[2]<max_len> (m, 128 hidden unit)
    X = tfl.LSTM(units = 128)(X)
    X = tfl.Dropout(rate = 0.5)(X)  

    # Linear layer
    # Output: a[3] (m, # classes)
    X = tfl.Dense(units = n_class)(X)

    # Softmax layer
    # Output: y (m, # classes), probability of each class
    X = tfl.Activation('softmax')(X)

    # Model
    model = tf.keras.models.Model(inputs=X_indices, outputs=X)

    return model

In [58]:
def run_language_model(model, train, dev, test, Xname='org_title',Yname='label', max_len=20, n_class=2, \
                                                    epochs = 20, batch_size = 32, \
                                                    patience=2, trainable=False):
      
    model = model((max_len,), word_to_vec_map, word_to_index, n_class, trainable)
    model.summary()
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience)
    
    history = False
    
    if patience :
        history = model.fit(X_train_indices, Y_train_oh, 
                            epochs = epochs, batch_size = batch_size, shuffle=True, 
                            validation_data=(X_dev_indices, Y_dev_oh),
                            callbacks=[es])
    else:
        history = model.fit(X_train_indices, Y_train_oh, 
                            epochs = epochs, batch_size = batch_size, shuffle=True, 
                            validation_data=(X_dev_indices, Y_dev_oh))
        
        
    proba = model.predict(X_test_indices)
    predict = [np.argmax(proba[i]) for i in range(len(proba))]
    
    return predict, Y_test, proba, model, history

In [40]:
word_to_index, index_to_word, word_to_vec_map = get_pretrained_embedding()

_, X_train, X_train_indices, Y_train, Y_train_oh = dataframe_to_arrays(train, word_to_index, max_len)
_, X_dev,   X_dev_indices,   Y_dev,   Y_dev_oh   = dataframe_to_arrays(dev, word_to_index, max_len)
indx_test, X_test,  X_test_indices,  Y_test,  Y_test_oh  = dataframe_to_arrays(test, word_to_index, max_len)

In [41]:
res = simple_NB(train,dev,test,Xname='title',Yname='label')

In [42]:
test_labels = res[1]
predict = res[0]

#print(res)

print(accuracy_score(test_labels, predict))
print(precision_score(test_labels, predict))
print(recall_score(test_labels, predict))
print(f1_score(test_labels, predict))

0.92
0.9381443298969072
0.900990099009901
0.9191919191919191


In [None]:
res = run_language_model(model_LSTM, train, dev, test, Xname='cleaned_words',Yname='label', \
                         max_len=max_len, n_class=2, \
                         epochs = 20, batch_size = 32, \
                         patience=2, trainable=False)


Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 20)]              0         
                                                                 
 embedding_3 (Embedding)     (None, 20, 50)            20000050  
                                                                 
 masking_3 (Masking)         (None, 20, 50)            0         
                                                                 
 lstm_4 (LSTM)               (None, 20, 128)           91648     
                                                                 
 dropout_7 (Dropout)         (None, 20, 128)           0         
                                                                 
 lstm_5 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_8 (Dropout)         (None, 128)               0   

In [60]:
test_labels = res[1]
predict = res[0]

print(accuracy_score(test_labels, predict))
print(precision_score(test_labels, predict))
print(recall_score(test_labels, predict))
print(f1_score(test_labels, predict))

0.595
0.6612903225806451
0.40594059405940597
0.5030674846625767
