# Set constant

In [1]:
max_len = 20 # max sentence size

# Load libaries

In [2]:
from freq_utils import *

import regex as re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score

from tensorflow.keras.utils import to_categorical

pd.options.display.max_colwidth = 200

# Load dataset

In [3]:
df0 = pd.read_csv('data/TrueOrganized.csv')
df1 = pd.read_csv('data/FakeOrganized.csv')
df0['label'] = 0
df1['label'] = 1

# Make dictionaries

### Pretrained word embeddings
- Word to index
- Word to vector

In [4]:
word_to_index, index_to_word, word_to_vector = get_pretrained_embedding()

### PoS tag encodings
- PoS word to index

In [5]:
df = pd.concat([df0.cleaned_pos, df1.cleaned_pos])

pos_set = set()
for x in list(df.str.lower().str.split()):
    pos_set.update(x)

pos_list = list(pos_set)
pos_to_index = { pos_list[i]: i for i in range(len(pos_list)) }

print(pos_to_index)

{'uh': 0, ':': 1, "''": 2, 'wp$': 3, 'jjs': 4, 'pos': 5, 'rb': 6, 'dt': 7, 'vbd': 8, '$': 9, 'prp$': 10, 'to': 11, 'vbp': 12, 'cd': 13, 'fw': 14, 'cc': 15, 'rbs': 16, 'wp': 17, '.': 18, 'in': 19, 'vbz': 20, 'nns': 21, 'md': 22, 'prp': 23, 'ex': 24, 'vb': 25, 'pdt': 26, 'jjr': 27, 'nn': 28, 'nnps': 29, 'rp': 30, 'wdt': 31, 'vbn': 32, 'nnp': 33, 'sym': 34, 'wrb': 35, 'vbg': 36, 'jj': 37, 'rbr': 38}


In [6]:
len(pos_to_index)

39

# Train/dev/test split

In [7]:
train, dev, test = train_dev_test_split([df0, df1], m=500, class_column='label', 
                                    class_balance=True, r_dev=0.2, r_test=0.2, rand_state=42)

# Define models

In [8]:
def simple_Manual(test):
    
    y_true = test.label
    y_pred = []
    
    # title size > 20?
    y_pred.append( test.apply(lambda row: 1 if len(row['lower_title'].split())> 20 else 0, axis=1) )
    # noise > 3 
    y_pred.append( test.apply(lambda row: 1 if len(re.findall(re.compile('[^\s\w]'), row['lower_title'])) > 5 else 0, axis=1) )
    # clickbait, slang, first names
    trigger_word = ['_mytag_parentheses_', '_mytag_slang_',
                    'donald','obama','hillary','bernie']

    for i in range(len(trigger_word)):
        y_pred.append( test.minimal_words.str.contains(trigger_word[i])*1 )

    cut_name = ['too_long','noisy','clickbait','slang'] + trigger_word[-4:]

    return cut_name, y_true, y_pred

In [9]:
def simple_NB(train,dev,test,Xname='title',Yname='label'):
    
    train = pd.concat([train,dev])
    
    X_train = train[Xname].tolist()
    Y_train = train[Yname].tolist()

    X_test = test[Xname].tolist()
    Y_test = test[Yname]#.tolist() -> commented out to keep dataframe index
    
    counter = CountVectorizer()

    counter.fit(X_train+X_test)

    train_counts = counter.transform(X_train)
    test_counts = counter.transform(X_test)

    #print(counter.vocabulary_)

    classifier = MultinomialNB()
    classifier.fit(train_counts,Y_train)
    
    predict = classifier.predict(test_counts)
    
    proba = classifier.predict_proba(test_counts)
    
    model_name = 'Naive Bayse - '+Xname
    
    
    return model_name, predict, Y_test, proba, classifier, counter

In [10]:
def model_FNN(input_shape, word_to_index, word_to_vector, n_class=2, trainable=False):
    '''
    input_shape: (max_len,)
    word_to_index: word to index dictionary
    word_to_vector: word to embedding vector dictionary

    return model

    then
    X: Indices of a sentence (m, max_len)
    Y: Class probability, one hot vector (m, # classes)
    '''

    # Input layer
    # X_indices (m, max_len)
    X_input = tfl.Input(shape=input_shape, dtype='int32')
    
    X = X_input
    if bool(word_to_vector):
        # Word embeding
        # Output: (m, max_len, emb_dim)
        # Embedding layer
        embedding_layer = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)
        X = embedding_layer(X)      
    else:
        # One hot encoding (=input)
        # Output: (m, max_len, num_cat)
        X = tf.dtypes.cast(X, tf.float32)    

    # Masking layer
    # skip zero vector words
    X = tfl.Masking(mask_value=0.)(X)
    
    # Take average
    max_len = X.shape[1]
    # Make a list from slice
    X_avg = [ X[:,i,:] for i in range(max_len) ]
    # Take average of a sentence
    X = tf.keras.layers.Average()(X_avg)    
        
    # Linear+ReLu layer
    X = tfl.Dense(units = 128, activation='relu')(X)
    X = tfl.Dropout(rate = 0.4)(X)  

    # Linear+ReLu layer
    X = tfl.Dense(units = 64, activation='relu')(X)
    X = tfl.Dropout(rate = 0.4)(X) 

    # Linear+ReLu layer
    X = tfl.Dense(units = 32, activation='relu')(X)
    X = tfl.Dropout(rate = 0.2)(X) 

    # Linear+Softmax layer
    # Output: y (m, # classes), probability of each class
    X = tfl.Dense(units = n_class, activation='softmax')(X)

    # Model
    model = tf.keras.models.Model(inputs=X_input, outputs=X)

    return model

In [11]:
def model_LSTM(input_shape, word_to_index, word_to_vector=False, n_class=2, trainable=False):
    '''
    input_shape: (max_len,) or (max_len, num_cat)
    word_to_index: word to index dictionary, False for one hot encoding
    word_to_vector: word to embedding vector dictionary, False for one hot encoding

    return model

    then
    X: Indices of a sentence (m, max_len)
    Y: Class probability, one hot vector (m, # classes)
    '''

    # Input layer
    # X_oh (m, max_len, num_cat)
    # X_indices (m, max_len)
    X_input = tfl.Input(shape=input_shape, dtype='int32')

    # Embedding layer
    #embedding_layer = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)
    #X = embedding_layer(X_indices)   

    # By default, assumes one hot vector input
    # If word_to_index, word_to_vector is provided, add an embedding layer
    X = X_input
    if bool(word_to_vector):
        # Word embeding
        # Output: (m, max_len, emb_dim)
        # Embedding layer
        embedding_layer = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)
        X = embedding_layer(X)      
        # Masking layer
        # skip zero vector words
        X = tfl.Masking(mask_value=0.)(X)
        
    else:
        X = tf.dtypes.cast(X, tf.float32)

    # LSTM layer
    # Output: a[1] (m, max_len, 128 hidden unit), batch of sequences
    X = tfl.LSTM(units = 128, return_sequences= True)(X)
    X = tfl.Dropout(rate = 0.5 )(X) 

    # LSTM layer
    # Output: a[2]<max_len> (m, 128 hidden unit)
    X = tfl.LSTM(units = 128)(X)
    X = tfl.Dropout(rate = 0.5)(X)  

    # Linear layer
    # Output: a[3] (m, # classes)
    X = tfl.Dense(units = n_class)(X)

    # Softmax layer
    # Output: y (m, # classes), probability of each class
    X = tfl.Activation('softmax')(X)

    # Model
    model = tf.keras.models.Model(inputs=X_input, outputs=X)

    return model

# Model wrapper functions

In [12]:
def run_multiple_NB(model_vars,train,dev,test,Yname='label'):

    results = []

    for var in model_vars:
        
        Xname = var
        
        model_name, y_pred, y_true, proba, classifier, counter = \
            simple_NB(train,dev,test,Xname=Xname,Yname=Yname)
        
        x = test[Xname].to_numpy()
        
        model_name = model_name+' '+Xname
        
        results.append([model_name, x, y_pred, y_true, proba, classifier, counter])
        
    return results

In [13]:
def run_multiple_NN(model_vars, train, dev, test, pos_to_index, word_to_index=False, word_to_vector=False, 
                        Yname='label', max_len=20, n_class=2, 
                        epochs = 20, batch_size = 32, patience=2, trainable=False):
    
    
    results = []

    for i in range(len(model_vars)):
            
        model_name = model_vars[i][0]
        func_model = model_vars[i][1]
        Xname = model_vars[i][2]
        use_embeddings = model_vars[i][3]
        
        model_name = model_name+' '+Xname
        
        w2i = False
        w2v = False
        X_shape = False
        
        if use_embeddings:
            w2i = word_to_index        
            w2v = word_to_vector
            X_shape = (max_len, )
        else:
            w2i = pos_to_index
            X_shape = (max_len, len(pos_to_index))

        _, _, X_train_indices, _, Y_train_oh = dataframe_to_arrays(train, w2i, max_len, Xname=Xname)
        _, _, X_dev_indices,   _, Y_dev_oh   = dataframe_to_arrays(dev, w2i, max_len, Xname=Xname)
        index, _, X_test_indices, Y_test, _  = dataframe_to_arrays(test, w2i, max_len, Xname=Xname)

        
        X = False
        if use_embeddings:
            X = [X_train_indices, X_dev_indices, X_test_indices]
        else:
            X = [to_categorical(X_train_indices, num_classes=len(pos_to_index)), 
                 to_categorical(X_dev_indices, num_classes=len(pos_to_index)), 
                 to_categorical(X_test_indices, num_classes=len(pos_to_index))]            

        
        Y = [Y_train_oh, Y_dev_oh, Y_test]

        
    
        print('should match all: ', X_shape, X[0].shape, X[1].shape, X[2].shape)
    
        model = func_model(X_shape, w2i, w2v, n_class, trainable)
        model.summary()
                    
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience)  
        
        history = False
        
        if patience :
            history = model.fit(X[0], Y[0], 
                                epochs = epochs, batch_size = batch_size, shuffle=True, 
                                validation_data=(X[1], Y[1]),
                                callbacks=[es])
        else:
            history = model.fit(X[0], Y[0], 
                                epochs = epochs, batch_size = batch_size, shuffle=True, 
                                validation_data=(X[1], Y[1]))
            
        proba = model.predict(X[2])
        y_pred = [np.argmax(proba[i]) for i in range(len(proba))]
        y_true = Y[2]
        x = X[2]
        
        results.append([model_name, x, y_pred, y_true, proba, model, history])
    
    return results

# Run models

In [14]:
seq_type = ['Original','Lower','CleanedWords','PoS','MinimalWords']
ml_type = ['NaiveBayse','FNN','LSTM']
seq_type_abb = ['og','lo','cw','ps','mw']
ml_type_abb = ['nb','fnn','lstm']

n_ml = len(ml_type)
n_seq = len(seq_type)

title_vars = ['org_title','lower_title','cleaned_words','cleaned_pos','minimal_words' ]


nn_vars = []
for i in range(n_seq):
    title = title_vars[i]
    oh = True
    if title=='cleaned_pos':
        oh=False
    nn_vars.append(['FNN',model_FNN,title,oh])
        
for i in range(n_seq):
    title = title_vars[i]
    oh = True
    if title=='cleaned_pos':
        oh=False
    nn_vars.append(['LSTM',model_LSTM,title,oh])
    



In [15]:
# Naive Bayes
res_nb =  run_multiple_NB(title_vars,train,dev,test,Yname='label')
    

# Neural Networks

res_nn = run_multiple_NN(nn_vars, train, dev, test,
                         pos_to_index, word_to_index, word_to_vector, 
                         Yname='label', max_len=max_len, n_class=2,
                         epochs = 3, batch_size = 32, patience=2, trainable=True)

# Add NB and NN
results = res_nb + res_nn

should match all:  (20,) (300, 20) (100, 20) (100, 20)


2022-02-20 03:28:25.519202: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 20, 50)       20000050    ['input_1[0][0]']                
                                                                                                  
 masking (Masking)              (None, 20, 50)       0           ['embedding[0][0]']              
                                                                                                  
 tf.__operators__.getitem (Slic  (None, 50)          0           ['masking[0][0]']                
 ingOpLambda)                                                                                 

Epoch 2/3
Epoch 3/3
should match all:  (20,) (300, 20) (100, 20) (100, 20)
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 20, 50)       20000050    ['input_2[0][0]']                
                                                                                                  
 masking_1 (Masking)            (None, 20, 50)       0           ['embedding_1[0][0]']            
                                                                                                  
 tf.__operators__.getitem_20 (S  (None, 50)          0           ['masking_1[0][0]']              
 licingOpLambda) 

Epoch 2/3
Epoch 3/3
should match all:  (20,) (300, 20) (100, 20) (100, 20)
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 20, 50)       20000050    ['input_3[0][0]']                
                                                                                                  
 masking_2 (Masking)            (None, 20, 50)       0           ['embedding_2[0][0]']            
                                                                                                  
 tf.__operators__.getitem_40 (S  (None, 50)          0           ['masking_2[0][0]']              
 licingOpLambda) 

Epoch 2/3
Epoch 3/3
should match all:  (20, 39) (300, 20, 39) (100, 20, 39) (100, 20, 39)
Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 20, 39)]     0           []                               
                                                                                                  
 tf.cast (TFOpLambda)           (None, 20, 39)       0           ['input_4[0][0]']                
                                                                                                  
 masking_3 (Masking)            (None, 20, 39)       0           ['tf.cast[0][0]']                
                                                                                                  
 tf.__operators__.getitem_60 (S  (None, 39)          0           ['masking_3[0][0]']              
 l

Epoch 2/3
Epoch 3/3
should match all:  (20,) (300, 20) (100, 20) (100, 20)
Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 20, 50)       20000050    ['input_5[0][0]']                
                                                                                                  
 masking_4 (Masking)            (None, 20, 50)       0           ['embedding_3[0][0]']            
                                                                                                  
 tf.__operators__.getitem_80 (S  (None, 50)          0           ['masking_4[0][0]']              
 licingOpLambda) 

Epoch 2/3
Epoch 3/3
should match all:  (20,) (300, 20) (100, 20) (100, 20)
Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 20)]              0         
                                                                 
 embedding_4 (Embedding)     (None, 20, 50)            20000050  
                                                                 
 masking_5 (Masking)         (None, 20, 50)            0         
                                                                 
 lstm (LSTM)                 (None, 20, 128)           91648     
                                                                 
 dropout_15 (Dropout)        (None, 20, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                  

Epoch 2/3
Epoch 3/3
should match all:  (20,) (300, 20) (100, 20) (100, 20)
Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 20)]              0         
                                                                 
 embedding_7 (Embedding)     (None, 20, 50)            20000050  
                                                                 
 masking_8 (Masking)         (None, 20, 50)            0         
                                                                 
 lstm_8 (LSTM)               (None, 20, 128)           91648     
                                                                 
 dropout_23 (Dropout)        (None, 20, 128)           0         
                                                                 
 lstm_9 (LSTM)               (None, 128)               131584    
                                                  

# Print result

In [16]:
def print_result(results):
    
    for result in results:
        
        model_name, x, y_pred, y_true, proba = result[:5]
        
        y_true = y_true.to_numpy()
        
        print(model_name)
        print('accuracy: ',accuracy_score(y_true, y_pred))
        print('precision: ',precision_score(y_true, y_pred))
        print('recall: ',recall_score(y_true, y_pred))
        print('f1: ',f1_score(y_true, y_pred))
        print('\n')

# Organize results into a dataframe

In [17]:
df_result = []

for i in range(len(results)):

    df = pd.DataFrame({
        'x': results[i][1].tolist(),
        'y_true': results[i][3].to_numpy().tolist(),
        'y_pred': results[i][2],
        'proba0': [results[i][4][j][0] for j in range(len(results[i][1])) ],
        'proba1': [results[i][4][j][1] for j in range(len(results[i][1])) ]
    }, index = results[i][3].index)
    df_result.append(df)

In [18]:
ml_type = ['NaiveBayse','FNN','LSTM']
seq_type_abb = ['og','lo','cw','ps','mw']
ml_type_abb = ['nb','fnn','lstm']

n_ml = len(ml_type)
n_seq = len(seq_type)

title_vars = ['org_title','lower_title','cleaned_words','cleaned_pos','minimal_words']

dict_name = []

for i in range(n_ml):
    for j in range(n_seq):
        
        y_pred = ml_type_abb[i]+'_'+seq_type_abb[j]
        proba0 = y_pred+'_p0'
        proba1 = y_pred+'_p1'
        temp_dict = {'y_pred':y_pred,'proba0':proba0,'proba1':proba1}
        
        if i==0:
            x = title_vars[j]
            temp_dict['x'] = x
        
        dict_name.append(temp_dict)

df = []

df.append(df_result[0].rename(columns = dict_name[0]))
for i in range(1,n_seq):
    df.append(df_result[i].drop(['y_true'], axis=1, inplace=False).rename(columns = dict_name[i]))

for i in range(n_seq, n_ml*n_seq):
    df.append(df_result[i].drop(['x','y_true'], axis=1, inplace=False).rename(columns = dict_name[i]))


In [19]:
df_x = df[0][title_vars[0]].to_frame()
for i in range(1,n_seq):
    df_x = pd.merge(df_x, df[i][title_vars[i]].to_frame(), left_index=True, right_index=True)
    #print(i,len(df_x))

df_y = df[0].y_true.to_frame() 
print('y',len(df_y))
    
df[0].drop(['y_true'], axis=1, inplace=True)
for i in range(0,n_seq):
    df[i].drop([title_vars[i]], axis=1, inplace=True)

df_p = df[0]    
for i in range(1,n_seq*n_ml):
    df_p = pd.merge(df_p,df[i], left_index=True, right_index=True)
    
df_ml = pd.merge(df_x, df_y, left_index=True, right_index=True)
df_ml = pd.merge(df_ml, df_p, left_index=True, right_index=True)

y 100


In [20]:
cut_name, y_true_cut, y_pred_cut = simple_Manual(test)

In [21]:
dict_manual = {cut_name[i]:y_pred_cut[i] for i in range(len(cut_name))}

df_manual = pd.DataFrame(dict_manual)

#display(df_manual)

In [22]:
df_final = pd.merge(df_ml, df_manual, left_index=True, right_index=True)


In [23]:
#display(df_ml)
#display(df_final)
display(df_final.sample(10))

print(df_final.columns)

df_final.to_csv('data/model_compare2.csv',index=False)

Unnamed: 0,org_title,lower_title,cleaned_words,cleaned_pos,minimal_words,y_true,nb_og,nb_og_p0,nb_og_p1,nb_lo,...,lstm_mw_p0,lstm_mw_p1,too_long,noisy,clickbait,slang,donald,obama,hillary,bernie
279,BILL O’REILLY Destroys Liberal Pundit On Trump Coverage [Video],bill o’reilly destroys liberal pundit on trump coverage [video],bill o reilly destroys liberal pundit on trump coverage _mytag_parentheses_,NNP NNP NNP NNP NNP IN NNP NNP NN,bill reilly destroys liberal pundit trump coverage _mytag_parentheses_,1,1,0.000483,0.999517,1,...,0.113863,0.886137,0,0,1,0,0,0,0,0
190,Illinois House Democrats advance FY 2017 budget amid veto threat,illinois house democrats advance fy 2017 budget amid veto threat,illinois house democrats advance fy 2017 budget amid veto threat,NNP NNP NNPS VBP NNP CD NN IN NN NN,illinois house democrat advance 2017 budget amid veto threat,0,0,0.999616,0.000384,0,...,0.962859,0.037141,0,0,0,0,0,0,0,0
281,Fake News: The Collapse of the MSM’s ‘Facebook Russian Bot’ Story,fake news: the collapse of the msm’s ‘facebook russian bot’ story,fake news : the collapse of the msm s facebook russian bot story,JJ NNS : DT NNP IN DT NNP NNP NNP NNP NNP,fake news collapse msm facebook russian bot story,1,1,0.001602,0.998398,1,...,0.12128,0.87872,0,0,0,0,0,0,0,0
161,Turkey seeks life sentences for 60 ex-military over 1997 'post-modern coup',turkey seeks life sentences for 60 ex-military over 1997 'post-modern coup',turkey seeks life sentences for 60 ex military over 1997 post modern coup,NNP VBZ NN NNS IN CD JJ IN CD JJ NN,turkey seek life sentence military 1997 post modern coup,0,0,0.99959,0.00041,0,...,0.986181,0.013819,0,0,0,0,0,0,0,0
4,Young blacks more open to Bernie Sanders' White House bid,young blacks more open to bernie sanders' white house bid,young blacks more open to bernie sanders white house bid,NN NNS RBR JJ TO NNP NNP NNP NNP NN,young black open bernie sander white house bid,0,0,0.974674,0.025326,0,...,0.026989,0.973011,0,0,0,0,0,0,0,1
25,France sees U.S. strike on Syria as tool to push for peace talks: Hollande,france sees u.s. strike on syria as tool to push for peace talks: hollande,france sees u . s . strike on syria as tool to push for peace talks : hollande,NNP VBZ NNP NN IN NNP IN NN TO VB IN NN NNS : NN,france see strike syria tool push peace talk hollande,0,0,0.998889,0.001111,0,...,0.989734,0.010266,0,0,0,0,0,0,0,0
73,Trump says may tie infrastructure with healthcare or tax reform: NY Times,trump says may tie infrastructure with healthcare or tax reform: ny times,trump says may tie infrastructure with healthcare or tax reform : ny times,NNP VBZ MD VB NN IN NN CC NN NN : NNP NNP,trump say may tie infrastructure healthcare tax reform time,0,0,0.998084,0.001916,0,...,0.95919,0.04081,0,0,0,0,0,0,0,0
258,LOCAL REPORTER IN DEEP BLUE STATE Stuns Liberals When He Goes Rogue…Tells Truth About Guns [VIDEO],local reporter in deep blue state stuns liberals when he goes rogue…tells truth about guns [video],local reporter in deep blue state stuns liberals when he goes rogue tells truth about guns _mytag_parentheses_,JJ NNP NNP NNP NNP NNP NNP NNP WRB PRP VBZ NNP NNP IN NNP NN,local reporter deep blue state stuns liberal go rogue tells truth gun _mytag_parentheses_,1,1,7e-05,0.99993,1,...,0.000865,0.999135,0,0,1,0,0,0,0,0
329,HILLARY Calls On CRANKY SOCIALIST She Stole Election From To Sway Free Sh*t Voters [VIDEO],hillary calls on cranky socialist she stole election from to sway free sh*t voters [video],hillary calls on cranky socialist she stole election from to sway free _mytag_slang_ voters _mytag_parentheses_,NNP NNP IN NNP NNP PRP NNP NN IN TO NNP NNP NNP NNP VBD,hillary call cranky socialist stole election sway free _mytag_slang_ voter _mytag_parentheses_,1,1,0.000172,0.999828,1,...,0.057281,0.942719,0,0,1,1,0,0,1,0
148,"For Chinese officials, Trump perhaps better the devil they don't know","for chinese officials, trump perhaps better the devil they don't know",for chinese officials trump perhaps better the devil they do n' t know,IN JJ NNS NNP RB VBD DT NN PRP VBP RB VB,chinese official trump perhaps better devil know,0,1,0.029694,0.970306,1,...,0.703207,0.296793,0,0,0,0,0,0,0,0


Index(['org_title', 'lower_title', 'cleaned_words', 'cleaned_pos',
       'minimal_words', 'y_true', 'nb_og', 'nb_og_p0', 'nb_og_p1', 'nb_lo',
       'nb_lo_p0', 'nb_lo_p1', 'nb_cw', 'nb_cw_p0', 'nb_cw_p1', 'nb_ps',
       'nb_ps_p0', 'nb_ps_p1', 'nb_mw', 'nb_mw_p0', 'nb_mw_p1', 'fnn_og',
       'fnn_og_p0', 'fnn_og_p1', 'fnn_lo', 'fnn_lo_p0', 'fnn_lo_p1', 'fnn_cw',
       'fnn_cw_p0', 'fnn_cw_p1', 'fnn_ps', 'fnn_ps_p0', 'fnn_ps_p1', 'fnn_mw',
       'fnn_mw_p0', 'fnn_mw_p1', 'lstm_og', 'lstm_og_p0', 'lstm_og_p1',
       'lstm_lo', 'lstm_lo_p0', 'lstm_lo_p1', 'lstm_cw', 'lstm_cw_p0',
       'lstm_cw_p1', 'lstm_ps', 'lstm_ps_p0', 'lstm_ps_p1', 'lstm_mw',
       'lstm_mw_p0', 'lstm_mw_p1', 'too_long', 'noisy', 'clickbait', 'slang',
       'donald', 'obama', 'hillary', 'bernie'],
      dtype='object')
