# Set constant

# Load libaries

In [1]:
from freq_utils import *

import regex as re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score

from tensorflow.keras.utils import to_categorical
from collections import Counter

from joblib import dump, load

import time
import IPython


pd.options.display.max_colwidth = 200

# Load dataset

In [2]:
df0 = pd.read_csv('data/TrueOrganized.csv')
df1 = pd.read_csv('data/FakeOrganized.csv')
df0['label'] = 0
df1['label'] = 1

# Make dictionaries
- Make dictionaries
- Get max_len

### Pretrained word embeddings
- Word to index
- Word to vector

In [3]:
word_to_index, index_to_word, word_to_vector = get_pretrained_embedding()

### PoS tag encodings
- PoS word to index

In [4]:
df = pd.concat([df0.cleaned_pos, df1.cleaned_pos])

pos_set = set()
for x in list(df.str.lower().str.split()):
    pos_set.update(x)

pos_list = list(pos_set)
pos_to_index = { pos_list[i]: i for i in range(len(pos_list)) }

print(len(pos_to_index),pos_to_index)

39 {'dt': 0, 'sym': 1, 'in': 2, 'wp$': 3, 'nn': 4, ',': 5, ')': 6, 'vbg': 7, 'rbr': 8, 'jjs': 9, 'rbs': 10, 'to': 11, 'jj': 12, 'rb': 13, 'vb': 14, 'wp': 15, 'prp': 16, 'fw': 17, ':': 18, 'jjr': 19, 'prp$': 20, 'vbz': 21, 'vbn': 22, 'nns': 23, 'pos': 24, '.': 25, '(': 26, 'wdt': 27, 'nnps': 28, "''": 29, 'uh': 30, 'cc': 31, 'cd': 32, 'vbp': 33, 'wrb': 34, 'vbd': 35, 'ex': 36, 'md': 37, 'nnp': 38}


### Get max_len

In [5]:
xcol_names = df0.columns[:-1].to_list()
input_dict = {}
for x in xcol_names:
    print(x,'\t', df0[x].str.split().str.len().max(), df1[x].str.split().str.len().max())
    input_dict[x]=df1[x].str.split().str.len().max()
print(input_dict)

org_title 	 20 42
lower_title 	 20 42
cleaned_words 	 24 49
cleaned_pos 	 24 49
minimal_words 	 15 35
{'org_title': 42, 'lower_title': 42, 'cleaned_words': 49, 'cleaned_pos': 49, 'minimal_words': 35}


# Define models

In [6]:
def simple_Manual(test):
    
    y_true = test.label
    y_pred = []
    
    # title size > 20?
    y_pred.append( test.apply(lambda row: 1 if len(row['lower_title'].split())> 20 else 0, axis=1) )
    # noise > 3 
    y_pred.append( test.apply(lambda row: 1 if len(re.findall(re.compile('[^\s\w]'), row['lower_title'])) > 5 else 0, axis=1) )
    # clickbait, slang, first names
    trigger_word = ['_mytag_slang_',
                    'donald','obama','hillary','bernie']

    for i in range(len(trigger_word)):
        y_pred.append( test.minimal_words.str.contains(trigger_word[i])*1 )

    cut_name = ['too_long','noisy','slang'] + trigger_word[-4:]

    return cut_name, y_true, y_pred

In [7]:
def simple_NB(train,dev,test,Xname='title',Yname='label'):
    
    train = pd.concat([train,dev])
    
    X_train = train[Xname].tolist()
    Y_train = train[Yname].tolist()

    X_test = test[Xname].tolist()
    Y_test = test[Yname]#.tolist() -> commented out to keep dataframe index
    
    counter = CountVectorizer()

    counter.fit(X_train+X_test)

    temp_real = train[train.label==0]
    temp_fake = train[train.label==1]
    
    counter_real = Counter(temp_real[Xname].str.split().explode().tolist())
    counter_fake = Counter(temp_fake[Xname].str.split().explode().tolist())

    train_counts = counter.transform(X_train)
    test_counts = counter.transform(X_test)

    #print(counter.vocabulary_)

    classifier = MultinomialNB()
    classifier.fit(train_counts,Y_train)
    
    predict = classifier.predict(test_counts)
    
    proba = classifier.predict_proba(test_counts)
    
    model_name = 'Naive Bayes - '+Xname
    
    
    return model_name, predict, Y_test, proba, classifier, counter, counter_real, counter_fake

In [8]:
def input_encoder(X, trainable = True):
    
    # X input can be either text or PoS vectors
    # dim=2 for text -> word index input (m, max_len)
    # dim=3 for PoS -> one-hot encoding input (m, max_len, num_cat)
    dim = len(X.get_shape().as_list())
    
    if dim==2:
        # Word embedding, indices to vector
        # Output: (m, max_len, emb_dim)
        X = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)(X)             
    elif dim==3:
        # Int to float for One-hot encoding
        # Output: (m, max_len, num_cat)
        X = tf.dtypes.cast(X, tf.float32)
    else:
        print('Wrong input shape:', X.get_shape())
        
    # Skip zero vector words
    X = tfl.Masking(mask_value=0.)(X)    
    
    return X

def ml_builder(X, best_hps, ml_type):

    # Hyperparameters
    drop_out = best_hps.values['drop_out']
    
    if ml_type=='FNN':
        
        # Take average of a sentence
        max_len = X.shape[1]
        X_avg = [ X[:,i,:] for i in range(max_len) ]
        X = tf.keras.layers.Average()(X_avg)    
        
        n_units = [best_hps.values['n_unit1'], best_hps.values['n_unit2'], 16]
        
        # Linear+ReLu layer
        for n_unit in n_units:
            X = tfl.Dense(units = n_unit, activation='relu', kernel_initializer='he_normal')(X)
            X = tfl.Dropout(rate = drop_out)(X)  
            X = tfl.BatchNormalization()(X)
    
        
    elif ml_type=='LSTM':
        
        n_unit = best_hps.values['n_unit']
        re_drop_out = best_hps.values['re_drop_out']
        
        # Output: a[l] (m, max_len, # hidden unit), batch of sequences
        X = tfl.LSTM(units = n_unit, dropout = drop_out, recurrent_dropout=re_drop_out, return_sequences= True)(X)
        X = tfl.LSTM(units = n_unit, dropout = drop_out, recurrent_dropout=re_drop_out, return_sequences= True)(X)
        # Output: a[l] (m, # hidden unit)
        X = tfl.LSTM(units = n_unit, dropout = drop_out, recurrent_dropout=re_drop_out, return_sequences= False)(X)

    else:
        print('Wrong ml_type:',ml_type)
        
            
    # Linear+Softmax layer
    # Output: y (m, # classes=2), probability of each class
    X = tfl.Dense(units = 2, activation='softmax')(X)
    
    return X

def ml_optimizer(model, best_hps):
    
    learning_rate = best_hps.values['learning_rate']

    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)                                
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

    return model

class DeepModelBuilder():
    def __init__(self, input_shape, best_hps, ml_type='LSTM', trainable=True):
        
        self.input_shape = input_shape
        self.best_hps = best_hps
        self.ml_type = ml_type
        self.trainable = trainable
    
    def build(self):

        X_input = tfl.Input(shape=self.input_shape, dtype='int32')

        X = input_encoder(X_input, trainable = self.trainable)
        X = ml_builder(X, best_hps=self.best_hps, ml_type=self.ml_type)

        model = tf.keras.models.Model(inputs=X_input, outputs=X)
        model = ml_optimizer(model, best_hps=self.best_hps)

        return model
    
    def fit(self, model, x_train, y_train, x_val, y_val, epochs):
        
        class ClearTrainingOutput(tf.keras.callbacks.Callback):
            def on_train_end(*args, **kwargs):
                IPython.display.clear_output(wait = True)
        
        es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=5)

        return model.fit(x_train, y_train,
                         validation_data=(x_val, y_val),
                         epochs=epochs,
                         batch_size=self.best_hps.values['batch_size'], shuffle=True, 
                         callbacks=[ClearTrainingOutput(), es])

# Generate inputs of the model

In [9]:
# Naive Bayes input
train, dev, test = train_dev_test_split([df0, df1], m=40000, class_column='label', 
                                    class_balance=True, r_dev=0.2, r_test=0.05, rand_state=42)
# Deep learning model input
def make_input(xcol_name, train, dev, test):

    max_len = input_dict[xcol_name]
    use_embeddings = True
    if xcol_name == 'cleaned_pos':
        use_embeddings=False

    w2i = False
    w2v = False
    X_shape = False

    # Embedding or One-hot encoding
    if use_embeddings:
        w2i = word_to_index        
        w2v = word_to_vector
    else:
        w2i = pos_to_index

    _, _, X_train_indices, _, Y_train_oh = dataframe_to_arrays(train, w2i, max_len, Xname=xcol_name)
    _, _, X_dev_indices,   _, Y_dev_oh   = dataframe_to_arrays(dev, w2i, max_len, Xname=xcol_name)
    index, _, X_test_indices, Y_test, _  = dataframe_to_arrays(test, w2i, max_len, Xname=xcol_name)

    # X, Y (train, dev, test)
    X = False
    if use_embeddings:
        X = [X_train_indices, X_dev_indices, X_test_indices]
    else:
        X = [to_categorical(X_train_indices, num_classes=len(pos_to_index)), 
             to_categorical(X_dev_indices, num_classes=len(pos_to_index)), 
             to_categorical(X_test_indices, num_classes=len(pos_to_index))]            

    Y = [Y_train_oh, Y_dev_oh, Y_test]
    
    print('input column:',xcol_name)
    print('max_len:',max_len)
    print('use_embeddings?:',use_embeddings)
    print('X, Y shapes:', np.shape(X[0]), np.shape(Y[0]))
    
    # X shape:
    # X_oh (m, max_len, num_cat)
    # X_indices (m, max_len)
    
    return X, Y

# Model wrapper functions

In [10]:
def run_multiple_NB(model_vars,train,dev,test,Yname='label'):

    results = []
    imodel = 0
    
    for var in model_vars:
        
        Xname = var
        
        # Bagging
        sample_train = pd.concat([train,dev]).sample(frac=0.8/16,replace=True)
        sample_dev = pd.concat([train,dev]).sample(frac=0.2/16,replace=True)
        
        model_name, y_pred, y_true, proba, classifier, counter, counter_real, counter_fake = \
            simple_NB(sample_train,sample_dev,test,Xname=Xname,Yname=Yname)
        
        x = test[Xname].to_numpy()
        
        model_name = model_name+' '+Xname
        
        save_name = 'data/nb'+str(imodel)
        dump(classifier,save_name)
        save_name = 'data/nb_counter'+str(imodel)
        dump(counter,save_name)
        save_name = 'data/nb_counter_real'+str(imodel)
        dump(counter_real,save_name)
        save_name = 'data/nb_counter_fake'+str(imodel)
        dump(counter_fake,save_name)
        
        results.append([model_name, x, y_pred, y_true, proba, classifier, counter, counter_real, counter_fake])
        
        imodel+=1
        
    return results

In [11]:
def run_multiple_NN(trainable=True):
       
    durations=[]
    results = []
    imodel = 0

    for ml_type in ('LSTM','FNN'):
        for xcol_name in xcol_names:

            #if not (ml_type=='FNN' and xcol_name=='cleaned_pos'):
            #    continue

            begin_time = time.time()
            
            sample_train = pd.concat([train,dev]).sample(frac=0.8/16,replace=True)
            sample_dev = pd.concat([train,dev]).sample(frac=0.2/16,replace=True)

            X, Y  = make_input(xcol_name, sample_train, sample_dev, test)
            
            dir_name = ml_type+'_'+xcol_name

            # Get hyperparameter dictionary       
            save_name = 'data/'+dir_name+'_best_hps'
            best_hps=load(save_name)
            save_name = 'data/'+dir_name+'_best_hps_reg'
            best_hps_reg=load(save_name)
            best_hps.values.update(best_hps_reg.values)

            # Build a model and train
            mb = DeepModelBuilder(X[0][0].shape, best_hps, ml_type=ml_type, trainable=trainable)
            model = mb.build()
            history = mb.fit(model, X[0], Y[0], X[1], Y[1], epochs=100)

            # Test
            proba = model.predict(X[2])
            y_pred = [np.argmax(proba[i]) for i in range(len(proba))]
            y_true = Y[2]
            x = X[2]

            #save_name = 'data/nn'+str(i)
            #model.save(save_name)
            save_name = 'data/nn_history'+str(imodel)
            dump(history,save_name)

            results.append([dir_name, x, y_pred, y_true, proba, model, history])
            
            imodel+=1
            
            end_time = time.time()
            durations.append([ml_type,xcol_name,(end_time-begin_time)/60,'min.'])

    return results, durations

# Train models

In [12]:
# Naive Bayes
res_nb =  run_multiple_NB(xcol_names,train,dev,test,Yname='label')
    
# Neural Networks
res_deep, durations = run_multiple_NN(trainable=True)
    
# Add NB and NN
results = res_nb + res_deep

print(np.array([x[2] for x in durations]).sum(), durations) # 5 min. for 1000/16, 40 min. for 40000/16

INFO:tensorflow:Assets written to: ram://b4a0db06-f7d3-400c-ac9a-ae9db63d54f3/assets


INFO:tensorflow:Assets written to: ram://b4a0db06-f7d3-400c-ac9a-ae9db63d54f3/assets
  layer_config = serialize_layer_fn(layer)
  return generic_utils.serialize_keras_object(obj)


38.73258802890778 [['LSTM', 'org_title', 4.368123201529185, 'min.'], ['LSTM', 'lower_title', 5.010114284356435, 'min.'], ['LSTM', 'cleaned_words', 4.52064251502355, 'min.'], ['LSTM', 'cleaned_pos', 11.574068331718445, 'min.'], ['LSTM', 'minimal_words', 4.818503717581431, 'min.'], ['FNN', 'org_title', 1.1048944354057313, 'min.'], ['FNN', 'lower_title', 2.127064367135366, 'min.'], ['FNN', 'cleaned_words', 2.529555829366048, 'min.'], ['FNN', 'cleaned_pos', 0.15000534852345784, 'min.'], ['FNN', 'minimal_words', 2.5296159982681274, 'min.']]


# Print result

In [13]:
def print_result(results):
    
    for result in results:
        
        model_name, x, y_pred, y_true, proba = result[:5]
        
        y_true = y_true.to_numpy()
        
        print(model_name)
        print('accuracy: ',accuracy_score(y_true, y_pred))
        print('precision: ',precision_score(y_true, y_pred))
        print('recall: ',recall_score(y_true, y_pred))
        print('f1: ',f1_score(y_true, y_pred))
        print('\n')
print_result(results)

Naive Bayes - org_title org_title
accuracy:  0.9255
precision:  0.9140096618357488
recall:  0.9403578528827038
f1:  0.9269965703086723


Naive Bayes - lower_title lower_title
accuracy:  0.922
precision:  0.9032258064516129
recall:  0.9463220675944334
f1:  0.9242718446601942


Naive Bayes - cleaned_words cleaned_words
accuracy:  0.9305
precision:  0.9270935960591133
recall:  0.9353876739562624
f1:  0.9312221672439387


Naive Bayes - cleaned_pos cleaned_pos
accuracy:  0.807
precision:  0.8003875968992248
recall:  0.8210735586481114
f1:  0.8105986261040236


Naive Bayes - minimal_words minimal_words
accuracy:  0.9035
precision:  0.9118541033434651
recall:  0.8946322067594433
f1:  0.9031610637230307


LSTM_org_title
accuracy:  0.9065
precision:  0.9261186264308012
recall:  0.8846918489065606
f1:  0.9049313675648195


LSTM_lower_title
accuracy:  0.8635
precision:  0.7977254264825345
recall:  0.9761431411530815
f1:  0.8779615556548949


LSTM_cleaned_words
accuracy:  0.934
precision:  0.94410

# Organize results into a dataframe

In [14]:
df_result = []

for i in range(len(results)):

    df = pd.DataFrame({
        'x': results[i][1].tolist(),
        'y_true': results[i][3].to_numpy().tolist(),
        'y_pred': results[i][2],
        'proba0': [results[i][4][j][0] for j in range(len(results[i][1])) ],
        'proba1': [results[i][4][j][1] for j in range(len(results[i][1])) ]
    }, index = results[i][3].index)
    df_result.append(df)

In [15]:
seq_type = ['Original','Lower','CleanedWords','PoS','MinimalWords']
ml_type = ['NaiveBayes','FNN','LSTM']
seq_type_abb = ['og','lo','cw','ps','mw']
ml_type_abb = ['nb','fnn','lstm']

n_ml = len(ml_type)
n_seq = len(seq_type)

title_vars = ['org_title','lower_title','cleaned_words','cleaned_pos','minimal_words']

dict_name = []

for i in range(n_ml):
    for j in range(n_seq):
        
        y_pred = ml_type_abb[i]+'_'+seq_type_abb[j]
        proba0 = y_pred+'_p0'
        proba1 = y_pred+'_p1'
        temp_dict = {'y_pred':y_pred,'proba0':proba0,'proba1':proba1}
        
        if i==0:
            x = title_vars[j]
            temp_dict['x'] = x
        
        dict_name.append(temp_dict)

df = []


df.append(df_result[0].rename(columns = dict_name[0]))
for i in range(1,n_seq):
    df.append(df_result[i].drop(['y_true'], axis=1, inplace=False).rename(columns = dict_name[i]))

for i in range(n_seq, n_ml*n_seq):
    df.append(df_result[i].drop(['x','y_true'], axis=1, inplace=False).rename(columns = dict_name[i]))


In [16]:
df_x = df[0][title_vars[0]].to_frame()
for i in range(1,n_seq):
    df_x = pd.merge(df_x, df[i][title_vars[i]].to_frame(), left_index=True, right_index=True)
    #print(i,len(df_x))

df_y = df[0].y_true.to_frame() 
print('y',len(df_y))
    
df[0].drop(['y_true'], axis=1, inplace=True)
for i in range(0,n_seq):
    df[i].drop([title_vars[i]], axis=1, inplace=True)

df_p = df[0]    
for i in range(1,n_seq*n_ml):
    df_p = pd.merge(df_p,df[i], left_index=True, right_index=True)
    
df_ml = pd.merge(df_x, df_y, left_index=True, right_index=True)
df_ml = pd.merge(df_ml, df_p, left_index=True, right_index=True)

y 2000


In [17]:
cut_name, y_true_cut, y_pred_cut = simple_Manual(test)

In [18]:
dict_manual = {cut_name[i]:y_pred_cut[i] for i in range(len(cut_name))}

df_manual = pd.DataFrame(dict_manual)

#display(df_manual)

In [19]:
df_final = pd.merge(df_ml, df_manual, left_index=True, right_index=True)


In [20]:
display(df_final.sample(10))

print(df_final.columns)

df_final.to_csv('data/model_compare4.csv',index=False)

Unnamed: 0,org_title,lower_title,cleaned_words,cleaned_pos,minimal_words,y_true,nb_og,nb_og_p0,nb_og_p1,nb_lo,...,lstm_mw,lstm_mw_p0,lstm_mw_p1,too_long,noisy,slang,donald,obama,hillary,bernie
23128,MARINE ARRESTED FOR Complaining About Government On Facebook Is Suing Government [VIDEO],marine arrested for complaining about government on facebook is suing government [video],marine arrested for complaining about government on facebook is suing government [ video ],NNP NN IN VBG IN NN IN NN NN VBG NN NN NN NN,marine arrested complain government facebook sue government video,1,1,0.00094,0.99906,1,...,1,0.0007,0.9993,0,0,0,0,0,0,0
39728,"MUSLIM MEN WIN BIG DISCRIMINATION SUIT Against Employer For Violating Religious Beliefs, While Nuns, Christian Bakers And City Clerks All Lose Cases","muslim men win big discrimination suit against employer for violating religious beliefs, while nuns, christian bakers and city clerks all lose cases","muslim men win big discrimination suit against employer for violating religious beliefs , while nuns , christian bakers and city clerks all lose cases","NN NN VB NN NN NN IN NN IN VBG JJ NNS , IN NNS , JJ NNS CC NNP NNS DT VB NNS",muslim men win big discrimination suit employer violate religious belief nun christian baker city clerk lose case,1,1,4e-06,0.999996,1,...,1,0.00022,0.99978,1,0,0,0,0,0,0
7651,Republican U.S. Senator Cochran postpones return to Washington,republican u.s. senator cochran postpones return to washington,republican _u_s_ senator cochran postpones return to washington,JJ NNP NN NN NNS NN TO NNP,republican _u_s_ senator cochran postpones return washington,0,0,0.993132,0.006868,0,...,0,0.999999,1e-06,0,0,0,0,0,0,0
33950,POLICE UNION Threatens 49er’s With BOYCOTT: TAKE ACTION Against Bench-Warmer Kaepernick’s “Inappropriate Behavior” Or We May Choose To “Not Work At Your Facilities”,police union threatens 49er’s with boycott: take action against bench-warmer kaepernick’s “inappropriate behavior” or we may choose to “not work at your facilities”,police union threatens _digit_ er’s with boycott : take action against bench _ warmer kaepernick’s “inappropriate behavior” or we may choose to “not work at your facilities”,NNS NN NNS NN NN IN NN : VB NN IN NN NN NN NNP NN NN CC PRP NNP VB TO NN NN IN PRP$ NN,police union threatens _digit_ boycott take action bench warmer kaepernick inappropriate behavior may choose work facility,1,1,0.229152,0.770848,1,...,0,0.975039,0.024961,1,1,0,0,0,0,0
20197,Trump HUMILIATED As WH Walks Back His Attack After Soldier’s Mom Confirms His Insensitive Phone Call,trump humiliated as wh walks back his attack after soldier’s mom confirms his insensitive phone call,trump humiliated as wh walks back his attack after soldier’s mom confirms his insensitive phone call,NN NN IN NN NNS RB PRP$ NN IN NN NN NNS PRP$ JJ NN VB,trump humiliated walk back attack soldier mom confirms insensitive phone call,1,1,0.001816,0.998184,1,...,1,0.001822,0.998178,0,0,0,0,0,0,0
4852,Congress should consider help for Puerto Rico's disabled: Task force,congress should consider help for puerto rico's disabled: task force,congress should consider help for puerto rico 's disabled : task force,NNP MD VB NN IN NN NNP POS JJ : NN NN,congress consider help puerto rico disabled task force,0,0,0.997361,0.002639,0,...,1,0.422287,0.577713,0,0,0,0,0,0,0
27112,CLINTON MEGA-CHARITY: “Slush Fund For The Clinton’s” Took In $140 Million… Gave Pittance In Direct Aid,clinton mega-charity: “slush fund for the clinton’s” took in $140 million… gave pittance in direct aid,clinton mega _ charity : “slush fund for the clinton’s” took in _digit_ million… gave pittance in direct aid,NN NN NN NN : NN NN IN DT NN NN IN NN NN VB NN IN JJ NN,clinton mega charity slush fund clinton took _digit_ million give pittance direct aid,1,1,0.063773,0.936227,1,...,0,0.965185,0.034815,0,1,0,0,0,0,0
15154,"Trump denounces attack in London, urges 'proactive' steps","trump denounces attack in london, urges 'proactive' steps","trump denounces attack in london , urges 'proactive ' steps","NN NNS NN IN NNP , NNS JJ '' NNS",trump denounces attack london urge proactive step,0,0,0.999822,0.000178,0,...,0,0.998154,0.001846,0,0,0,0,0,0,0
25290,Democrat Spills The Beans On Why Hispanics Love Obamacare So Much [Video],democrat spills the beans on why hispanics love obamacare so much [video],democrat spills the beans on why hispanics love obamacare so much [ video ],NNP NNS DT NNS IN WRB NNS VB NN RB JJ NN NN NN,democrat spill bean hispanic love obamacare much video,1,1,1e-06,0.999999,1,...,1,0.001726,0.998274,0,0,0,0,1,0,0
15696,Trump says he's sure Senator Paul will back Republican health plan,trump says he's sure senator paul will back republican health plan,trump says he 's sure senator paul will back republican health plan,NN VBZ PRP POS NN NN NNP MD RB JJ NN NN,trump say sure senator paul back republican health plan,0,0,0.645853,0.354147,1,...,0,0.998833,0.001168,0,0,0,0,0,0,0


Index(['org_title', 'lower_title', 'cleaned_words', 'cleaned_pos',
       'minimal_words', 'y_true', 'nb_og', 'nb_og_p0', 'nb_og_p1', 'nb_lo',
       'nb_lo_p0', 'nb_lo_p1', 'nb_cw', 'nb_cw_p0', 'nb_cw_p1', 'nb_ps',
       'nb_ps_p0', 'nb_ps_p1', 'nb_mw', 'nb_mw_p0', 'nb_mw_p1', 'fnn_og',
       'fnn_og_p0', 'fnn_og_p1', 'fnn_lo', 'fnn_lo_p0', 'fnn_lo_p1', 'fnn_cw',
       'fnn_cw_p0', 'fnn_cw_p1', 'fnn_ps', 'fnn_ps_p0', 'fnn_ps_p1', 'fnn_mw',
       'fnn_mw_p0', 'fnn_mw_p1', 'lstm_og', 'lstm_og_p0', 'lstm_og_p1',
       'lstm_lo', 'lstm_lo_p0', 'lstm_lo_p1', 'lstm_cw', 'lstm_cw_p0',
       'lstm_cw_p1', 'lstm_ps', 'lstm_ps_p0', 'lstm_ps_p1', 'lstm_mw',
       'lstm_mw_p0', 'lstm_mw_p1', 'too_long', 'noisy', 'slang', 'donald',
       'obama', 'hillary', 'bernie'],
      dtype='object')
