# Set constant

In [1]:
max_len = 20 # max sentence size

# Load libaries

In [2]:
from freq_utils import *

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score

from tensorflow.keras.utils import to_categorical

pd.options.display.max_colwidth = 200

# Load dataset

In [3]:
df0 = pd.read_csv('data/TrueOrganized.csv')
df1 = pd.read_csv('data/FakeOrganized.csv')
df0['label'] = 0
df1['label'] = 1

In [4]:
df = df0.sample(2)
display(df)

Unnamed: 0.1,Unnamed: 0,title,pos,cleaned_words,cleaned_pos,minimal_words,org_title,label
10376,10376,"Trump snags former rival's backing, scraps Chicago rally","[('Trump', 'NNP'), ('snags', 'NNS'), ('former', 'JJ'), ('rival', 'NN'), (""'s"", 'POS'), ('backing', 'NN'), (',', ','), ('scraps', 'NNS'), ('Chicago', 'NNP'), ('rally', 'NN')]",trump snags former rival s backing scraps chicago rally,NNP NNS JJ NN POS NN NNS NNP NN,trump snag former rival backing scrap chicago rally,"Trump snags former rival's backing, scraps Chicago rally",0
13235,13235,Bounce for Australian PM as voters tire of leadership roundabout,"[('Bounce', 'NN'), ('for', 'IN'), ('Australian', 'JJ'), ('PM', 'NNP'), ('as', 'IN'), ('voters', 'NNS'), ('tire', 'VBP'), ('of', 'IN'), ('leadership', 'NN'), ('roundabout', 'NN')]",bounce for australian pm as voters tire of leadership roundabout,NN IN JJ NNP IN NNS VBP IN NN NN,bounce australian voter tire leadership roundabout,Bounce for Australian PM as voters tire of leadership roundabout,0


# Make dictionaries

### Pretrained word embeddings
- Word to index
- Word to vector

In [5]:
word_to_index, index_to_word, word_to_vector = get_pretrained_embedding()

### PoS tag encodings
- PoS word to index

In [6]:
df = pd.concat([df0.cleaned_pos, df1.cleaned_pos])

pos_set = set()
for x in list(df.str.lower().str.split()):
    pos_set.update(x)

pos_list = list(pos_set)
pos_to_index = { pos_list[i]: i for i in range(len(pos_list)) }

print(pos_to_index)

{'jj': 0, 'jjr': 1, 'ex': 2, 'wdt': 3, 'uh': 4, 'vbp': 5, 'nnps': 6, 'vbn': 7, 'pos': 8, 'vbz': 9, 'wp$': 10, ':': 11, "''": 12, 'wrb': 13, 'nn': 14, 'rp': 15, 'jjs': 16, 'cc': 17, '$': 18, 'dt': 19, 'nns': 20, 'wp': 21, 'to': 22, 'rbr': 23, 'pdt': 24, 'vbg': 25, 'prp': 26, 'sym': 27, 'vbd': 28, 'fw': 29, 'in': 30, 'prp$': 31, 'vb': 32, 'rbs': 33, 'rb': 34, '.': 35, 'cd': 36, 'nnp': 37, 'md': 38}


In [7]:
len(pos_to_index)

39

# Train/dev/test split

In [8]:
train, dev, test = train_dev_test_split([df0, df1], m=20000, class_column='label', 
                                    class_balance=True, r_dev=0.2, r_test=0.2, rand_state=42)

# Define models

In [9]:
def simple_NB(train,dev,test,Xname='title',Yname='label'):
    
    train = pd.concat([train,dev])
    
    X_train = train[Xname].tolist()
    Y_train = train[Yname].tolist()

    X_test = test[Xname].tolist()
    Y_test = test[Yname].tolist()
    
    counter = CountVectorizer()

    counter.fit(X_train+X_test)

    train_counts = counter.transform(X_train)
    test_counts = counter.transform(X_test)

    #print(counter.vocabulary_)

    classifier = MultinomialNB()
    classifier.fit(train_counts,Y_train)
    
    predict = classifier.predict(test_counts)
    
    proba = classifier.predict_proba(test_counts)
    
    model_name = 'Naive Bayse - '+Xname
    
    
    return model_name, predict, Y_test, proba, classifier, counter

In [10]:
def model_FNN(input_shape, word_to_index, word_to_vector, n_class=2, trainable=False):
    '''
    input_shape: (max_len,)
    word_to_index: word to index dictionary
    word_to_vector: word to embedding vector dictionary

    return model

    then
    X: Indices of a sentence (m, max_len)
    Y: Class probability, one hot vector (m, # classes)
    '''

    # Input layer
    # X_indices (m, max_len)
    X_input = tfl.Input(shape=input_shape, dtype='int32')
    
    X = X_input
    if bool(word_to_vector):
        # Word embeding
        # Output: (m, max_len, emb_dim)
        # Embedding layer
        embedding_layer = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)
        X = embedding_layer(X)      
    else:
        # One hot encoding (=input)
        # Output: (m, max_len, num_cat)
        X = tf.dtypes.cast(X, tf.float32)    

    # Masking layer
    # skip zero vector words
    X = tfl.Masking(mask_value=0.)(X)
    
    # Take average
    max_len = X.shape[1]
    # Make a list from slice
    X_avg = [ X[:,i,:] for i in range(max_len) ]
    # Take average of a sentence
    X = tf.keras.layers.Average()(X_avg)    
        
    # Linear+ReLu layer
    X = tfl.Dense(units = 128, activation='relu')(X)
    X = tfl.Dropout(rate = 0.4)(X)  

    # Linear+ReLu layer
    X = tfl.Dense(units = 64, activation='relu')(X)
    X = tfl.Dropout(rate = 0.4)(X) 

    # Linear+ReLu layer
    X = tfl.Dense(units = 32, activation='relu')(X)
    X = tfl.Dropout(rate = 0.2)(X) 

    # Linear+Softmax layer
    # Output: y (m, # classes), probability of each class
    X = tfl.Dense(units = n_class, activation='softmax')(X)

    # Model
    model = tf.keras.models.Model(inputs=X_input, outputs=X)

    return model

In [11]:
def model_LSTM(input_shape, word_to_index, word_to_vector=False, n_class=2, trainable=False):
    '''
    input_shape: (max_len,) or (max_len, num_cat)
    word_to_index: word to index dictionary, False for one hot encoding
    word_to_vector: word to embedding vector dictionary, False for one hot encoding

    return model

    then
    X: Indices of a sentence (m, max_len)
    Y: Class probability, one hot vector (m, # classes)
    '''

    # Input layer
    # X_oh (m, max_len, num_cat)
    # X_indices (m, max_len)
    X_input = tfl.Input(shape=input_shape, dtype='int32')

    # Embedding layer
    #embedding_layer = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)
    #X = embedding_layer(X_indices)   

    # By default, assumes one hot vector input
    # If word_to_index, word_to_vector is provided, add an embedding layer
    X = X_input
    if bool(word_to_vector):
        # Word embeding
        # Output: (m, max_len, emb_dim)
        # Embedding layer
        embedding_layer = pretrained_embedding_layer(word_to_vector, word_to_index, trainable=trainable)
        X = embedding_layer(X)      
        # Masking layer
        # skip zero vector words
        X = tfl.Masking(mask_value=0.)(X)
        
    else:
        X = tf.dtypes.cast(X, tf.float32)

    # LSTM layer
    # Output: a[1] (m, max_len, 128 hidden unit), batch of sequences
    X = tfl.LSTM(units = 128, return_sequences= True)(X)
    X = tfl.Dropout(rate = 0.5 )(X) 

    # LSTM layer
    # Output: a[2]<max_len> (m, 128 hidden unit)
    X = tfl.LSTM(units = 128)(X)
    X = tfl.Dropout(rate = 0.5)(X)  

    # Linear layer
    # Output: a[3] (m, # classes)
    X = tfl.Dense(units = n_class)(X)

    # Softmax layer
    # Output: y (m, # classes), probability of each class
    X = tfl.Activation('softmax')(X)

    # Model
    model = tf.keras.models.Model(inputs=X_input, outputs=X)

    return model

# Model wrapper functions

In [12]:
def run_multiple_NB(model_vars,train,dev,test,Yname='label'):

    results = []

    for var in model_vars:
        
        Xname = var
        
        model_name, y_pred, y_true, proba, classifier, counter = \
            simple_NB(train,dev,test,Xname=Xname,Yname=Yname)
        
        x = test[Xname].to_numpy()
        
        model_name = model_name+' '+Xname
        
        results.append([model_name, x, y_pred, y_true, proba, classifier, counter])
        
    return results

In [13]:
def run_multiple_NN(model_vars, train, dev, test, pos_to_index, word_to_index=False, word_to_vector=False, 
                        Yname='label', max_len=20, n_class=2, 
                        epochs = 20, batch_size = 32, patience=2, trainable=False):
    
    
    results = []

    for i in range(len(model_vars)):
            
        model_name = model_vars[i][0]
        func_model = model_vars[i][1]
        Xname = model_vars[i][2]
        use_embeddings = model_vars[i][3]
        
        model_name = model_name+' '+Xname
        
        w2i = False
        w2v = False
        X_shape = False
        
        if use_embeddings:
            w2i = word_to_index        
            w2v = word_to_vector
            X_shape = (max_len, )
        else:
            w2i = pos_to_index
            X_shape = (max_len, len(pos_to_index))

        _, _, X_train_indices, _, Y_train_oh = dataframe_to_arrays(train, w2i, max_len, Xname=Xname)
        _, _, X_dev_indices,   _, Y_dev_oh   = dataframe_to_arrays(dev, w2i, max_len, Xname=Xname)
        index, _, X_test_indices, Y_test, _  = dataframe_to_arrays(test, w2i, max_len, Xname=Xname)

        
        X = False
        if use_embeddings:
            X = [X_train_indices, X_dev_indices, X_test_indices]
        else:
            X = [to_categorical(X_train_indices, num_classes=len(pos_to_index)), 
                 to_categorical(X_dev_indices, num_classes=len(pos_to_index)), 
                 to_categorical(X_test_indices, num_classes=len(pos_to_index))]            

        
        Y = [Y_train_oh, Y_dev_oh, Y_test]

        
    
        print('should match all: ', X_shape, X[0].shape, X[1].shape, X[2].shape)
    
        model = func_model(X_shape, w2i, w2v, n_class, trainable)
        model.summary()
                    
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience)  
        
        history = False
        
        if patience :
            history = model.fit(X[0], Y[0], 
                                epochs = epochs, batch_size = batch_size, shuffle=True, 
                                validation_data=(X[1], Y[1]),
                                callbacks=[es])
        else:
            history = model.fit(X[0], Y[0], 
                                epochs = epochs, batch_size = batch_size, shuffle=True, 
                                validation_data=(X[1], Y[1]))
            
        proba = model.predict(X[2])
        y_pred = [np.argmax(proba[i]) for i in range(len(proba))]
        y_true = Y[2]
        x = X[2]
        
        results.append([model_name, x, y_pred, y_true, proba, model, history])
    
    return results

# Run models

In [14]:
# Naive Bayes
#simple_NB(train,dev,test,Xname='title',Yname='label') 
nn_vars = ['org_title','cleaned_words','minimal_words','cleaned_pos' ]
res_nb =  run_multiple_NB(nn_vars,train,dev,test,Yname='label')
    

# NN
nn_vars = [['LSTM', model_LSTM, 'org_title', True],
            ['LSTM', model_LSTM, 'cleaned_words', True], 
              ['LSTM', model_LSTM, 'cleaned_pos', False],
           ['LSTM', model_LSTM, 'minimal_words', True],
             ['FNN', model_FNN, 'org_title', True],
            ['FNN', model_FNN, 'cleaned_words', True],
            ['FNN', model_FNN, 'minimal_words', True],
            ['FNN', model_FNN, 'cleaned_pos', False]]

res_nn = run_multiple_NN(nn_vars, train, dev, test,
                         pos_to_index, word_to_index, word_to_vector, 
                         Yname='label', max_len=max_len, n_class=2,
                         epochs = 50, batch_size = 32, patience=4, trainable=True)

# Add NB and NN
results = res_nb + res_nn


should match all:  (20,) (12000, 20) (4000, 20) (4000, 20)


2022-02-19 14:41:37.237836: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20)]              0         
                                                                 
 embedding (Embedding)       (None, 20, 50)            20000050  
                                                                 
 masking (Masking)           (None, 20, 50)            0         
                                                                 
 lstm (LSTM)                 (None, 20, 128)           91648     
                                                                 
 dropout (Dropout)           (None, 20, 128)           0         
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0     

Epoch 10/50
Epoch 11/50
Epoch 00011: early stopping
should match all:  (20,) (12000, 20) (4000, 20) (4000, 20)
Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 20)]              0         
                                                                 
 embedding_2 (Embedding)     (None, 20, 50)            20000050  
                                                                 
 masking_2 (Masking)         (None, 20, 50)            0         
                                                                 
 lstm_6 (LSTM)               (None, 20, 128)           91648     
                                                                 
 dropout_6 (Dropout)         (None, 20, 128)           0         
                                                                 
 lstm_7 (LSTM)               (None, 128)               131584    
              

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 00007: early stopping
should match all:  (20,) (12000, 20) (4000, 20) (4000, 20)
Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, 20, 50)       20000050    ['input_6[0][0]']                
                                                                                                  
 masking_4 (Masking)            (None, 20, 50)       0           ['embedding_4[0][0]']            
                                                                                                  
 tf.__operators__.getitem_20 (S  (None

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 00007: early stopping
should match all:  (20,) (12000, 20) (4000, 20) (4000, 20)
Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_5 (Embedding)        (None, 20, 50)       20000050    ['input_7[0][0]']                
                                                                                                  
 masking_5 (Masking)            (None, 20, 50)       0           ['embedding_5[0][0]']            
                                                                                                  
 tf.__operators__.getitem_40 (S  (None

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 00007: early stopping
should match all:  (20, 39) (12000, 20, 39) (4000, 20, 39) (4000, 20, 39)
Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_8 (InputLayer)           [(None, 20, 39)]     0           []                               
                                                                                                  
 tf.cast_1 (TFOpLambda)         (None, 20, 39)       0           ['input_8[0][0]']                
                                                                                                  
 masking_6 (Masking)            (None, 20, 39)       0           ['tf.cast_1[0][0]']              
                                                                                                  
 tf.__operators__.getit

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 00019: early stopping


# Print result

In [15]:
def print_result(results):
    
    for result in results:
        
        model_name, x, y_pred, y_true, proba = result[:5]
        
        print(model_name)
        print('accuracy: ',accuracy_score(y_true, y_pred))
        print('precision: ',precision_score(y_true, y_pred))
        print('recall: ',recall_score(y_true, y_pred))
        print('f1: ',f1_score(y_true, y_pred))
        print('\n')

In [16]:
print_result(results)

#print(len(res_nn))

Naive Bayse - org_title org_title
accuracy:  0.9495
precision:  0.9446640316205533
recall:  0.955044955044955
f1:  0.9498261301539991


Naive Bayse - cleaned_words cleaned_words
accuracy:  0.95
precision:  0.9473684210526315
recall:  0.9530469530469531
f1:  0.950199203187251


Naive Bayse - minimal_words minimal_words
accuracy:  0.94175
precision:  0.9367901234567901
recall:  0.9475524475524476
f1:  0.9421405512788676


Naive Bayse - cleaned_pos cleaned_pos
accuracy:  0.944
precision:  0.9489898989898989
recall:  0.9385614385614386
f1:  0.9437468608739327


LSTM org_title
accuracy:  0.92825
precision:  0.9413278435409161
recall:  0.9135864135864136
f1:  0.9272496831432193


LSTM cleaned_words
accuracy:  0.94675
precision:  0.9281953087601723
recall:  0.9685314685314685
f1:  0.9479344903446589


LSTM cleaned_pos
accuracy:  0.97775
precision:  0.9794486215538847
recall:  0.9760239760239761
f1:  0.9777332999749812


LSTM minimal_words
accuracy:  0.92975
precision:  0.9296055916125812
reca

# Organize results into a dataframe

In [78]:

#column_name = ['org_title','cleaned_words','minimal_words','cleaned_pos']+['y_true']+model_name

# model_name, x, y_pred, y_true, proba = result[:5]

df_result = []

for i in range(len(results)):

    df = pd.DataFrame({
        'x': results[i][1].tolist(),
        'y_true': results[i][3],
        'y_pred': results[i][2],
        'proba0': [results[i][4][j][0] for j in range(len(results[i][1])) ],
        'proba1': [results[i][4][j][1] for j in range(len(results[i][1])) ]
    })
    df_result.append(df)


name_y_true = [results[i][0] for i in range(len(results))]
name_proba0 = [results[i][0]+'_p0' for i in range(len(results))]
name_proba1 = [results[i][0]+'_p1' for i in range(len(results))]

names = []
for i in range(len(name_y_true)):
    names.append(name_y_true[i])
    names.append(name_proba0[i])
    names.append(name_proba1[i])

In [88]:
display(df_result[7])

Unnamed: 0,x,y_true,y_pred,proba0,proba1
0,"[323395.0, 239792.0, 337662.0, 307315.0, 61174.0, 274823.0, 296974.0, 323224.0, 152213.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0,0,0.999987,0.000013
1,"[366138.0, 96372.0, 324820.0, 266870.0, 153136.0, 219178.0, 363117.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,0,0.999343,0.000657
2,"[152927.0, 295690.0, 188287.0, 317365.0, 221826.0, 244015.0, 111940.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0,0,0.999998,0.000002
3,"[105014.0, 165523.0, 160418.0, 198213.0, 223628.0, 308331.0, 110542.0, 53499.0, 135530.0, 105014.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,1,0.000011,0.999989
4,"[129696.0, 224517.0, 112054.0, 341875.0, 239623.0, 90384.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0,0,0.999945,0.000055
...,...,...,...,...,...
3995,"[164533.0, 218568.0, 75820.0, 358536.0, 389401.0, 315390.0, 363748.0, 280944.0, 174296.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,1,0.000010,0.999990
3996,"[366138.0, 164934.0, 244641.0, 127455.0, 378066.0, 168566.0, 251645.0, 162051.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,1,0.000008,0.999992
3997,"[383736.0, 146690.0, 108700.0, 171968.0, 326241.0, 268174.0, 379446.0, 300124.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1,1,0.000008,0.999992
3998,"[89824.0, 323224.0, 110194.0, 381091.0, 264281.0, 211025.0, 248403.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0,0,0.999567,0.000433


In [89]:
# x names..., y_true, model1 y_pred, model1 proba0, model1, proba1, model2 y_pred, ...

column_name = ['org_title','cleaned_words','minimal_words','cleaned_pos']+['y_true']+names
print(column_name)

['org_title', 'cleaned_words', 'minimal_words', 'cleaned_pos', 'y_true', 'Naive Bayse - org_title org_title', 'Naive Bayse - org_title org_title_p0', 'Naive Bayse - org_title org_title_p1', 'Naive Bayse - cleaned_words cleaned_words', 'Naive Bayse - cleaned_words cleaned_words_p0', 'Naive Bayse - cleaned_words cleaned_words_p1', 'Naive Bayse - minimal_words minimal_words', 'Naive Bayse - minimal_words minimal_words_p0', 'Naive Bayse - minimal_words minimal_words_p1', 'Naive Bayse - cleaned_pos cleaned_pos', 'Naive Bayse - cleaned_pos cleaned_pos_p0', 'Naive Bayse - cleaned_pos cleaned_pos_p1', 'LSTM org_title', 'LSTM org_title_p0', 'LSTM org_title_p1', 'LSTM cleaned_words', 'LSTM cleaned_words_p0', 'LSTM cleaned_words_p1', 'LSTM cleaned_pos', 'LSTM cleaned_pos_p0', 'LSTM cleaned_pos_p1', 'LSTM minimal_words', 'LSTM minimal_words_p0', 'LSTM minimal_words_p1', 'FNN org_title', 'FNN org_title_p0', 'FNN org_title_p1', 'FNN cleaned_words', 'FNN cleaned_words_p0', 'FNN cleaned_words_p1', 'FN

In [94]:
display(df_result[0])

Unnamed: 0,x,y_true,y_pred,proba0,proba1
0,U.S. seeks meeting soon to revive Asia-Pacific 'Quad' security forum,0,0,1.000000e+00,4.968817e-08
1,"TRUMP CHALLENGES SENATORS ON OBAMACARE: “Frankly, we shouldn’t leave town.” [Video]",1,1,4.479954e-03,9.955200e-01
2,France pushes U.N. to impose sanctions over Libya migrant crisis,0,0,9.999958e-01,4.238415e-06
3,WHY COLLEGE GRADS CAN’T GET JOBS: List Of Most Ridiculous Courses At Some Of America’s Most Elite (Expensive) Colleges,1,1,2.477089e-07,9.999998e-01
4,"Drug lobby, under criticism, starts media campaign",0,0,9.937148e-01,6.285222e-03
...,...,...,...,...,...
3995,This GOP Lawmaker Is A Bigger Threat To Women’s Safety Than The Trans People He Hates,1,1,9.816543e-07,9.999990e-01
3996,Trump Got $6 Million In Donations For Vets – Guess How Much He Gave Them (VIDEO),1,1,1.293993e-10,1.000000e+00
3997,WATCH: Female Cop Halts Sex Offender’s Violent Rampage,1,1,1.081663e-04,9.998918e-01
3998,U.S. calls for U.N. Security Council vote on North Korea on Monday,0,0,9.999667e-01,3.328095e-05


In [124]:
dict_name = []
dict_name.append({'x':'org_title',    'y_pred':'nb_og','proba0':'nb_og_p0','proba1':'nb_og_p1'})
dict_name.append({'x':'cleaned_words','y_pred':'nb_cw','proba0':'nb_cw_p0','proba1':'nb_cw_p1'})
dict_name.append({'x':'minimal_words','y_pred':'nb_mw','proba0':'nb_mw_p0','proba1':'nb_mw_p1'})
dict_name.append({'x':'cleaned_pos',  'y_pred':'nb_ps','proba0':'nb_ps_p0','proba1':'nb_ps_p1'})

dict_name.append({'y_pred':'lstm_og','proba0':'lstm_og_p0','proba1':'lstm_og_p1'})
dict_name.append({'y_pred':'lstm_cw','proba0':'lstm_cw_p0','proba1':'lstm_cw_p1'})
dict_name.append({'y_pred':'lstm_ps','proba0':'lstm_ps_p0','proba1':'lstm_ps_p1'})
dict_name.append({'y_pred':'lstm_mw','proba0':'lstm_mw_p0','proba1':'lstm_mw_p1'})

dict_name.append({'y_pred':'fnn_og','proba0':'fnn_og_p0','proba1':'fnn_og_p1'})
dict_name.append({'y_pred':'fnn_cw','proba0':'fnn_cw_p0','proba1':'fnn_cw_p1'})
dict_name.append({'y_pred':'fnn_mw','proba0':'fnn_mw_p0','proba1':'fnn_mw_p1'})
dict_name.append({'y_pred':'fnn_ps','proba0':'fnn_ps_p0','proba1':'fnn_ps_p1'})

df = []

df.append(df_result[0].rename(columns = dict_name[0]))
for i in range(1,4):
    df.append(df_result[i].drop(['y_true'], axis=1, inplace=False).rename(columns = dict_name[i]))

for i in range(4,12):
    df.append(df_result[i].drop(['x','y_true'], axis=1, inplace=False).rename(columns = dict_name[i]))

In [125]:
df_x = pd.merge(df[0].org_title.to_frame(), df[1].cleaned_words.to_frame(), left_index=True, right_index=True)
df_x = pd.merge(df_x, df[2].minimal_words.to_frame(), left_index=True, right_index=True)
df_x = pd.merge(df_x, df[3].cleaned_pos.to_frame(), left_index=True, right_index=True)

df_y = df[0].y_true.to_frame()

df[0].drop(['y_true','org_title'], axis=1, inplace=True)
df[1].drop(['cleaned_words'], axis=1, inplace=True)
df[2].drop(['minimal_words'], axis=1, inplace=True)
df[3].drop(['cleaned_pos'], axis=1, inplace=True)


df_p = pd.merge(df[0],df[1], left_index=True, right_index=True)
for i in range(2,12):
    
    df_p = pd.merge(df_p,df[i], left_index=True, right_index=True)
    
df_final = pd.merge(df_x, df_y, left_index=True, right_index=True)
df_final = pd.merge(df_final, df_p, left_index=True, right_index=True)


In [129]:
display(df_final.sample(2))

Unnamed: 0,org_title,cleaned_words,minimal_words,cleaned_pos,y_true,nb_og,nb_og_p0,nb_og_p1,nb_cw,nb_cw_p0,...,fnn_og_p1,fnn_cw,fnn_cw_p0,fnn_cw_p1,fnn_mw,fnn_mw_p0,fnn_mw_p1,fnn_ps,fnn_ps_p0,fnn_ps_p1
2055,ANTI-TRUMP VANDALS Hit Dr. Ben Carson’s House…Paint “F*ck Trump” on House [Video],anti trump vandals hit dr . ben carson s house paint _mytag_slans_ trump on house _mytag_parentheses_,anti trump vandal hit ben carson house paint _mytag_slans_ trump house _mytag_parentheses_,JJ NNP NNP NNP NNP NNP NNP NNP NNP IN NNP NN,1,1,5e-06,0.999995,1,1.550607e-07,...,0.984826,1,0.046924,0.953076,1,0.455886,0.544114,1,1.5e-05,0.999985
1231,Trump wins Scottish golf course privacy rights case,trump wins scottish golf course privacy rights case,trump win scottish golf course privacy right case,NNP VBZ JJ NN NN NN NNS NN,0,0,0.990271,0.009729,0,0.9901142,...,0.000701,0,0.993236,0.006764,0,0.946209,0.053791,0,0.999998,2e-06


In [132]:
df_final.to_csv('data/model_compare.csv')