<a href="https://colab.research.google.com/github/minthammock/cap-stone/blob/dev/capstone_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [112]:
import tensorflow
import tensorflow.keras as keras
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

from sklearn.model_selection import train_test_split

from gensim.models import Word2Vec
import nltk
from nltk import word_tokenize

import pandas as pd
import numpy as np

import os

# Copy the repo from GitHub

In [2]:
# Fetch the project repository; a later cell reads ./cap-stone/politifact_info.json from it.
# NOTE(review): if ./cap-stone already exists in this session, git will presumably refuse to clone again — confirm that is acceptable on re-run.
! git clone https://github.com/minthammock/cap-stone

Cloning into 'cap-stone'...
remote: Enumerating objects: 23, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 23 (delta 3), reused 16 (delta 1), pack-reused 0[K
Unpacking objects: 100% (23/23), done.


# Load up and Explore the data

In [3]:
# Read the raw PolitiFact export from the cloned repository.
# NOTE(review): `orient=''` is not one of the documented orient values — confirm
# it is intentional before changing it, since it decides how the JSON is parsed.
dfTemp = pd.read_json('./cap-stone/politifact_info.json', orient='', )
# Rebuild a flat frame: for every column of dfTemp, take the value stored under
# label 0 and assign it as the same-named column of `df`.
# presumably label 0 holds the whole collection of values for that column — verify against the JSON layout
df = pd.DataFrame()
for column in dfTemp.columns:
  df[column] = dfTemp[column][0]

In [4]:
# First rows, summary statistics, and the distribution of the target labels,
# rendered one after another.
display(df.head(), df.describe(), df['truth_value'].value_counts())

Unnamed: 0,author,quote,truth_value
0,Facebook posts,Says the new coronavirus vaccines contain toxi...,pants-fire
1,Bloggers,21% of people are having serious adverse event...,false
2,Ted Cruz,Says that guidance urging vaccinated people to...,pants-fire
3,Facebook posts,A 2018 executive order gives Trump the ability...,pants-fire
4,Facebook posts,President Donald Trump will hold his inaugurat...,pants-fire


Unnamed: 0,author,quote,truth_value
count,18436,18436,18436
unique,4263,18413,9
top,Facebook posts,On changing the rules for filibusters on presi...,false
freq,1051,3,4240


false          4240
half-true      3265
mostly-true    3088
barely-true    3019
true           2348
pants-fire     2218
full-flop       162
half-flip        69
no-flip          27
Name: truth_value, dtype: int64

We aren't attempting to determine whether someone has changed their position, so we drop the rows whose rating describes a flip-flop (`full-flop`, `half-flip`, `no-flip`) and keep only the six truth-value ratings.

In [132]:
# Ratings that describe a change of position rather than the truth of a claim.
flip_ratings = ['full-flop', 'half-flip', 'no-flip']

# Index labels of every row carrying one of those ratings, dropped in one pass.
flip_index = df.index[df['truth_value'].isin(flip_ratings)]
dfFinal = df.drop(index = flip_index)
dfFinal['truth_value'].value_counts()

false          4240
half-true      3265
mostly-true    3088
barely-true    3019
true           2348
pants-fire     2218
Name: truth_value, dtype: int64

# Train Test Split Without Author

In [133]:
# Features are the raw quote text; the target is the PolitiFact rating.
X = dfFinal['quote']
y = dfFinal['truth_value']

# Hold out 10% of the rows for testing. The fixed random_state makes the split
# (and therefore the vocabulary, the one-hot targets and every score computed
# from them) identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .1, random_state = 42)

In [134]:
# Text vectorizer capped at 19089 tokens; every quote becomes a sequence of
# exactly 100 token ids.
vectorizer = TextVectorization(
    max_tokens=19089,
    output_sequence_length=100,
)
# The vocabulary is learned from the training quotes only.
vectorizer.adapt(np.array(X_train))

vocab = vectorizer.get_vocabulary()
# Lookup from token to its position in the vocabulary list.
vocab_index = {token: position for position, token in enumerate(vocab)}
print('There are {} unique tokens in the dataset.'.format(len(vocab)))
display(vocab[:10])
len(vocab_index)

There are 18960 unique tokens in the dataset.


['', '[UNK]', 'the', 'in', 'of', 'to', 'a', 'and', 'says', 'for']

18960

In [135]:
# Folder holding the pre-trained GloVe vectors.
# NOTE(review): this is a relative path that looks like a mounted Google Drive;
# no cell in view mounts it — confirm the mount happens before this cell runs.
pathToGlove = 'drive/MyDrive/NLP Open Source Models/Glove/'

# Map each GloVe token to its vector. Every line of the file is
# "<token> <v1> <v2> ...", so splitting once separates the token from the
# coefficients, which are then parsed as float32.
glove = {}
# Explicit UTF-8: the default encoding of open() depends on the platform locale,
# and the vocabulary contains non-ASCII tokens.
with open(pathToGlove+'glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        glove[word] = coefs

In [136]:
# One row per vocabulary entry, one column per GloVe dimension.
embedding_dim = 100
num_tokens = len(vocab)
embedding_matrix = np.zeros((num_tokens, embedding_dim))

# Copy the GloVe vector of every vocabulary word that has one. Rows for words
# without a GloVe entry keep their initial all-zero value.
hits = 0
misses = 0
for word, i in vocab_index.items():
    embedding_vector = glove.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1

print("Converted %d words (%d misses)" % (hits, misses))
input_dim = embedding_matrix.shape[0]

Converted 15232 words (3728 misses)


In [137]:
# One-hot encode both splits against the SAME ordered label set. Encoding each
# split on its own would silently produce fewer (or shifted) columns whenever a
# rating is absent from one of the splits.
truth_labels = sorted(y_train.dropna().unique())
y_train_OH = pd.get_dummies(pd.Categorical(y_train, categories=truth_labels)).values
y_test_OH = pd.get_dummies(pd.Categorical(y_test, categories=truth_labels)).values

# Integer-encode the training quotes with the Keras Tokenizer and pad every
# sequence to length 100.
tokenizer = text.Tokenizer(num_words=18960)
tokenizer.fit_on_texts(list(X_train))
list_tokenized_quotes = tokenizer.texts_to_sequences(X_train)
X_train_tokens = sequence.pad_sequences(list_tokenized_quotes, maxlen=100)
# Number of training quotes. Kept under its own name so it no longer overwrites
# `num_tokens`, which the embedding-matrix cell uses for the vocabulary size.
num_train_quotes = X_train_tokens.shape[0]
print(X_train_tokens.shape)

(16360, 100)


In [156]:
# Functional-API classifier:
#   raw quote string -> token ids (vectorizer) -> GloVe-initialised embeddings
#   -> LSTM -> average over the time steps -> two L2-regularised dense layers
#   -> 6-way softmax.
# The input tensor is named `text_input` so the builtin `input` is not shadowed.
text_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(text_input)
x = Embedding(
      input_dim = embedding_matrix.shape[0],
      # taken from the matrix so the layer cannot drift from the GloVe width
      output_dim = embedding_matrix.shape[1],
      embeddings_initializer=keras.initializers.Constant(embedding_matrix),
      trainable=True,
    )(x)
x = LSTM(100, return_sequences=True)(x)
x = GlobalAveragePooling1D()(x)
x = Dense(
      100, 
      activation='relu',
      kernel_regularizer = keras.regularizers.L2()
    )(x)
x = Dense(
      16, 
      activation='relu',
      kernel_regularizer = keras.regularizers.L2()
      )(x)
output = Dense(6, activation='softmax')(x)

model = keras.models.Model(text_input, output, name = 'glove_model')

model.compile(
    loss='categorical_crossentropy', 
    optimizer=keras.optimizers.RMSprop(momentum=.5), 
    metrics=[
      keras.metrics.CategoricalAccuracy(),
    ]
)

model.summary()

Model: "glove_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_25 (InputLayer)        [(None, 1)]               0         
_________________________________________________________________
text_vectorization_20 (TextV (None, 100)               0         
_________________________________________________________________
embedding_36 (Embedding)     (None, 100, 100)          1896000   
_________________________________________________________________
lstm_30 (LSTM)               (None, 100, 100)          80400     
_________________________________________________________________
global_average_pooling1d_29  (None, 100)               0         
_________________________________________________________________
dense_80 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_81 (Dense)             (None, 16)                

In [157]:
# Train on the raw training quotes, with 10% of the training data held out for
# validation.
fit_settings = dict(epochs=100, batch_size=64, validation_split=0.1)
modelHistory = model.fit(X_train, y_train_OH, **fit_settings)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [139]:
def create_model_visuals(model, X_train, y_train, X_test, y_test, batch_size, epochs, train = True, kwargs = None):
  """Optionally fit ``model``, evaluate it on the test data and plot its history.

  Parameters
  ----------
  model : object exposing ``fit`` and ``evaluate``
  X_train, y_train : training inputs and targets passed to ``model.fit``
  X_test, y_test : inputs and targets passed to ``model.evaluate``
  batch_size, epochs : forwarded to ``model.fit``
  train : when falsy, nothing is fitted, evaluated or plotted
  kwargs : dict, optional
      Extra keyword arguments forwarded to ``model.fit`` (for example
      ``validation_split`` or ``callbacks``). Defaults to no extra arguments.

  Returns
  -------
  (dfModel, params)
      ``dfModel`` is a DataFrame built from ``modelHistory.history`` (``None``
      when ``train`` is falsy); ``params`` is a dict of the arguments this
      function was called with.

  NOTE(review): the plotting code relies on ``plt``, ``skm`` and ``sns`` being
  bound in the notebook namespace; the import cell in view does not bind them —
  confirm they are imported before calling with ``train=True``.
  """
  # A fresh dict per call; a shared ``{}`` default would leak between calls.
  if kwargs is None:
    kwargs = {}
  # Snapshot of the call arguments, taken before any other local exists.
  params = dict(locals())

  def confustion_matrix(y, y_hat, normalize = 'true'):
    """Draw ``skm.confusion_matrix(y, y_hat)`` as an annotated heatmap."""
    fig, ax = plt.subplots(1,1,figsize = (7,6))
    matrix = skm.confusion_matrix(y, y_hat, normalize=normalize,)
    sns.heatmap(matrix, cmap = 'Blues', annot=True, ax = ax)
    ax.set(
      title = 'Confustion Matrix',
      xlabel = 'Predicted Label',
      ylabel = 'True Label'
    )

  if train:
    modelHistory = model.fit(
        x = X_train, 
        y = y_train,
        batch_size=batch_size,
        epochs=epochs,
        **kwargs)
    model.evaluate(x = X_test, y = y_test)
    dfModel = pd.DataFrame.from_dict(modelHistory.history)

    # (title, candidate history columns, y-axis label) for each panel.
    panels = [
      ('Loss Metrics', ['loss', 'val_loss'], 'Loss score'),
      ('Accuracy', ['accuracy', 'val_accuracy', 'categorical_accuracy', 'val_categorical_accuracy'], 'Accuracy Percentage'),
      ('Area Under The Curve', ['auc', 'val_auc'], 'Area Under The Curve'),
      ('Square Hinge', ['square_hinge', 'val_square_hinge'], 'Square Hinge'),
      ('Val True Positives', ['val_true_positives'], 'Number of True Positives'),
      ('Val True Negatives', ['val_true_negatives'], 'Number of True Negatives'),
    ]
    fig, axes = plt.subplots(nrows = 3,ncols = 2, figsize = (18,10))
    for ax, (title, columns, ylabel) in zip(axes.flatten(), panels):
      # Only plot what the history actually recorded; asking pandas for a
      # missing column raises KeyError and would discard the whole figure.
      recorded = [name for name in columns if name in dfModel.columns]
      if not recorded:
        ax.set_visible(False)
        continue
      dfModel.plot(y = recorded, ax = ax, title = title, xlabel = 'Training Generation', ylabel = ylabel);
    plt.tight_layout()
    plt.show()
     
  else:
    dfModel = None
 
  # y_test_hat = np.where(model.predict(X_test, y_test) > .39, 1,0).flatten()
  # y_test = testGenerator.y
  # confustion_matrix(y_test, y_test_hat)
  # dfTest = pd.DataFrame.from_dict(skm.classification_report(y_test, y_test_hat, output_dict=True))
  # display(dfTest)
 
  return dfModel,params

In [127]:
# we define the filepath where any results will be saved
firstModel_filepath = os.path.join('first_model')

# the Callbacks for keras.Model.fit() method. See Keras documentation for more info.
# All three watch val_loss: stop after 10 epochs without improvement, keep the
# best model on disk, and halve the learning rate after 2 stalled epochs.
firstModelEarlyStop = EarlyStopping(patience= 10, mode = 'auto', restore_best_weights=False, monitor='val_loss')
firstModelCheckpoint = ModelCheckpoint(firstModel_filepath,save_best_only=True, monitor='val_loss')
firstModelLRAdjust = ReduceLROnPlateau(monitor = 'val_loss', factor = .5, patience=2, min_delta=.00000000001)

# This cell runs the fitting 
# X_test must be the held-out quotes: passing the training quotes here would
# pair them with y_test_OH, whose row count and labels belong to the test split.
firstModelHistory,params = create_model_visuals(
    model = model, 
    X_train=np.array(X_train), 
    y_train = np.array(y_train_OH),
    X_test = np.array(X_test),
    y_test = np.array(y_test_OH),
    epochs = 150, 
    batch_size=64,
    kwargs = {
        'validation_split': .1,
        'callbacks':[firstModelCheckpoint, firstModelEarlyStop, firstModelLRAdjust]})

Epoch 1/150


ValueError: ignored

In [142]:
# Score the fitted model on the held-out quotes and their one-hot labels.
model.evaluate(x=X_test, y=y_test_OH)



[9.821587562561035, 0.2541254162788391]

# Pending stuff

## Baseline Model: Word2Vec Local Training?

In [55]:
# NOTE(review): `data` is not defined in any cell in view — presumably the tokenized quotes; confirm before re-running.
# NOTE(review): this rebinds `model`, the name the cells above use for the Keras classifier.
model = Word2Vec(data, size=100, window=5, min_count=1, workers=4)

In [56]:
# Train the Word2Vec model on `data` for 10 epochs, using the corpus size recorded on the model.
model.train(data, total_examples=model.corpus_count, epochs=10)

(2966813, 3885650)

In [57]:
# Keep a handle on the model's word vectors for the lookups that follow.
wv = model.wv

In [62]:
# Spot-check the embedding: the most similar words to "time", then its raw vector.
query_word = "time"
display(wv.most_similar(positive = [query_word]), wv[query_word])


[('point', 0.7773004174232483),
 ('ex-president', 0.722303032875061),
 ('term', 0.6774725317955017),
 ('week', 0.674705445766449),
 ('century', 0.6672503352165222),
 ('point-in-time', 0.6590580940246582),
 ('president', 0.6516431570053101),
 ('decade', 0.6503779888153076),
 ('session', 0.6480398178100586),
 ('course', 0.6419240832328796)]

array([ 0.76711375,  0.8935672 , -1.1876911 , -0.3295918 , -0.06705547,
       -1.3076243 , -1.8133975 ,  0.69168943, -0.16471986,  0.71398264,
       -0.24640153, -0.20698534, -0.95611113,  0.25964814,  0.050776  ,
        0.972497  ,  1.6574346 , -0.13140863, -0.02349065,  1.2209971 ,
       -1.508631  , -0.03084175,  1.026226  ,  0.32340807,  0.38479125,
       -0.16057101, -0.3934429 , -0.6635785 , -0.06906135, -1.2750412 ,
       -1.4661082 ,  1.3415856 , -1.1921365 ,  0.7172722 , -0.96247035,
       -0.70197767,  0.70521677, -1.0516729 , -0.91379696,  0.268338  ,
        0.6374075 ,  1.6403053 ,  1.2662332 , -0.8399998 , -1.1651558 ,
       -0.833719  ,  0.8290142 , -1.4032991 ,  1.2890466 , -0.97319454,
        0.8947347 ,  1.0041395 ,  1.3192818 ,  0.7980006 ,  0.96280426,
       -0.72451246,  1.0957555 , -0.6391643 , -2.477614  , -0.30923206,
        2.3254921 , -0.24521753,  0.31153694, -0.97788614,  0.12351059,
       -0.91432196,  0.17733483, -2.1693728 ,  1.8002627 , -0.32


There are 24360 unique tokens in the dataset.


## GloVe Model Transfer Learning

In [None]:
# NOTE(review): disabled draft of a scikit-learn style vectorizer that averages
# the word vectors of each document. As written, `__init__` sizes the vectors
# from the global `glove` rather than from its own `w2v` argument — confirm and
# fix before re-enabling.
# class W2vVectorizer(object):
    
#     def __init__(self, w2v):
#         # Takes in a dictionary of words and vectors as input
#         self.w2v = w2v
#         if len(w2v) == 0:
#             self.dimensions = 0
#         else:
#             self.dimensions = len(w2v[next(iter(glove))])
    
#     # Note: Even though it doesn't do anything, it's required that this object implement a fit method or else
#     # it can't be used in a scikit-learn pipeline  
#     def fit(self, X, y):
#         return self
            
#     def transform(self, X):
#         return np.array([
#             np.mean([self.w2v[w] for w in words if w in self.w2v]
#                    or [np.zeros(self.dimensions)], axis=0) for words in X])