In [590]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding
from keras.optimizers import SGD
from keras.layers import Input, Dense, Dropout, Bidirectional, SimpleRNN, Reshape
from keras.models import Model, Sequential
from keras import callbacks
import tensorflow as tf
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pandas as pd
import gensim
import random
from gensim.models import Word2Vec
import spacy
import numpy as np
nlp = spacy.load('en_core_web_sm')

In [591]:
### opening, reading, and closing .txt files ###
demtext = open("C:/Users/nickp/OneDrive/Desktop/PartyPred/Democrat/dem.txt", 'r', encoding = 'utf-8')
reptext = open("C:/Users/nickp/OneDrive/Desktop/PartyPred/Republican/repub.txt", 'r', encoding = 'utf-8')
rawtext = sent_tokenize(demtext.read())
dem_numsents = len(rawtext)
for sent in sent_tokenize(reptext.read()):
    rawtext.append(sent)
rep_numsents = len(rawtext) - dem_numsents

demtext.close()
reptext.close()

In [592]:
### creating training/test splits at 80% training, 20% test ###
## note:  data is later split into training/validation/testing at 60/20/20 ##
labels = []
for i in range(0,dem_numsents):
    labels.append(0)
for i in range(0, rep_numsents):
    labels.append(1)
alldata = np.column_stack((rawtext, labels))
np.random.shuffle(alldata)
cutoff = int(0.8*len(labels))
trainsents = alldata[:cutoff, 0]
trainlabels = alldata[:cutoff, 1]
testsents = alldata[cutoff:,0]
testlabels = alldata[cutoff:, 1]
trainlabels = trainlabels.astype(np.float)
testlabels = testlabels.astype(np.float)

In [593]:
### creating encoder with vocab_size = # tokens ###
vocab_size = 14290
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens = vocab_size)
encoder.adapt(trainsents)
vocab = np.array(encoder.get_vocabulary())

In [594]:
### Simple first-run model using Tensorflow embedding ###
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=64,
        mask_zero=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

opt = tf.keras.optimizers.Adam(learning_rate = 0.0001)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
earlystop = callbacks.EarlyStopping(monitor='val_loss',
                                   mode = 'min',
                                   patience = 5,
                                   restore_best_weights = True)
model.compile(loss=loss,
              optimizer=opt,
              metrics=['accuracy'])

In [595]:
history = model.fit(trainsents, trainlabels, epochs = 10, batch_size = 128, verbose = 1, validation_split = 0.25, callbacks = [earlystop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [596]:
model.evaluate(testsents, testlabels)



[0.5989043712615967, 0.6563740372657776]

In [597]:
### vectorization with Spacy pretrained word embeddings ###
tokenizer = Tokenizer()
tokenizer.fit_on_texts(rawtext)
trainseqs = tokenizer.texts_to_sequences(rawtext)
word_index = tokenizer.word_index
df_text = pd.DataFrame({'rawtext': rawtext})
df_text['trainseqs'] = df_text.rawtext.apply(lambda x:tokenizer.texts_to_sequences([x])[0])
maxlen = 20 #avg sentence length is approx. 17
encoded_sents = pad_sequences(trainseqs, maxlen = maxlen, padding = 'post', truncating = 'post')
df_index_word = pd.Series(tokenizer.index_word)
df_index_word_valid = df_index_word[:]
df_index_word_valid = pd.Series(['place_holder']).append(df_index_word_valid)
df_index_word_valid = df_index_word_valid.reset_index()
df_index_word_valid.columns = ['tokenid','token']
df_index_word_valid['word2vec'] = df_index_word_valid.token.apply(lambda x: nlp(x).vector)
df_index_word_valid['is_oov'] = df_index_word_valid.token.apply(lambda x: nlp(x)[0].is_oov)
df_index_word_valid.at[0,'word2vec'] = np.zeros_like(df_index_word_valid.at[0,'word2vec'])
embed_matrix = np.array([vec for vec in df_index_word_valid.word2vec.values])
embed_dim = embed_matrix.shape[1]
embed_layer = Embedding(input_dim = 12652, output_dim = embed_dim, embeddings_initializer = Constant(embed_matrix), input_length=maxlen, mask_zero = True, trainable = False)
embedded_sents = (embed_layer(encoded_sents)).numpy()
embedded_sents = embedded_sents.tolist()
for i in range(len(embedded_sents)):
    embedded_sents[i].append(labels[i])
embedded_sents = np.array(embedded_sents)

In [598]:
### shuffling and creating train/test splits ###
np.random.shuffle(embedded_sents)
cutoff = int(0.8*len(labels))
train_data = embedded_sents[:cutoff, :20]
test_data = embedded_sents[cutoff:,:20]

train_labels = []
for i in range(cutoff):
    train_labels.append(embedded_sents[i][20])
train_labels = np.array(train_labels)
dupl = train_labels
train_labels = np.column_stack((train_labels, dupl))

for i in range(len(train_labels)):
    if train_labels[i][1] == 0:
        train_labels[i][0] = 1
    if train_labels[i][1] == 1:
        train_labels[i][0] = 0

test_labels = []
for i in range(len(embedded_sents)-cutoff):
    test_labels.append(embedded_sents[i+cutoff][20])
test_labels = np.array(test_labels)
dupl = test_labels
test_labels = np.column_stack((test_labels, dupl))

for i in range(len(test_labels)):
    if test_labels[i][1] == 0:
        test_labels[i][0] = 1
    if test_labels[i][1] == 1:
        test_labels[i][0] = 0

In [599]:
### adjusting data types to create tensors for input and one-hot encoded labels ###
train_sents = []
for k in range(len(train_data)):
    temp = []
    for j in range(20):
        for i in range(96):
            temp.append(train_data[k][j][i]) 
    temp = np.array(temp)
    train_sents.append(temp)
train_sents = np.array(train_sents)

test_sents = []
for k in range(len(test_data)):
    temp = []
    for j in range(20):
        for i in range(96):
            temp.append(test_data[k][j][i]) 
    temp = np.array(temp)
    test_sents.append(temp)
test_sents = np.array(test_sents)
tdata = tf.convert_to_tensor(train_sents)
ldata = tf.convert_to_tensor(train_labels)
evalsents = tf.convert_to_tensor(test_sents)
evallabels = tf.convert_to_tensor(test_labels)

In [601]:
### Creating model ###
model2 = Sequential()
model2.add(Input(shape=(1920,)))
model2.add(Dropout(0.2))
model2.add(Reshape((1,1920)))
model2.add(Bidirectional(tf.keras.layers.LSTM(64)))
model2.add(Dense(64, activation='relu'))
model2.add(Dense(2, activation = 'softmax'))


opt = tf.keras.optimizers.Adam(learning_rate = 0.001)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
earlystop = callbacks.EarlyStopping(monitor='val_loss',
                                   mode = 'min',
                                   patience = 5,
                                   restore_best_weights = True)
model2.compile(loss=loss,
              optimizer=opt,
              metrics=['accuracy'])

In [602]:
history = model2.fit(tdata, ldata, epochs = 100, batch_size = 128, verbose = 1, validation_split = 0.25, callbacks = [earlystop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
