In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as hub

from keras.regularizers import l1, l2
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras import backend
from keras import optimizers

from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import SMOTE

import random

Using TensorFlow backend.


In [2]:
SEED = 10
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# p = 0.4
# df = pd.concat([pd.read_csv('articles2.csv',skiprows=lambda i: 1>0 and random.random() > p),
#                 pd.read_csv('articles1.csv',skiprows=lambda i: 1>0 and random.random() > p)],
#                 axis=1)

In [4]:
df = pd.read_csv('tweets.csv')

In [5]:
df

Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...
...,...,...,...
86455,Republican,RepTomPrice,Check out my op-ed on need for End Executive O...
86456,Republican,RepTomPrice,"Yesterday, Betty &amp; I had a great time lear..."
86457,Republican,RepTomPrice,We are forever grateful for the service and sa...
86458,Republican,RepTomPrice,Happy first day of school @CobbSchools! #CobbB...


In [6]:
reps_to_include = ['PeteSessions', 'EleanorNorton']
bool_mask = df['Handle'].isin(reps_to_include)
df = df[bool_mask][['Tweet','Party']]

In [7]:
df

Unnamed: 0,Tweet,Party
38468,"Today, Senate Democrats vote to #SaveTheIntern...",Democrat
38469,"Big #DCStatehood deal. Thank you, Senator @maz...",Democrat
38470,RT @MayorBowser: Tonight is a big night for DC...,Democrat
38471,Our work for the full $150 million funding for...,Democrat
38472,Thank you @RepStephMurphy for cosponsoring our...,Democrat
...,...,...
77259,This year I proudly supported legislation to t...,Republican
77260,I support the President in his effort to crack...,Republican
77261,We need to invest in our American workers so t...,Republican
77262,Obama's regulatory red tape was strangling our...,Republican


In [44]:
rep = df[df['Party'] == 'Republican']['Tweet'].to_numpy()

In [43]:
dem = df[df['Party'] == 'Democrat']['Tweet'].to_numpy()

In [45]:
dem

array(['Today, Senate Democrats vote to #SaveTheInternet. I’m cosponsoring our similar #NetNeutrality bill here in the Hous… https://t.co/sXJfxGalZl',
       'Big #DCStatehood deal. Thank you, Senator @maziehirono for cosponsoring our #DCStatehood bill, setting a new record… https://t.co/SWiPIMGLE9',
       'RT @MayorBowser: Tonight is a big night for DC as the @Capitals take on Tampa Bay and the @Nationals host the @Yankees. I want fans to focu…',
       'Our work for the full $150 million funding for #Metro safety improvements is bearing fruit in the fiscal year 2019… https://t.co/h0Q4VkkSSI',
       'Thank you @RepStephMurphy for cosponsoring our #DCStatehood bill. Face-to-face contact takes time, but it’s paying… https://t.co/zBzhJk8cya',
       '@StJacki Many thanks for your kind words!',
       "I've had a blast visiting DC communities every week to talk about my work in Congress and hear directly from reside… https://t.co/oOtd6FwtCT",
       'RT @axios: Eleanor Holmes Norton, wh

In [8]:
mask = np.random.rand(len(df)) < 1
trimmed = df[mask]

train_mask = np.random.rand(len(trimmed)) < 0.6
train = trimmed[train_mask]
test = trimmed[~train_mask]

In [9]:
X_train = train['Tweet'].to_numpy()
y_train = train['Party'].to_numpy().reshape(-1, 1)
X_test = test['Tweet'].to_numpy()
y_test = test['Party'].to_numpy().reshape(-1, 1)

In [10]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()

In [11]:
y_train = enc.fit_transform(y_train).toarray()[:,0]
y_test = enc.fit_transform(y_test).toarray()[:,0]

In [12]:
y_train[56]

1.0

In [13]:
# Load the encoder:
g = tf.Graph()
with g.as_default():
  text_input = tf.placeholder(dtype=tf.string, shape=[None])
  embed = hub.load("universal-sentence-encoder-large")
  embedded_text = embed(text_input)
  init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
g.finalize()

# Initialize session:
session = tf.Session(graph=g)
session.run(init_op)

In [58]:
#Function to compute all embeddings for each sentence:
#Be patient, takes a little while:
def similarity_matrix(merge_list):
    #initialize distance array:
    #initialize embeddings array:
    emb_all = np.zeros([len(merge_list),512])
    #Outer for loop:
    for i in range(len(merge_list)):
        #print("iteration: {}/{}".format(i,len(merge_list)))
        #Here is where we run the previously started session, so it is important to run previous step succesfully:
        i_emb = session.run(embedded_text, feed_dict={text_input: [merge_list[i]]})
        emb_all[i,:] = i_emb
    return emb_all

In [21]:
test = similarity_matrix(['@somerandombullshit'])

iteration: 0/1


In [62]:
np.inner(similarity_matrix(['How old are you?']),similarity_matrix(['What is your age?']))

array([[0.90159645]])

In [59]:
def avg_similarity(tweets, total_comparisons=50):
    total = 0
    for i in range(total_comparisons):
        print("iteration: {}/{}".format(i,total_comparisons))
        r1 = random.randint(0,len(tweets)-1)
        r2 = random.randint(0,len(tweets)-1)
        total += np.inner(similarity_matrix([tweets[r1]]),similarity_matrix([tweets[r2]]))[0]
    return total/total_comparisons

In [48]:
dem[4]

'Thank you @RepStephMurphy for cosponsoring our #DCStatehood bill. Face-to-face contact takes time, but it’s paying… https://t.co/zBzhJk8cya'

In [61]:
avg_similarity(rep, total_comparisons=50)

iteration: 0/50
iteration: 1/50
iteration: 2/50
iteration: 3/50
iteration: 4/50
iteration: 5/50
iteration: 6/50
iteration: 7/50
iteration: 8/50
iteration: 9/50
iteration: 10/50
iteration: 11/50
iteration: 12/50
iteration: 13/50
iteration: 14/50
iteration: 15/50
iteration: 16/50
iteration: 17/50
iteration: 18/50
iteration: 19/50
iteration: 20/50
iteration: 21/50
iteration: 22/50
iteration: 23/50
iteration: 24/50
iteration: 25/50
iteration: 26/50
iteration: 27/50
iteration: 28/50
iteration: 29/50
iteration: 30/50
iteration: 31/50
iteration: 32/50
iteration: 33/50
iteration: 34/50
iteration: 35/50
iteration: 36/50
iteration: 37/50
iteration: 38/50
iteration: 39/50
iteration: 40/50
iteration: 41/50
iteration: 42/50
iteration: 43/50
iteration: 44/50
iteration: 45/50
iteration: 46/50
iteration: 47/50
iteration: 48/50
iteration: 49/50


array([0.12328649])

In [25]:
similarity_matrix(['greetings'])

iteration: 0/1


array([[ 1.89223848e-02,  7.35565126e-02, -5.63333519e-02,
         2.88491677e-02,  1.20003708e-02,  1.25953276e-02,
         1.01204906e-02,  3.34908664e-02,  7.59509578e-02,
        -1.84435975e-02, -9.65040084e-03,  2.43603904e-02,
         1.30020920e-02, -2.91855540e-02, -7.25850686e-02,
         4.57460433e-02,  1.66897252e-02, -3.38270366e-02,
        -9.54996236e-03, -5.21804877e-02, -1.57607235e-02,
        -6.45736791e-03,  2.22613267e-03,  1.55689875e-02,
         7.79014314e-03,  2.00849734e-02, -7.66806081e-02,
         9.59388562e-04, -3.90272699e-02,  2.42367685e-02,
         4.46520448e-02,  2.62062028e-02,  1.87463518e-02,
         2.02035885e-02,  3.11092678e-02,  3.40675525e-02,
         6.19391864e-03,  1.92421749e-02,  4.53773439e-02,
        -1.15619890e-01,  2.23414730e-02,  1.35653028e-02,
        -2.47255135e-02,  4.70629670e-02,  4.32261378e-02,
         1.36914058e-03, -6.86444938e-02, -1.20018180e-02,
        -1.52162993e-02, -8.12065974e-02,  3.86861823e-0

In [22]:
test

array([[-4.23958413e-02, -3.52670439e-03,  5.59539236e-02,
         5.33723310e-02, -4.86366712e-02,  9.91447130e-04,
        -2.80942060e-02, -3.00944485e-02, -6.47289976e-02,
         1.90859586e-02, -2.05226298e-02,  2.33973619e-02,
         9.24620107e-02, -2.92882528e-02, -2.58940626e-02,
        -1.06275044e-02, -2.96575930e-02, -6.12816140e-02,
        -5.85036166e-03, -1.14298733e-02, -3.44738103e-02,
         2.80032661e-02,  1.86045468e-02,  3.23580112e-03,
         7.21937269e-02, -5.80071704e-03,  1.08461101e-02,
         5.68645671e-02, -3.17064896e-02,  5.87991327e-02,
         1.17012011e-02,  3.16354968e-02,  4.81585488e-02,
        -2.60713380e-02,  3.44142392e-02,  3.30407657e-02,
         2.74243690e-02,  2.98241023e-02, -3.34925689e-02,
         1.68278832e-02, -2.24541761e-02, -4.41893563e-02,
        -1.47699686e-02,  1.08549101e-02,  2.41870657e-02,
        -6.98053241e-02, -6.66189864e-02,  6.53392542e-03,
         2.88386866e-02, -3.01509015e-02, -1.73982531e-0

In [15]:
E_train = similarity_matrix(X_train)

iteration: 0/232
iteration: 1/232
iteration: 2/232
iteration: 3/232
iteration: 4/232
iteration: 5/232
iteration: 6/232
iteration: 7/232
iteration: 8/232
iteration: 9/232
iteration: 10/232
iteration: 11/232
iteration: 12/232
iteration: 13/232
iteration: 14/232
iteration: 15/232
iteration: 16/232
iteration: 17/232
iteration: 18/232
iteration: 19/232
iteration: 20/232
iteration: 21/232
iteration: 22/232
iteration: 23/232
iteration: 24/232
iteration: 25/232
iteration: 26/232
iteration: 27/232
iteration: 28/232
iteration: 29/232
iteration: 30/232
iteration: 31/232
iteration: 32/232
iteration: 33/232
iteration: 34/232
iteration: 35/232
iteration: 36/232
iteration: 37/232
iteration: 38/232
iteration: 39/232
iteration: 40/232
iteration: 41/232
iteration: 42/232
iteration: 43/232
iteration: 44/232
iteration: 45/232
iteration: 46/232
iteration: 47/232
iteration: 48/232
iteration: 49/232
iteration: 50/232
iteration: 51/232
iteration: 52/232
iteration: 53/232
iteration: 54/232
iteration: 55/232
it

In [16]:
E_test = similarity_matrix(X_test)

iteration: 0/168
iteration: 1/168
iteration: 2/168
iteration: 3/168
iteration: 4/168
iteration: 5/168
iteration: 6/168
iteration: 7/168
iteration: 8/168
iteration: 9/168
iteration: 10/168
iteration: 11/168
iteration: 12/168
iteration: 13/168
iteration: 14/168
iteration: 15/168
iteration: 16/168
iteration: 17/168
iteration: 18/168
iteration: 19/168
iteration: 20/168
iteration: 21/168
iteration: 22/168
iteration: 23/168
iteration: 24/168
iteration: 25/168
iteration: 26/168
iteration: 27/168
iteration: 28/168
iteration: 29/168
iteration: 30/168
iteration: 31/168
iteration: 32/168
iteration: 33/168
iteration: 34/168
iteration: 35/168
iteration: 36/168
iteration: 37/168
iteration: 38/168
iteration: 39/168
iteration: 40/168
iteration: 41/168
iteration: 42/168
iteration: 43/168
iteration: 44/168
iteration: 45/168
iteration: 46/168
iteration: 47/168
iteration: 48/168
iteration: 49/168
iteration: 50/168
iteration: 51/168
iteration: 52/168
iteration: 53/168
iteration: 54/168
iteration: 55/168
it

In [17]:
#Choose optimizer:
optim = optimizers.Adam(lr=0.0005) 

# create NN for news clissification:
news_DNN = Sequential()
news_DNN.add(Dense(40, input_dim=512, activation = 'relu', kernel_regularizer=l2(0.1)))
news_DNN.add(Dropout(0.25))
news_DNN.add(Dense(40, activation = 'relu', kernel_regularizer=l2(0.1)))
news_DNN.add(Dropout(0.25))

# Output layer with multiclass activation function:
news_DNN.add(Dense(1,activation='sigmoid'))

# Compile model:
news_DNN.compile(loss='binary_crossentropy', optimizer=optim, metrics=['acc'])

W0104 19:37:07.777359 140440009021248 deprecation_wrapper.py:119] From /home/marcus/miniconda3/envs/ml/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0104 19:37:07.790257 140440009021248 deprecation_wrapper.py:119] From /home/marcus/miniconda3/envs/ml/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0104 19:37:07.800692 140440009021248 deprecation_wrapper.py:119] From /home/marcus/miniconda3/envs/ml/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0104 19:37:07.866967 140440009021248 deprecation_wrapper.py:119] From /home/marcus/miniconda3/envs/ml/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecate

In [18]:
m_h = news_DNN.fit(E_train, y_train, epochs=50, validation_data=(E_test, y_test), batch_size=128, verbose=1)

Train on 232 samples, validate on 168 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
