### A Song of Vice and Higher: Characterizing Presidential Nominees through Game of Thrones

### Import Necessary Libraries

In [1]:
import os             # file system stuff
import json           # digest json
import pandas as pd   # Dataframes
import numpy as np    # arrays and matrix ops
import helper     # Custom helper functions
import gensim 

import psycopg2 as psql
from helper import attribute_comment 
from helper import person_dict


### Connect to the database

In [2]:
# Define path to secret

# secret_path_aws = os.path.join(os.environ['HOME'], 'mia', '.secret', 
#                            'aws_ps_flatiron.json')
secret_path_aws = os.path.join(os.environ['HOME'], '.secret', 
                           'aws_ps_flatiron.json')
secret_path_aws

'/Users/werlindo/.secret/aws_ps_flatiron.json'

In [3]:
aws_keys = helper.get_keys(secret_path_aws)
user = aws_keys['user']
ps = aws_keys['password']
host = aws_keys['host']
db = aws_keys['db_name']

aws_ps_engine = ('postgresql://' + user + ':' + ps + '@' + host + '/' + db)

# Setup PSQL connection
conn = psql.connect(
    database=db,
    user=user,
    password=ps,
    host=host,
    port='5432'
)

In [4]:
get_database = """
    SELECT * FROM comms_sntmnt_190612;
"""
# Instantiate cursor
cur = conn.cursor()
cur.execute(get_database)
comments = pd.DataFrame(cur.fetchall())
comments.columns = [col.name for col in cur.description]
comments.drop('index', axis=1, inplace=True) #superfluous column
comments.drop(['compound', 'neu'], axis=1, inplace=True) 
#compound and neutral are correlated with other sentiment columns

In [5]:
comments.head()

Unnamed: 0,comment,domain,neg,pos
0,> after e4 i thought i'd be so angry if jaime ...,got,0.137,0.122
1,after each had spent so much time talking abou...,got,0.0,0.147
2,"after ep3 leaks came out for ep4-6, everything...",got,0.092,0.182
3,after episode 3 i had nothing by apathy for th...,got,0.0,0.127
4,"after episode 4, there really were a lot of pe...",got,0.16,0.085


In [35]:
com_small = comments.copy()

In [7]:
com_small.head()

Unnamed: 0,comment,domain,neg,pos
0,> after e4 i thought i'd be so angry if jaime ...,got,0.137,0.122
1,after each had spent so much time talking abou...,got,0.0,0.147
2,"after ep3 leaks came out for ep4-6, everything...",got,0.092,0.182
3,after episode 3 i had nothing by apathy for th...,got,0.0,0.127
4,"after episode 4, there really were a lot of pe...",got,0.16,0.085


## Attribute Comment to People

In [10]:
com_small['bran'].value_counts()

0.0    968
1.0     26
2.0      3
4.0      2
3.0      1
Name: bran, dtype: int64

In [11]:
from itertools import chain

In [12]:
vocab = list(chain(*helper.person_dict.values()))

In [13]:
vocab

['bran',
 'brandon stark',
 'jon',
 'jon snow',
 'khaleesi',
 'dany',
 'daenerys',
 'danyris',
 'davos',
 'doran',
 'cersei',
 'tyrion',
 'sansa',
 'arya',
 'stannis',
 'varys',
 'jamie',
 'brienne',
 'samwell',
 'jorah',
 'theon',
 'hound',
 'sandor',
 'littlefinger',
 'baelish',
 'joffrey',
 'mountain',
 'gregor',
 'robb',
 'drogo',
 'melisandre',
 'bronn',
 'gilly',
 'ramsey',
 'ramsay',
 'missandei',
 'gendry',
 'grey worm',
 'ned',
 'eddard',
 'catelyn',
 'torumund',
 'robert',
 'tommen',
 'viserys',
 'margaery',
 'oberon',
 'night king',
 'lyanna mormont',
 'jaqen',
 'hodor',
 'ygritte',
 'mance',
 'senator harris',
 'k. harris',
 'kamala',
 'kamalaharrisforpresident',
 'biden',
 'joe2020',
 'buttigieg',
 'buttigidg',
 'mayor pete',
 'bootijedge',
 'gillibrand',
 'hickenlooper',
 'klobuchar',
 'warren',
 'booker',
 'inslee',
 'castro',
 'julián',
 'julian',
 'gabbard',
 'tulsi',
 'sanders',
 'bernie',
 'feelthebern',
 'de blasio',
 'bullock',
 'gravel',
 'messam',
 "o'rourke",
 '

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
vectorizer = CountVectorizer(vocabulary=vocab)

In [36]:
vectorized_words = vectorizer.transform(com_small['comment'])

In [20]:
vectorizer.vocabulary

['bran',
 'brandon stark',
 'jon',
 'jon snow',
 'khaleesi',
 'dany',
 'daenerys',
 'danyris',
 'davos',
 'doran',
 'cersei',
 'tyrion',
 'sansa',
 'arya',
 'stannis',
 'varys',
 'jamie',
 'brienne',
 'samwell',
 'jorah',
 'theon',
 'hound',
 'sandor',
 'littlefinger',
 'baelish',
 'joffrey',
 'mountain',
 'gregor',
 'robb',
 'drogo',
 'melisandre',
 'bronn',
 'gilly',
 'ramsey',
 'ramsay',
 'missandei',
 'gendry',
 'grey worm',
 'ned',
 'eddard',
 'catelyn',
 'torumund',
 'robert',
 'tommen',
 'viserys',
 'margaery',
 'oberon',
 'night king',
 'lyanna mormont',
 'jaqen',
 'hodor',
 'ygritte',
 'mance',
 'senator harris',
 'k. harris',
 'kamala',
 'kamalaharrisforpresident',
 'biden',
 'joe2020',
 'buttigieg',
 'buttigidg',
 'mayor pete',
 'bootijedge',
 'gillibrand',
 'hickenlooper',
 'klobuchar',
 'warren',
 'booker',
 'inslee',
 'castro',
 'julián',
 'julian',
 'gabbard',
 'tulsi',
 'sanders',
 'bernie',
 'feelthebern',
 'de blasio',
 'bullock',
 'gravel',
 'messam',
 "o'rourke",
 '

In [21]:
import numpy as np

In [24]:
person_mat = np.zeros((len(person_dict), len(vocab))).T

In [25]:
person_mat.shape

(89, 66)

In [26]:
person_ids = {}

for i, name in enumerate(person_dict.keys()):
    person_ids[name] = i
    
person_ids                

{'bran': 0,
 'jon': 1,
 'dany': 2,
 'davos': 3,
 'doran': 4,
 'cersei': 5,
 'tyrion': 6,
 'sansa': 7,
 'arya': 8,
 'stannis': 9,
 'varys': 10,
 'jamie': 11,
 'brienne': 12,
 'samwell': 13,
 'jorah': 14,
 'theon': 15,
 'hound': 16,
 'littlefinger': 17,
 'joffrey': 18,
 'mountain': 19,
 'robb': 20,
 'dragons': 21,
 'melisandre': 22,
 'bronn': 23,
 'gilly': 24,
 'ramsey': 25,
 'missandei': 26,
 'gendry': 27,
 'grey worm': 28,
 'ned': 29,
 'catelyn': 30,
 'torumund': 31,
 'robert': 32,
 'tommen': 33,
 'viserys': 34,
 'margaery': 35,
 'oberon': 36,
 'night_king': 37,
 'lyanna': 38,
 'jaqen': 39,
 'hodor': 40,
 'ygritte': 41,
 'mance': 42,
 'harris': 43,
 'biden': 44,
 'buttigieg': 45,
 'gillibrand': 46,
 'hickenlooper': 47,
 'klobuchar': 48,
 'warren': 49,
 'booker': 50,
 'inslee': 51,
 'castro': 52,
 'gabbard': 53,
 'sanders': 54,
 'de blasio': 55,
 'bullock': 56,
 'gravel': 57,
 'messam': 58,
 "o'rourke": 59,
 'bennet': 60,
 'delaney': 61,
 'moulton': 62,
 'swalwell': 63,
 'williamson': 6

In [28]:
vocab_ids = {}

for i, word in enumerate(vocab):
    vocab_ids[word] = i
    
vocab_ids                

{'bran': 0,
 'brandon stark': 1,
 'jon': 2,
 'jon snow': 3,
 'khaleesi': 4,
 'dany': 5,
 'daenerys': 6,
 'danyris': 7,
 'davos': 8,
 'doran': 9,
 'cersei': 10,
 'tyrion': 11,
 'sansa': 12,
 'arya': 13,
 'stannis': 14,
 'varys': 15,
 'jamie': 16,
 'brienne': 17,
 'samwell': 18,
 'jorah': 19,
 'theon': 20,
 'hound': 21,
 'sandor': 22,
 'littlefinger': 23,
 'baelish': 24,
 'joffrey': 25,
 'mountain': 26,
 'gregor': 27,
 'robb': 28,
 'drogo': 29,
 'melisandre': 30,
 'bronn': 31,
 'gilly': 32,
 'ramsey': 33,
 'ramsay': 34,
 'missandei': 35,
 'gendry': 36,
 'grey worm': 37,
 'ned': 38,
 'eddard': 39,
 'catelyn': 40,
 'torumund': 41,
 'robert': 42,
 'tommen': 43,
 'viserys': 44,
 'margaery': 45,
 'oberon': 46,
 'night king': 47,
 'lyanna mormont': 48,
 'jaqen': 49,
 'hodor': 50,
 'ygritte': 51,
 'mance': 52,
 'senator harris': 53,
 'k. harris': 54,
 'kamala': 55,
 'kamalaharrisforpresident': 56,
 'biden': 57,
 'joe2020': 58,
 'buttigieg': 59,
 'buttigidg': 60,
 'mayor pete': 61,
 'bootijedge'

In [29]:
for name, words in person_dict.items():
    person_id = person_ids[name]
    for word in words:
        vocab_id = vocab_ids[word]
        person_mat[vocab_id, person_id] = 1 

In [30]:
person_mat

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [37]:
mentions = vectorized_words @ person_mat 

In [38]:
mentions.shape

(305899, 66)

In [39]:
mentions

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [48]:
persons = list(person_dict.keys())

In [50]:
mentions_df = pd.DataFrame(mentions, columns=persons, index=com_small.index) 

In [51]:
mentions_df.head()

Unnamed: 0,bran,jon,dany,davos,doran,cersei,tyrion,sansa,arya,stannis,...,bullock,gravel,messam,o'rourke,bennet,delaney,moulton,swalwell,williamson,yang
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
com_small.columns

Index(['comment', 'domain', 'neg', 'pos'], dtype='object')

In [55]:
com_small['comment_length'] = com_small['comment'].str.len()

In [56]:
com_small.head()

Unnamed: 0,comment,domain,neg,pos,comment_length
0,> after e4 i thought i'd be so angry if jaime ...,got,0.137,0.122,434
1,after each had spent so much time talking abou...,got,0.0,0.147,196
2,"after ep3 leaks came out for ep4-6, everything...",got,0.092,0.182,262
3,after episode 3 i had nothing by apathy for th...,got,0.0,0.127,87
4,"after episode 4, there really were a lot of pe...",got,0.16,0.085,559


In [57]:
com_use = com_small.drop(['comment'],axis=1).copy()

In [58]:
com_use.head()

Unnamed: 0,domain,neg,pos,comment_length
0,got,0.137,0.122,434
1,got,0.0,0.147,196
2,got,0.092,0.182,262
3,got,0.0,0.127,87
4,got,0.16,0.085,559


In [59]:
snm = com_use.join(mentions_df)

# snm.head()

In [60]:
snm.head()

Unnamed: 0,domain,neg,pos,comment_length,bran,jon,dany,davos,doran,cersei,...,bullock,gravel,messam,o'rourke,bennet,delaney,moulton,swalwell,williamson,yang
0,got,0.137,0.122,434,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,got,0.0,0.147,196,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,got,0.092,0.182,262,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,got,0.0,0.127,87,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,got,0.16,0.085,559,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Get positive column

In [61]:
mentions.shape

(305899, 66)

In [63]:
# Transpose the mentions matrix
men_t = mentions.T


In [70]:
# Get the positive sentiment column as an array
features = np.array(snm[['pos','neg','comment_length']])

In [71]:
features.shape

(305899, 3)

In [69]:
blah[0]

array([1.22e-01, 1.37e-01, 4.34e+02])

In [72]:
snm_agg = men_t @ features 

In [76]:
snm_agg.shape

(66, 3)

This gives us a matrix of rows of **persons** by [ **neg**, **pos**, **comment length** ]. But recall this is matrix multiplication, so it gives us the sumproduct. So we need to take the weighted average.

In [143]:
men_t.shape

(66, 305899)

In [118]:
# Get total comments 'hits' by person
men_t_agg = men_t.sum(axis=1)

In [120]:
men_t_agg.shape

(66,)

In [123]:
num_persons = len(persons)

In [124]:
# We need this as an nx1 array
men_t_agg = np.reshape(men_t_agg,(num_persons,1))

In [125]:
men_t_agg.shape

(66, 1)

Now divide the original matrix by 'hits' to get average 'scores'.

In [127]:
snm_avg = np.divide(snm_agg, men_t_agg)

  """Entry point for launching an IPython kernel.


In [130]:
snm_avg[0]

array([9.59878097e-02, 9.64360991e-02, 8.40284048e+02])

This looks right. Now make `persons_df`

In [131]:
persons_df = pd.DataFrame(snm_avg, columns=['neg','pos','comment_length'])

In [134]:
persons_df['person'] = persons

In [137]:
persons_df = persons_df[['person', 'neg', 'pos', 'comment_length']]

In [139]:
persons_df.head()

Unnamed: 0,person,neg,pos,comment_length
0,bran,0.095988,0.096436,840.284048
1,jon,0.106073,0.123129,795.647152
2,dany,0.101256,0.136458,895.267485
3,davos,0.123971,0.091164,667.975728
4,doran,0.089429,0.125,557.964286


In [148]:
persons_df.sort_values(by=['comment_length'], axis=0, ascending=False)

Unnamed: 0,person,neg,pos,comment_length
36,oberon,0.088133,0.170600,1733.733333
53,gabbard,0.107543,0.092400,1340.477692
40,hodor,0.057733,0.058745,932.747700
20,robb,0.098743,0.132463,911.203390
26,missandei,0.105493,0.144823,901.324723
2,dany,0.101256,0.136458,895.267485
21,dragons,0.104152,0.131787,876.966423
5,cersei,0.103662,0.141583,870.116590
12,brienne,0.115310,0.112470,868.582496
7,sansa,0.106759,0.116573,844.236661


In [144]:
from sklearn.cluster import KMeans

In [175]:
k_means = KMeans(n_clusters=15)

In [176]:
persons_cluster = persons_df.set_index(['person'])

In [177]:
persons_cluster.head()

Unnamed: 0_level_0,neg,pos,comment_length
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bran,0.095988,0.096436,840.284048
jon,0.106073,0.123129,795.647152
dany,0.101256,0.136458,895.267485
davos,0.123971,0.091164,667.975728
doran,0.089429,0.125,557.964286


In [178]:
persons_cluster.dropna(axis=0,inplace=True)

In [179]:
k_means.fit(persons_cluster)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=15, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [180]:
pred_clusters = k_means.predict(persons_cluster)

In [181]:
pred_clusters

array([12,  6,  1, 13,  0, 14,  6, 12, 11,  3,  6,  3, 14, 13, 11, 11, 10,
        6,  9, 10,  1, 14,  6,  8,  9,  3,  1,  8,  8, 13,  6, 10, 12,  9,
        4, 13,  1,  9,  9, 10,  0, 13, 11,  7,  0,  0, 10, 10,  0,  2,  3,
        5,  5,  7, 11,  8, 10,  7,  0,  7,  3], dtype=int32)

In [182]:
results = persons_cluster.copy()

In [183]:
results

Unnamed: 0_level_0,neg,pos,comment_length
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bran,0.095988,0.096436,840.284048
jon,0.106073,0.123129,795.647152
dany,0.101256,0.136458,895.267485
davos,0.123971,0.091164,667.975728
doran,0.089429,0.125000,557.964286
cersei,0.103662,0.141583,870.116590
tyrion,0.108100,0.118189,810.833209
sansa,0.106759,0.116573,844.236661
arya,0.099616,0.129261,727.926694
stannis,0.107298,0.120178,639.196017


In [184]:
results['cluster'] = pred_clusters

In [186]:
results.sort_values(by=['cluster'], axis=0)

Unnamed: 0_level_0,neg,pos,comment_length,cluster
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
swalwell,0.131739,0.065580,550.782609,0
doran,0.089429,0.125000,557.964286,0
castro,0.113359,0.079673,541.996016,0
biden,0.112114,0.087077,553.482688,0
warren,0.130170,0.066254,552.288563,0
klobuchar,0.099599,0.058253,569.785714,0
robb,0.098743,0.132463,911.203390,1
dany,0.101256,0.136458,895.267485,1
missandei,0.105493,0.144823,901.324723,1
hodor,0.057733,0.058745,932.747700,1


In [188]:
import gensim

In [192]:
# Word2Vec requires that our text have the form of a list
# of 'sentences', where each sentence is itself a list of
# words. How can we put our _Jeopardy!_ clues in that shape?

import string
text = []

for comment in comments['comment']:     
    sentence = comment.translate(str.maketrans('', '', 
                                               string.punctuation)).split(' ')
    
    new_sent = []
    for word in sentence:
        new_sent.append(word.lower())
    
    text.append(new_sent)

In [194]:
len(text)

305899

In [195]:
# Constructing the model is simply a matter of
# instantiating a Word2Vec object.

model = gensim.models.Word2Vec(text, sg=1)

In [196]:
# To train, call 'train()'!

model.train(text, total_examples=model.corpus_count, epochs=model.epochs)

(47019197, 62302935)

In [197]:
model.corpus_total_words

12460587

In [198]:
# The '.wv' attribute stores the word vectors
model.wv

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x1274e6b70>

In [199]:
model.wv['leader']

array([-0.23592009,  0.43021846, -0.46084347, -0.28230402, -0.20270917,
       -0.18361339,  0.11231136, -0.25231358,  0.19040436,  0.21363412,
        0.20902607, -0.01008021, -0.19587407,  0.4777392 , -0.28582084,
        0.33331195,  0.17522778, -0.06343641,  0.12582608,  0.1537637 ,
        0.35820287, -0.38474396,  0.0150812 , -0.08847568, -0.718754  ,
       -0.11083291, -0.24770035,  0.28399137, -0.2700667 ,  0.2190552 ,
        0.34587616, -0.40078643, -0.11770515,  0.3949305 , -0.4598166 ,
        0.0592818 ,  0.15257128, -0.04654181,  0.54289144, -0.2712421 ,
        0.10758522,  0.18058942, -0.16841717,  0.28496942,  0.05552739,
       -0.3289001 ,  0.00905115, -0.08910112,  0.13934204,  0.4961906 ,
        0.03113335,  0.01693224,  0.4366484 , -0.56596404,  0.08748536,
       -0.03127307, -0.09638502, -0.16447088, -0.06506503, -0.09589352,
        0.20336002, -0.14127678, -0.19664907, -0.36131102,  0.2969085 ,
       -0.13516122,  0.02052971,  0.35285133, -0.23775342,  0.55

In [201]:
model.wv.most_similar('dany', topn=100)

[('danny', 0.9212098121643066),
 ('daenerys', 0.9076652526855469),
 ('danaerys', 0.869865894317627),
 ('dani', 0.8096771240234375),
 ('daenarys', 0.7892028093338013),
 ('jon', 0.7734920978546143),
 ('greyworm', 0.7263287305831909),
 ('daeny', 0.7178654670715332),
 ('cersei', 0.7023975849151611),
 ('sansa', 0.7020993232727051),
 ('danys', 0.7010605931282043),
 ('tyrion', 0.6958633661270142),
 ('her\n\njon', 0.6924678087234497),
 ('danerys', 0.6916928887367249),
 ('arya', 0.6909229755401611),
 ('\n\ndany', 0.6840126514434814),
 ('tyron', 0.6821157932281494),
 ('mad\n\ni', 0.6767454147338867),
 ('\n\ndanny', 0.6764668226242065),
 ('\n\njon', 0.6754411458969116),
 ('cercei', 0.6708755493164062),
 ('dany\n\ndany', 0.6678826808929443),
 ('him\n\nand', 0.6671401262283325),
 ('westeros\n\nshe', 0.6641043424606323),
 ('dany\n', 0.6631199717521667),
 ('varys', 0.6606570482254028),
 ('drogon', 0.6586528420448303),
 ('missandei', 0.6550419330596924),
 ('this\n\ndany', 0.6503520011901855),
 ('misan

In [202]:
model.wv.similarity('dany', 'warren')

0.4024186

In [203]:
model.wv.similarity('cersei', 'warren')

0.20193395

In [205]:
model.wv.similarity('jon', 'warren')

0.3802032

In [1]:
model.wv.most_similar(['cersei'], topn=100)

NameError: name 'model' is not defined

-----

# Appendix

Prove to myself how matrix division works.

In [95]:
x = np.matrix([[2,4,6], [4,6,8]])

In [96]:
x

matrix([[2, 4, 6],
        [4, 6, 8]])

In [111]:
y = np.array([2,2]).reshape((2,1))

In [112]:
y.shape

(2, 1)

In [113]:
g = np.divide(x,y)

In [114]:
g

matrix([[1., 2., 3.],
        [2., 3., 4.]])