In [1]:
import sys
sys.path.append("..")
import nrc

In [2]:
import matplotlib.pyplot as plt
import mpl_interactions.ipyplot as iplt
from numpy import array
from scipy.cluster.vq import vq, kmeans, whiten
%matplotlib notebook

In [3]:
lexicon = nrc.load_sentiments("../NRC-VAD-Lexicon-Aug2018Release/NRC-VAD-Lexicon.txt")

In [4]:
# Split the lexicon into two parallel sequences: ix[i] is the key whose
# `.vector` ends up as row i of `matrix`.
ix = [word for word, _ in lexicon.items()]
matrix = array([sentiment.vector for _, sentiment in lexicon.items()])

In [5]:
matrix

array([[-0.042,  0.212, -0.418],
       [ 0.04 ,  0.272, -0.436],
       [-0.146, -0.02 , -0.126],
       ...,
       [ 0.136, -0.306,  0.018],
       [-0.02 ,  0.04 , -0.076],
       [ 0.02 , -0.358, -0.5  ]])

In [6]:
# The whitening step is left commented out; clustering runs on the raw matrix.
#whitened = whiten(matrix)
N_CLUSTERS = 5
KMEANS_SEED = 0
# kmeans chooses its starting centroids at random. Pinning the seed keeps the
# result (and everything derived from clusters[0]) identical across
# Restart & Run All. The `seed` keyword needs SciPy >= 1.7.
clusters = kmeans(matrix, N_CLUSTERS, seed=KMEANS_SEED)

In [7]:
vq(matrix, clusters[0])

(array([0, 0, 4, ..., 2, 4, 2], dtype=int32),
 array([0.49895665, 0.60182879, 0.32577764, ..., 0.20060597, 0.22106489,
        0.35953136]))

In [8]:
str(lexicon["dog"])

'Sentiment: dog(valence=0.4079999999999999, arousal=0.0, dominance=-0.18600000000000005)'

In [9]:
str(lexicon["cat"])

'Sentiment: cat(valence=0.3340000000000001, arousal=-0.48, dominance=-0.26)'

### Explore why cat and dog have such vastly different sentiments

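* Is this a cultural thing?
* Is it data scarcity?
* What other pairs have similar meanings but vastly different sentiments?
* Refer to Bolukbasi et al.? (The spelling is correct; their word-embedding bias paper is from 2016, not 2019 — confirm it is the reference meant.)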

In [10]:
str(lexicon["zucchini"])

'Sentiment: zucchini(valence=0.020000000000000018, arousal=-0.358, dominance=-0.5)'

In [11]:
str(lexicon["broccoli"])

'Sentiment: broccoli(valence=-0.062000000000000055, arousal=-0.6, dominance=-0.43400000000000005)'

In [12]:
cut = 10.0
from collections import defaultdict as ddict

In [13]:
# vals:  bucket key -> list of words whose angle falls in that bucket
# sents: word       -> bucket key of that word
vals = ddict(list)
sents = {}
for word, vector in lexicon.items():
    # int() truncates toward zero, so the key is the angle cut down to a
    # multiple of 1/cut. Compute it once and use it for both mappings.
    bucket = int(vector.angle*cut)/cut
    vals[bucket].append(word)
    sents[word] = bucket

# Order every bucket by the word's distance from the origin in the
# valence/arousal plane. Each bucket is sorted exactly once; sorting it again
# for every word it contains (as before) gives the same order, only slower.
for bucket_words in vals.values():
    bucket_words.sort(key=lambda w: (lexicon[w].valence**2+lexicon[w].arousal**2)**0.5)

In [14]:
# TODO: Draw this as a line with labels

def scale(x):
    """Return ``(angle, bucket)`` for the word ``x``.

    ``angle`` is the ``angle`` attribute of ``lexicon[x]``. ``bucket`` is the
    list stored in ``vals`` under the word's key from ``sents``; it is the
    stored list itself, not a copy.
    """
    exact_angle = lexicon[x].angle
    bucket_key = sents[x]
    return exact_angle, vals[bucket_key]

def cone(x, deg=1):
    """Gather the words in the angle buckets at and just below ``x``'s bucket.

    Starting from the bucket of ``x`` (its entry in ``sents``), the walk moves
    toward smaller angles one bucket (``1/cut``) at a time and concatenates
    the word lists found in ``vals``.

    Parameters
    ----------
    x : a word present in ``sents``.
    deg : window size. ``int(int(deg*cut)/2)`` buckets are visited, so the
        covered angle range is roughly ``deg/2``.

    Returns
    -------
    A new list of words: the bucket of ``x`` first, then each lower bucket,
    every bucket in its stored order. Buckets with no words contribute nothing.

    Raises
    ------
    KeyError if ``x`` is not in ``sents``.

    NOTE(review): the window only extends toward smaller angles. If a cone
    centred on ``x`` was intended this needs revisiting — TODO confirm.
    """
    # Bucket keys were created as <integer>/cut. Step in whole bucket numbers
    # and divide once, so the key is bit-identical to the stored one.
    # Repeated float subtraction (0.3 - 0.1 -> 0.19999999999999998) misses it.
    start = round(sents[x] * cut)
    res = []
    for i in range(int(int(deg*cut)/2)):
        # .get() avoids inserting empty buckets into the defaultdict.
        res += vals.get((start - i) / cut, [])
    return res

In [83]:
lexicon["king"].word

'king'

In [16]:
cone("tgif", deg=2), str(lexicon["tgif"])

(['hoopla',
  'plasticity',
  'tgif',
  'chorus',
  'hairdryer',
  'nascent',
  'ensue',
  'massage',
  'title',
  'amenity',
  'suitable',
  'clothes',
  'strawberry',
  'transparency',
  'crossword',
  'residences',
  'fable',
  'visage',
  'credential',
  'croissant',
  'easygoing',
  'roving',
  'indivisible',
  'treatment',
  'verily',
  'lyre',
  'raspberry',
  'wiener',
  'nestling',
  'propriety',
  'neat',
  'meat',
  'scrabble',
  'nationality',
  'aide',
  'daytime',
  'woodland',
  'engrossed',
  'aversation',
  'unlock',
  'prima',
  'tattletale',
  'motherland',
  'station',
  'grade',
  'second cousin',
  'treatable',
  'cosy',
  'pied',
  'sprinkle',
  'cellular',
  'optic',
  'guesthouse',
  'globe',
  'recognize',
  'goldfish',
  'candlelight',
  'naturalist',
  'sunflower',
  'peacetime',
  'mistletoe',
  'afloat',
  'absorption',
  'chops',
  'marinara'],
 'Sentiment: tgif(valence=0.08400000000000007, arousal=-0.05800000000000005, dominance=-0.128)')

In [97]:
import numpy as np
import math
from matplotlib.colors import to_rgba_array, TABLEAU_COLORS, XKCD_COLORS


# Divisor applied to every frequency. It is 1, so the raw values are kept;
# normalising by max(freq.values()) is currently switched off.
m = 1
with open("../sample_freq.txt") as freq_file:
    freq = {}
    # Stream the file line by line instead of materialising readlines().
    for line in freq_file:
        # Drop the first whitespace-separated column. The word is then the
        # first remaining field and its frequency the second-to-last one.
        fields = line.split()[1:]
        if len(fields) < 2:
            # Blank or truncated line: there is no second-to-last field to
            # read (this used to raise IndexError), so skip it.
            continue
        freq[fields[0]] = float(fields[-2])/m

def threedplot(points):
    """Draw ``points`` as a static 3-D scatter plot.

    Each point supplies its ``valence``, ``arousal`` and ``dominance``
    attributes as the x, y and z coordinate respectively. ``points`` is
    traversed three times (once per axis), so it must be re-iterable.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    figure = plt.figure()
    axes3d = figure.add_subplot(projection='3d')
    # One coordinate list per axis, in x / y / z order.
    coords = [
        [getattr(point, attr) for point in points]
        for attr in ("valence", "arousal", "dominance")
    ]
    axes3d.scatter(*coords)
    for set_label, text in (
        (axes3d.set_xlabel, 'Valence'),
        (axes3d.set_ylabel, 'Arousal'),
        (axes3d.set_zlabel, 'Dominance'),
    ):
        set_label(text)
    plt.show()

def interactive(points):
    """Show a valence/arousal scatter of ``points`` filtered by two sliders.

    ``iplt.scatter`` builds one slider per keyword array: ``ang`` takes the
    values -180..179 and ``spread`` the values 0..359. A point is drawn while
    ``abs(point.angle - ang) < spread``. Marker sizes come from
    ``freq[point.word]`` and labels from ``point.word``.

    Parameters
    ----------
    points : re-iterable collection (e.g. a list) of objects with ``valence``,
        ``arousal``, ``angle`` and ``word`` attributes. Every ``word`` must be
        a key of ``freq``, otherwise the size callback raises KeyError.

    Returns nothing; the figure is shown with ``plt.show()``.

    NOTE(review): the filter compares raw angle differences, so angles on
    either side of the -180/180 seam are treated as far apart. Confirm
    whether wrap-around matching is wanted.
    """
    #requires ipympl
    def _selected(ang, spread):
        # One filter shared by all four callbacks, so coordinates, sizes and
        # labels always describe the same points in the same order.
        return [p for p in points if abs(p.angle - ang) < spread]

    # The slider values are passed to each callback under the names
    # `ang` and `spread`; the leading x / y arguments are unused here.
    def x(ang, spread):
        return [p.valence for p in _selected(ang, spread)]
    def y(x, ang, spread):
        return [p.arousal for p in _selected(ang, spread)]
    def s(x, y, ang, spread):
        return [freq[p.word] for p in _selected(ang, spread)]
    def label(x, y, ang, spread):
        return [p.word for p in _selected(ang, spread)]

    _fig, ax = plt.subplots()
    # Pass the axes explicitly instead of relying on pyplot's "current axes".
    iplt.scatter(
        x=x, y=y, s=s, label=label,
        ang=np.arange(-180,180), spread=np.arange(0,360),
        ax=ax,
    )
    ax.set_xlabel('Valence')
    ax.set_ylabel('Arousal')
    #ax.set_zlabel('Dominance')
    plt.show()

In [98]:
interactive([x for x in lexicon.values() if x.word in freq])

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

VBox(children=(HBox(children=(IntSlider(value=0, description='ang', max=359, readout=False), Label(value='-180…

In [50]:
list(lexicon.values())[0].angle

101.20594750740258

In [80]:
freq

{'of': 23557.38,
 'do': 4582.55,
 'they': 4057.71,
 'she': 2427.92,
 'about': 1899.52,
 'there': 1574.61,
 'him': 1214.51,
 'could': 1254.64,
 'no': 933.7,
 'man': 757.84,
 'very': 592.93,
 'work': 902.45,
 'call': 770.36,
 'in': 584.24,
 'as': 637.72,
 'help': 617.08,
 'every': 523.57,
 'yes': 106.37,
 'its': 440.05,
 'off': 402.08,
 'case': 501.73,
 'money': 519.41,
 'today': 441.13,
 'hold': 414.85,
 'anything': 207.96,
 'home': 688.19,
 'percent': 805.39,
 'kid': 334.18,
 'long': 320.83,
 'service': 515.0,
 'however': 240.09,
 'watch': 27.61,
 'kill': 250.48,
 'often': 323.36,
 'news': 365.36,
 'able': 220.84,
 'five': 423.88,
 'buy': 326.32,
 'door': 162.24,
 'consider': 250.66,
 'everyone': 189.55,
 'process': 181.89,
 'build': 325.13,
 'college': 373.57,
 'heart': 167.35,
 'air': 229.11,
 'wrong': 106.75,
 'security': 280.84,
 'action': 204.59,
 'event': 266.93,
 'model': 121.47,
 'couple': 186.67,
 'role': 209.83,
 'finally': 124.97,
 'return': 0.97,
 'road': 314.43,
 'science'