In [1]:
import numpy as np
import pickle

def load_file(file):
    with open(file, 'rb') as f:
        return pickle.load(f)
    
idx_to_word = load_file("idx_to_word.pkl")
word_to_idx = load_file("word_to_idx.pkl")
vectors = np.load("output.npy")

def get_word_vector(word):
    idx = word_to_idx[word]
    return vectors[idx]

def get_word_at_index(index):
    return idx_to_word[index]

In [2]:
import json

with open("final.json") as f:
    data = json.load(f)
    
data

[{'total_count': [{'count': 4, 'year': 2008},
   {'count': 15, 'year': 2014},
   {'count': 13, 'year': 2017},
   {'count': 10, 'year': 2013},
   {'count': 11, 'year': 2012},
   {'count': 15, 'year': 2016},
   {'count': 15, 'year': 2015},
   {'count': 5, 'year': 2009},
   {'count': 5, 'year': 2010},
   {'count': 6, 'year': 2011},
   {'count': 3, 'year': 2007}],
  'variations': [{'n-gram': 'unknown number',
    'years': [{'count': 1, 'year': 2008},
     {'count': 1, 'year': 2014},
     {'count': 1, 'year': 2017},
     {'count': 2, 'year': 2013},
     {'count': 2, 'year': 2012},
     {'count': 1, 'year': 2016}]},
   {'n-gram': 'unknown properties',
    'years': [{'count': 1, 'year': 2014}, {'count': 1, 'year': 2012}]},
   {'n-gram': 'unknown data',
    'years': [{'count': 1, 'year': 2017}, {'count': 1, 'year': 2015}]},
   {'n-gram': 'unknown variables',
    'years': [{'count': 1, 'year': 2013}, {'count': 1, 'year': 2009}]},
   {'n-gram': 'unknown distribution',
    'years': [{'count': 1, 

In [3]:
#add a doc_count key for each unigram

for word in data:
    doc_count = 0
    for count in word['total_count']:
        doc_count += count['count']
    word['doc_count'] = doc_count
    
data

[{'doc_count': 102,
  'total_count': [{'count': 4, 'year': 2008},
   {'count': 15, 'year': 2014},
   {'count': 13, 'year': 2017},
   {'count': 10, 'year': 2013},
   {'count': 11, 'year': 2012},
   {'count': 15, 'year': 2016},
   {'count': 15, 'year': 2015},
   {'count': 5, 'year': 2009},
   {'count': 5, 'year': 2010},
   {'count': 6, 'year': 2011},
   {'count': 3, 'year': 2007}],
  'variations': [{'n-gram': 'unknown number',
    'years': [{'count': 1, 'year': 2008},
     {'count': 1, 'year': 2014},
     {'count': 1, 'year': 2017},
     {'count': 2, 'year': 2013},
     {'count': 2, 'year': 2012},
     {'count': 1, 'year': 2016}]},
   {'n-gram': 'unknown properties',
    'years': [{'count': 1, 'year': 2014}, {'count': 1, 'year': 2012}]},
   {'n-gram': 'unknown data',
    'years': [{'count': 1, 'year': 2017}, {'count': 1, 'year': 2015}]},
   {'n-gram': 'unknown variables',
    'years': [{'count': 1, 'year': 2013}, {'count': 1, 'year': 2009}]},
   {'n-gram': 'unknown distribution',
    'ye

In [4]:
#sort data list from highest doc_count to lowest

sorted_data = sorted(data, key = lambda k:k['doc_count'], reverse = True)
for uni in sorted_data:
    uni['total_count'].sort(key = lambda k:k['year'])

sorted_data

[{'doc_count': 2008,
  'total_count': [{'count': 36, 'year': 2007},
   {'count': 121, 'year': 2008},
   {'count': 158, 'year': 2009},
   {'count': 149, 'year': 2010},
   {'count': 149, 'year': 2011},
   {'count': 182, 'year': 2012},
   {'count': 164, 'year': 2013},
   {'count': 191, 'year': 2014},
   {'count': 186, 'year': 2015},
   {'count': 276, 'year': 2016},
   {'count': 396, 'year': 2017}],
  'variations': [{'n-gram': 'sequence learning',
    'years': [{'count': 1, 'year': 2008},
     {'count': 2, 'year': 2014},
     {'count': 1, 'year': 2011},
     {'count': 1, 'year': 2017}]},
   {'n-gram': 'consider learning',
    'years': [{'count': 1, 'year': 2008},
     {'count': 1, 'year': 2012},
     {'count': 1, 'year': 2017},
     {'count': 1, 'year': 2016}]},
   {'n-gram': 'optimal reinforcement learning',
    'years': [{'count': 1, 'year': 2014}, {'count': 1, 'year': 2011}]},
   {'n-gram': 'machine learning techniques',
    'years': [{'count': 1, 'year': 2014}, {'count': 2, 'year': 201

In [5]:
with open('raw_count.json', 'w+') as f:
    json.dump(sorted_data, f, indent=4)

In [6]:
#normalize the total_count for unigrams

for uni in sorted_data:
    for count in uni['total_count']:
        count['count'] = count['count']/uni['doc_count']

In [7]:
sorted_data

[{'doc_count': 2008,
  'total_count': [{'count': 0.017928286852589643, 'year': 2007},
   {'count': 0.0602589641434263, 'year': 2008},
   {'count': 0.07868525896414343, 'year': 2009},
   {'count': 0.07420318725099602, 'year': 2010},
   {'count': 0.07420318725099602, 'year': 2011},
   {'count': 0.09063745019920319, 'year': 2012},
   {'count': 0.08167330677290836, 'year': 2013},
   {'count': 0.0951195219123506, 'year': 2014},
   {'count': 0.09262948207171315, 'year': 2015},
   {'count': 0.13745019920318724, 'year': 2016},
   {'count': 0.19721115537848605, 'year': 2017}],
  'variations': [{'n-gram': 'sequence learning',
    'years': [{'count': 1, 'year': 2008},
     {'count': 2, 'year': 2014},
     {'count': 1, 'year': 2011},
     {'count': 1, 'year': 2017}]},
   {'n-gram': 'consider learning',
    'years': [{'count': 1, 'year': 2008},
     {'count': 1, 'year': 2012},
     {'count': 1, 'year': 2017},
     {'count': 1, 'year': 2016}]},
   {'n-gram': 'optimal reinforcement learning',
    'ye

In [28]:
#get vectors of proportions by year for each unigram
vectors = []

for uni in sorted_data:
    curr_yr = 2007
    ptr = 0
    vector = []
    while ptr < len(uni['total_count']):
        if uni['total_count'][ptr]['year'] == curr_yr:
            vector.append(uni['total_count'][ptr]['count'])
            ptr += 1
            curr_yr += 1
        else:
            vector.append(0)
            curr_yr += 1
    if len(vector) == 10:
        vector.append(0) #for unigrams without year 2017
    vectors.append(vector)
vectors

[[0.017928286852589643,
  0.0602589641434263,
  0.07868525896414343,
  0.07420318725099602,
  0.07420318725099602,
  0.09063745019920319,
  0.08167330677290836,
  0.0951195219123506,
  0.09262948207171315,
  0.13745019920318724,
  0.19721115537848605],
 [0.024634334103156273,
  0.050808314087759814,
  0.054657428791377985,
  0.07467282525019246,
  0.0731331793687452,
  0.07544264819091609,
  0.10623556581986143,
  0.13394919168591224,
  0.1031562740569669,
  0.14395688991531946,
  0.15935334872979215],
 [0.020100502512562814,
  0.038525963149078725,
  0.048576214405360134,
  0.06867671691792294,
  0.08710217755443886,
  0.08375209380234507,
  0.09212730318257957,
  0.10552763819095477,
  0.10385259631490787,
  0.135678391959799,
  0.21608040201005024],
 [0.01680672268907563,
  0.05042016806722689,
  0.06218487394957983,
  0.08235294117647059,
  0.07058823529411765,
  0.09915966386554621,
  0.1092436974789916,
  0.10252100840336134,
  0.11764705882352941,
  0.14453781512605043,
  0.1445

In [33]:
from sklearn.manifold import TSNE
import numpy as np

In [34]:
vectors_array = np.array(vectors)
vectors_array

array([[0.01792829, 0.06025896, 0.07868526, ..., 0.09262948, 0.1374502 ,
        0.19721116],
       [0.02463433, 0.05080831, 0.05465743, ..., 0.10315627, 0.14395689,
        0.15935335],
       [0.0201005 , 0.03852596, 0.04857621, ..., 0.1038526 , 0.13567839,
        0.2160804 ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.16666667,
        0.16666667],
       [0.        , 0.        , 0.        , ..., 0.16666667, 0.        ,
        0.16666667],
       [0.25      , 0.        , 0.        , ..., 0.        , 0.25      ,
        0.25      ]])

In [36]:
embedded = TSNE(n_components=1).fit_transform(vectors_array)
embedded.shape

(212, 1)

In [37]:
embedded

array([[ 0.43917   ],
       [ 1.0122288 ],
       [ 1.0062073 ],
       [ 0.68064415],
       [ 1.4073809 ],
       [ 0.64969915],
       [ 3.2405434 ],
       [ 0.98918885],
       [ 1.4523479 ],
       [ 0.99057305],
       [ 1.4469743 ],
       [ 0.59550387],
       [ 0.2329856 ],
       [ 1.7412901 ],
       [-0.27058586],
       [-0.36856857],
       [ 2.2784061 ],
       [-1.5056518 ],
       [ 1.1880528 ],
       [-4.637502  ],
       [ 1.1476662 ],
       [-0.9879633 ],
       [ 1.224997  ],
       [-0.640774  ],
       [ 3.683114  ],
       [-0.7580635 ],
       [ 3.547471  ],
       [ 1.8150063 ],
       [-0.9003174 ],
       [-0.03651755],
       [ 1.288175  ],
       [ 0.3434171 ],
       [ 0.6674806 ],
       [-0.3839465 ],
       [ 0.80984545],
       [ 0.11835028],
       [ 0.81183046],
       [-0.7975738 ],
       [-1.517722  ],
       [ 1.1791593 ],
       [ 3.3334887 ],
       [-2.0738432 ],
       [-1.7235802 ],
       [ 3.7640326 ],
       [ 0.8714774 ],
       [ 3

In [38]:
len(embedded)

212

In [39]:
for i in range(len(embedded)):
    sorted_data[i]['tsne'] = embedded[i][0]
    
sorted_data

[{'doc_count': 2008,
  'total_count': [{'count': 0.017928286852589643, 'year': 2007},
   {'count': 0.0602589641434263, 'year': 2008},
   {'count': 0.07868525896414343, 'year': 2009},
   {'count': 0.07420318725099602, 'year': 2010},
   {'count': 0.07420318725099602, 'year': 2011},
   {'count': 0.09063745019920319, 'year': 2012},
   {'count': 0.08167330677290836, 'year': 2013},
   {'count': 0.0951195219123506, 'year': 2014},
   {'count': 0.09262948207171315, 'year': 2015},
   {'count': 0.13745019920318724, 'year': 2016},
   {'count': 0.19721115537848605, 'year': 2017}],
  'tsne': 0.43917,
  'variations': [{'n-gram': 'sequence learning',
    'years': [{'count': 1, 'year': 2008},
     {'count': 2, 'year': 2014},
     {'count': 1, 'year': 2011},
     {'count': 1, 'year': 2017}]},
   {'n-gram': 'consider learning',
    'years': [{'count': 1, 'year': 2008},
     {'count': 1, 'year': 2012},
     {'count': 1, 'year': 2017},
     {'count': 1, 'year': 2016}]},
   {'n-gram': 'optimal reinforcement

In [40]:
sorted_data.sort(key = lambda k:k['tsne'])

In [41]:
sorted_data

[{'doc_count': 7,
  'total_count': [{'count': 0.14285714285714285, 'year': 2010},
   {'count': 0.14285714285714285, 'year': 2011},
   {'count': 0.14285714285714285, 'year': 2012},
   {'count': 0.2857142857142857, 'year': 2014},
   {'count': 0.14285714285714285, 'year': 2015},
   {'count': 0.14285714285714285, 'year': 2017}],
  'tsne': -5.521504,
  'variations': [{'n-gram': 'balanced',
    'years': [{'count': 1, 'year': 2015},
     {'count': 1, 'year': 2014},
     {'count': 1, 'year': 2011},
     {'count': 1, 'year': 2010},
     {'count': 1, 'year': 2017}]},
   {'n-gram': 'balanced regime',
    'years': [{'count': 1, 'year': 2014}, {'count': 1, 'year': 2012}]}],
  'word': 'balanced'},
 {'doc_count': 20,
  'total_count': [{'count': 0.2, 'year': 2011},
   {'count': 0.15, 'year': 2012},
   {'count': 0.05, 'year': 2013},
   {'count': 0.2, 'year': 2014},
   {'count': 0.1, 'year': 2015},
   {'count': 0.1, 'year': 2016},
   {'count': 0.2, 'year': 2017}],
  'tsne': -5.509384,
  'variations': [{

In [42]:
for uni in sorted_data:
    print(uni['word'])

balanced
crowdsourcing
hilbert
sgd
adversarial
deep
interpretable
implicit
3d
forest
message
passing
pca
mixing
invariance
motor
coordinate
choice
walk
newton
poisson
hierarchical
weighted
hashing
sensing
tree
topic
infinite
compressed
manifold
permutation
active
measurement
neighbor
ranking
feature
partitioning
annotation
rational
dictionary
structural
object
segmentation
functional
coding
group
probabilistic
bayesian
regularization
human
parametric
multiple
additive
identifying
margin
short
coupled
sparse
detection
kernel
construction
matching
supervised
unlabeled
measure
process
strategy
evaluation
joint
regression
metric
sparsity
oracle
action
assumption
estimating
variable
latent
novel
classifier
structure
structured
domain
lasso
subspace
dimensional
modeling
driven
prior
aggregation
robustness
case
return
cascade
graph
learning
asynchronous
reduction
maximum
task
slow
multi
online
large
solving
decision
adaptation
minimization
noise
rank
effect
context
high
distribution
performan

In [45]:
for uni in sorted_data:
    uni.pop('tsne', None)

with open('proportions.json', 'w+') as f:
    json.dump(sorted_data, f, indent=4)