### Training a prototype neural network for scoring person and job.

In [1]:
import tensorflow as tf
import numpy as np
import random as r
import json
import pickle as p
from matplotlib import pyplot as plt
from urllib import parse as urlparse
from urllib import request as urlreq
from os import path
from collections import defaultdict as dd

%matplotlib inline

In [2]:
# Create person-ids dictionary pickle
names_ids_dict = {}
if not path.exists('people_ids.pickle'):
    with open('persons','r') as names_ids_file:
        for names_ids in names_ids_file.readlines():
            name, ids = names_ids.strip().split('\t')
            names_ids_dict[name] = '/' + ids.replace('.', '/')
    with open('people_ids.pickle', 'wb') as pfile:
        p.dump(names_ids_dict, pfile)
else:
    with open('people_ids.pickle', 'rb') as names_ids_pfile:
        names_ids_dict = p.load(names_ids_pfile)

In [7]:
api_key = open('../.knowledge_graph_api_key').read()
params = {
    'indent': True,
    'key': api_key,
}
service_url = 'https://kgsearch.googleapis.com/v1/entities:search?'
def get_score(ids):
    params['ids'] = ids
    url = service_url + urlparse.urlencode(params)
    with urlreq.urlopen(url) as response:
        data = json.loads(response.read().decode('utf8'))
        info = data['itemListElement'][0]
    return info['resultScore']

In [4]:
names_ids_dict['Alfred Einstein']

'/m/049r7v'

In [5]:
names_ids_dict['Albert Einstein']

'/m/0jcx'

In [24]:
names_scores = {}
with open('profession.train') as labeled_data:
    for sample in labeled_data.readlines():
        name, job, label = sample.strip().split('\t')
        ids = names_ids_dict[name]
        score = get_score(ids)
        names_scores[(name, job)] = (score, label)

In [9]:
names_ids_dict['Barack Obama']

'/m/02mjmr'

In [10]:
names_scores['Barack Obama']

32.389492

In [11]:
names_scores['Albert Einstein']

27.779625

In [13]:
with open('dict_name_cnt.pickle', 'rb') as pfile:
    names_freq = p.load(pfile)

In [14]:
names_freq['Albert Einstein']

13737

In [15]:
names_freq['Barack Obama']

71605

In [16]:
max(names_freq.values())

95935

In [17]:
with open('./profession_w2v.pickle', 'rb') as f:
    profession_w2v = p.load(f)

In [19]:
names_joblist = dd(list) 
with open('./profession.kb', 'r') as f:
    for sample in f.readlines():
        name, job = sample.strip().split('\t')
        names_joblist[name].append(job)

In [20]:
len(names_joblist)

343329

In [22]:
names_joblist['Albert Einstein']

['Author', 'Mathematician', 'Philosopher', 'Teacher', 'Theoretical Physicist']

In [23]:
len(names_scores)

134

In [26]:
profession_w2v['Professor']

array([ 3.06202316,  4.85025024,  0.55623037, -6.35143995, -0.65076351,
        0.64539635,  2.96383715, -0.22256818, -6.10313988,  0.47862935,
       -1.65601993, -1.4736923 ,  2.82660174, -1.09211802,  1.5448705 ,
       -4.38430452, -2.20745111, -1.39417171, -2.69073033,  1.87968457,
        0.34954807,  0.58635795, -2.61929512,  1.3013835 ,  2.09654403,
        2.21423435,  3.87303448,  1.01058924, -1.4853574 , -2.92127323,
        4.62940311,  3.25305891,  6.47758627,  0.61226898,  0.78855312,
        1.84114373, -3.412678  , -1.30692172, -4.30531788, -0.14468956,
        0.2572335 , -3.7247107 , -4.82165909, -0.28476852,  0.87647051,
       -1.42903805,  2.50252151,  1.74918342, -2.67284966, -4.29282236,
        0.3106907 , -0.56299055,  1.01313806, -1.63501036, -2.73197818,
       -4.21006823,  0.36322132,  1.96318483,  3.31779861,  0.3462092 ,
       -1.30998445,  6.12214088,  1.82297206,  1.70711005,  3.6401825 ,
        2.3958292 ,  2.21079516,  0.325555  ,  5.69937563, -3.97

In [27]:
profession_w2v['Theoretical Physicist']

array([-1.96145678,  4.22292376,  2.71874714, -4.09462786,  1.813833  ,
       -0.78951454,  0.32536054, -0.78995562, -5.10077953, -0.59731132,
        1.13132262, -2.97912455,  3.93992805,  1.03806508,  2.83025479,
        0.19098717, -1.91439795, -2.37832451, -3.22264791,  5.40835619,
        0.43746001,  2.71754694,  1.26511693,  1.57078791, -2.09403205,
        0.17733137,  4.02483177,  0.99346387,  1.38087225,  0.82718992,
       -1.13268256, -0.03801873,  4.49029779,  5.41816425,  1.63054037,
        1.50856614, -3.81361961, -1.33620536,  1.50664783,  2.42522001,
        1.52048206, -2.25191522, -1.73066425,  1.92624557, -0.29228342,
        1.9584676 , -0.30635029, -0.09670854,  0.51553339, -3.79655838,
       -1.52897811, -0.35884809,  1.77354503, -3.45378923, -4.95416069,
       -3.46910906,  0.04068673, -2.93274927,  1.04367602, -3.24327683,
        0.97585928,  4.07366133,  1.02184308,  0.33591336,  2.42824984,
        6.0037365 ,  3.65321541, -1.31445396,  3.53790045,  0.77

In [28]:
def similarity(w1, w2):
    v1 = w1 / np.sqrt(sum(i*i for i in w1))
    v2 = w2 / np.sqrt(sum(i*i for i in w2))
    return np.dot(v1, v2)

In [29]:
similarity(profession_w2v['Theoretical Physicist'], profession_w2v['Professor'])

0.59475958

In [31]:
for j in names_joblist['Albert Einstein']:
    print(similarity(profession_w2v['Professor'], profession_w2v[j]))

0.45407
0.576939
0.485823
0.467878
0.59476


In [33]:
0.45407 + 0.576939 + 0.485823 + 0.467878 + 0.59476

2.5794699999999997

In [39]:
names_scores[('Albert Einstein', 'Theoretical Physicist')]

(27.779625, '7')

In [66]:
train_names = r.sample(names_scores.keys(), 412)

In [88]:
names_scores[('Mark Ciardi', 'Film Producer')]

(10.888819, '5')

In [80]:
test_names = list()
for i in names_scores.keys():
    if i not in train_names:
        test_names.append(i)

In [85]:
len(train_names)

412

In [87]:
train_names[1]

('Mark Ciardi', 'Film Producer')

In [91]:
train_data = np.ndarray(shape=(412,2), dtype=np.float32)
train_label = np.ndarray(shape=(412,8), dtype=np.float32)
train_label = np.zeros_like(train_label)

In [93]:
for i, name_job in enumerate(train_names):
    name, job = name_job
    pr_score, score = names_scores[name_job]
    job_sim = 0.0
    for jobs in names_joblist[name]:
        job_sim += similarity(profession_w2v[job], profession_w2v[jobs])
    job_sim /= len(names_joblist[name])
    train_data[i] = [pr_score, job_sim]
    train_label[i][int(score)] = 1

In [94]:
train_names[1]

('Mark Ciardi', 'Film Producer')

In [95]:
train_data[1]

array([ 10.88881874,   0.54017627], dtype=float32)

In [96]:
names_joblist['Mark Ciardi']

['Actor', 'Baseball player', 'Film Producer']

In [97]:
train_label[1]

array([ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.], dtype=float32)

In [99]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

In [100]:
X, y = train_data, train_label

In [115]:
predictor = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X, y)

In [116]:
predictor.predict(X[0:20])

array([[0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0]])

In [182]:
x = tf.placeholder(tf.float32, shape=[None, 2])
y_ = tf.placeholder(tf.float32, shape=[None, 8])

In [183]:
W1 = tf.Variable(tf.zeros([2,8]))
b = tf.Variable(tf.zeros([8]))

In [185]:
y = tf.nn.relu(tf.matmul(x, W1) + b)
xentropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))
train_step = tf.train.GradientDescentOptimizer(0.005).minimize(xentropy)

In [186]:
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    for i in range(10000):
        sess.run([train_step], feed_dict={x: train_data, y_: train_label})
    result = sess.run([y], feed_dict={x: train_data})[0]

In [132]:
prscores = [i for i,j in train_data]

In [135]:
mprs = max(prscores)

In [136]:
prscores = [i / mprs for i in prscores]

In [137]:
for i, _ in enumerate(train_data):
    train_data[i][0] = prscores[i]

In [140]:
train_data[6]

array([ 0.31779689,  0.7519235 ], dtype=float32)

In [141]:
train_label[6]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.], dtype=float32)

In [148]:
from sklearn.metrics import accuracy_score

In [180]:
result = result[0]

In [152]:
result[1]

array([ 0.4005281 ,  0.51578689,  0.49795324,  0.0718751 ,  0.07115686,
        0.22372749,  0.53581291,  0.97081405], dtype=float32)

In [189]:
result[0:20]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)

In [154]:
np.argmax(result[0])

7

In [188]:
[np.argmax(i) for i in result[0:10]]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [181]:
[np.argmax(i) for i in result[0:200]]

[7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7]

In [157]:
train_data[0:10]

array([[ 0.18870281,  0.71771723],
       [ 0.3361837 ,  0.54017627],
       [ 0.41284445,  0.64171016],
       [ 0.70786339,  0.56475031],
       [ 0.60122073,  0.69736683],
       [ 0.58144784,  0.61243641],
       [ 0.31779689,  0.7519235 ],
       [ 0.50381017,  0.66539717],
       [ 0.68879318,  0.53201813],
       [ 0.65737492,  0.81299627]], dtype=float32)

In [158]:
[np.argmax(i) for i in train_label[0:10]]

[3, 5, 5, 1, 1, 0, 7, 7, 6, 2]

In [167]:
c = 0
for i in train_label:
    if np.argmax(i) == 7:
        c += 1

In [168]:
c

87

In [190]:
train_names[3]

('Ewan McGregor', 'Singer')

In [191]:
names_joblist['Ewan McGregor']

['Actor', 'Screenwriter', 'Singer', 'Television Producer', 'Voice Actor']