# Distance Matrix JSON

In [99]:
import numpy as np
import pandas as pd
import plotly 
import json
from toolz.curried import *

In [100]:
data = json.load(open('data/die-deutsche-russlandpolitik.json'))

In [101]:
data.keys()

dict_keys(['debates', 'theses', 'topics', 'articles_theses', 'authors', 'organisations', 'votes', 'articles'])

In [102]:
creator_votes = [{'thesis':k, 'author': v['created_by'], 'debate': v['debate'], 'vote': 1} for (k,v) in data['theses'].items()]

In [103]:
df = pd.DataFrame(creator_votes + list(map(lambda v: merge(v, v['date']), data['votes'])))

In [104]:
df['vote'] = pd.to_numeric(df['vote'])
df.head()

Unnamed: 0,author,date,debate,thesis,timezone,timezone_type,vote
0,cfade4e8a5477aa260b8846fa3beabab,,f785b4b4f4e1113a24382ef9d0ee6345,b82aae6db3207fea58e725fb381baf96,,,1
1,99fa90440348034bfa4b786d58330749,,f785b4b4f4e1113a24382ef9d0ee6345,7eeb1e37364c7ae5beba0514ee4db861,,,1
2,27f158e3bcb756ba64dbbcb67b6fd809,,f785b4b4f4e1113a24382ef9d0ee6345,18006d4d67f93a08417570942857a350,,,1
3,dda040919f3f7ae701de7a87f9d2a746,,f785b4b4f4e1113a24382ef9d0ee6345,7ea981184d7a316bc1d2eb416ee699e8,,,1
4,a235c7a82a461a6439e1760e9db185c1,,f785b4b4f4e1113a24382ef9d0ee6345,32ad5c00a9c8525fd519210b211c0f5c,,,1


In [105]:
df.groupby(['author', 'thesis']).count().max()

date             1
debate           2
timezone         1
timezone_type    1
vote             2
dtype: int64

In [106]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864 entries, 0 to 863
Data columns (total 7 columns):
author           864 non-null object
date             792 non-null object
debate           864 non-null object
thesis           864 non-null object
timezone         792 non-null object
timezone_type    792 non-null float64
vote             864 non-null int64
dtypes: float64(1), int64(1), object(5)
memory usage: 47.3+ KB


In [107]:
df.drop_duplicates(subset=['author', 'thesis'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 861 entries, 0 to 863
Data columns (total 7 columns):
author           861 non-null object
date             789 non-null object
debate           861 non-null object
thesis           861 non-null object
timezone         789 non-null object
timezone_type    789 non-null float64
vote             861 non-null int64
dtypes: float64(1), int64(1), object(5)
memory usage: 53.8+ KB


## Thesen

In [108]:
df.pivot('author', 'thesis', 'vote')

thesis,02fedbe2d6df4de8d3318b6c4f26dc99,042746ea426239e8c011fc06d2353de6,0762292f1a50b81138e24381c7ca26f8,0a0be10c91218c29bd33efa7a88abcf0,11fb72353fae0da47ec23357f4d22682,16a0aead46b7367fe80d5575ff458250,16e352d95a9b409b279a8a71f2f974f8,18006d4d67f93a08417570942857a350,195f2dc7465127cb5fc266a70b6c943f,1a3c5d2439ba1ada97975c2d285f9e0f,...,d4c61808fee5be0fc0997f58f47ba57c,db9a71ab5287d2322c4f6a71ed9b30bc,ddd3270c73090ac7e210b735002cfdef,dea3028e0fb22a44cff2c64126571c5a,e13cdd33a980f12b957f0de6b112762c,e20f67366884c8503785fd8e6e16f7a1,e8625a32095932c82cb48464d7882d5d,f2c462ed7ed1862bcb19dc23db358f1c,f2f4783c5fbc633194a50ef95fb19c76,fc9b866dc324b2930c53d5225a006e8c
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27f158e3bcb756ba64dbbcb67b6fd809,1.0,,,,0.0,,1.0,1.0,1.0,-1.0,...,,,1.0,1.0,,1.0,0.0,1.0,1.0,1.0
4c86659e2e738b5d1dcc5f24cabc7fd0,,0.0,,1.0,,1.0,,,,0.0,...,0.0,-1.0,,,0.0,0.0,,-1.0,,0.0
4ebe83c3364ff8cec5db9328b5a5d5b8,1.0,1.0,,1.0,1.0,0.0,1.0,1.0,,-1.0,...,1.0,1.0,1.0,,0.0,1.0,1.0,1.0,1.0,0.0
689977e12f0870cf59690975a9150c05,1.0,1.0,,1.0,,1.0,,1.0,,-1.0,...,1.0,1.0,,,0.0,1.0,1.0,1.0,1.0,1.0
767d3dafa632c6667c754c0fac7b7369,,,,,,,,,,,...,,1.0,,,,1.0,,1.0,,
81600ccb2792fd21468a2fe34f1d19eb,,1.0,,-1.0,,1.0,,,,1.0,...,0.0,1.0,,,1.0,0.0,,1.0,,-1.0
85153296c3d0dada0ae0c6c076e81a81,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-1.0,...,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0
99fa90440348034bfa4b786d58330749,-1.0,1.0,0.0,-1.0,0.0,1.0,0.0,-1.0,0.0,1.0,...,0.0,1.0,-1.0,,1.0,-1.0,-1.0,0.0,-1.0,-1.0
9ab185c3d6ea3081e01cb5b3601ba1ab,,1.0,,0.0,,1.0,,,,-1.0,...,1.0,1.0,,,0.0,1.0,,1.0,,-1.0
9f14fad9490b5e3319285cb0afc0b357,,,,,,,1.0,,,,...,,,,,,,,,,


In [109]:
# Ein author zu viel!
len(set(data['authors'].keys()))

18

In [110]:
utility_matrix = df.pivot('thesis', 'author', 'vote').fillna(0)

In [111]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
tsne_fit = pd.DataFrame(tsne.fit_transform(utility_matrix), columns=['x','y'])
tsne_fit.head()

Unnamed: 0,x,y
0,21.938767,-36.78686
1,54.973076,67.360115
2,35.321994,4.428292
3,53.214621,97.694654
4,117.701607,36.272411


In [112]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

print(__version__) # requires version >= 1.9.0

init_notebook_mode(connected=True)


1.12.4


In [113]:
text = [data['theses'][id]['text'] for id in utility_matrix.index]

In [114]:
import plotly.graph_objs as go

# Create a trace
trace = go.Scatter(
    x = tsne_fit.x,
    y = tsne_fit.y,
    mode='markers',
    text=text
)

plot_data = [trace]

# Plot and embed in ipython notebook!
iplot(plot_data, filename='basic-scatter')

In [115]:
from sklearn.metrics.pairwise import pairwise_distances
distance_matrix = pd.DataFrame(pairwise_distances(utility_matrix, metric='cosine'), index=utility_matrix.index, columns=utility_matrix.index)
distance_matrix['from'] = distance_matrix.index

In [116]:
dist_df = pd.melt(distance_matrix, id_vars='from', var_name='to', value_name='distance')

In [117]:
pd.DataFrame(data['theses']).T.to_csv('theses.csv',index_label='Id')
dist_df['Weight'] = (dist_df['distance']/(-2)+1)**2
dist_df.rename(columns={'from':'Source','to':'Target'}).to_csv('dist.csv')

In [118]:
dist_dict = {frm:{} for frm in distance_matrix['from']}
def assign(row):
    dist_dict[row['from']][row['to']] = row['distance']
dist_df.apply(assign, axis=1)
dist_dict

{'02fedbe2d6df4de8d3318b6c4f26dc99': {'02fedbe2d6df4de8d3318b6c4f26dc99': 3.3306690738754696e-16,
  '042746ea426239e8c011fc06d2353de6': 0.6150998205402494,
  '0762292f1a50b81138e24381c7ca26f8': 0.5527864045000421,
  '0a0be10c91218c29bd33efa7a88abcf0': 0.44529980377477085,
  '11fb72353fae0da47ec23357f4d22682': 0.3195861825602283,
  '16a0aead46b7367fe80d5575ff458250': 0.683772233983162,
  '16e352d95a9b409b279a8a71f2f974f8': 0.22222222222222243,
  '18006d4d67f93a08417570942857a350': 0.05131670194948623,
  '195f2dc7465127cb5fc266a70b6c943f': 0.18350341907227397,
  '1a3c5d2439ba1ada97975c2d285f9e0f': 1.8320502943378436,
  '20f6449434facb242265d69b02dd8afd': 0.882148869802242,
  '2131b38467ead3f9eaf6d0863a1e63f2': 1.0,
  '2342f1805fe407ea67fc415e52f22ce7': 0.29289321881345254,
  '27a346866b1b31d189c96029ff54320b': 1.2010075630518424,
  '2a588e16f2ad7cfeba2e35654ef2f6c1': 0.554564596812626,
  '2da193b3890ba80c850c7c024f1f8434': 0.7327387580875756,
  '32ad5c00a9c8525fd519210b211c0f5c': 1.36980

In [119]:
with open('data/dist_dict.json', 'w') as f:
  f.write(json.dumps(dist_dict))

In [32]:
distance_matrix = pd.DataFrame(pairwise_distances(utility_matrix, metric='cosine'), index=utility_matrix.index, columns=utility_matrix.index)


In [33]:
distance_matrix.iloc[3,3]

-4.4408920985006262e-16

## Matrix Factorization

In [34]:
from tqdm import tnrange

def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in tnrange(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if not np.isnan(R[i][j]):
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P,Q)
        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if not np.isnan(R[i][j]):
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        if e < 0.001:
            break
    return P, Q.T


In [37]:
R = [
     [5,3,np.nan,1],
     [4,np.nan,np.nan,1],
     [1,1,np.nan,5],
     [1,np.nan,np.nan,4],
     [np.nan,1,5,4],
    ]

R = np.array(R)

N = len(R)
M = len(R[0])
K = 2

P = np.random.rand(N,K)
Q = np.random.rand(M,K)

nP, nQ = matrix_factorization(R, P, Q, K)
nR = np.dot(nP, nQ.T)




In [38]:
nR

array([[ 5.0105478 ,  2.89068046,  5.21927889,  0.99780251],
       [ 3.9511667 ,  2.2849746 ,  4.2710645 ,  0.99683835],
       [ 1.09436232,  0.75499245,  4.6492241 ,  4.96331533],
       [ 0.95153438,  0.64755666,  3.78982557,  3.9739419 ],
       [ 2.12706924,  1.32121518,  4.88575215,  4.03416839]])

In [39]:
nP

array([[ 0.40927017,  2.34319339],
       [ 0.41099488,  1.83979837],
       [ 2.08372004,  0.33161155],
       [ 1.66820567,  0.30130168],
       [ 1.69115347,  0.85764445]])

In [40]:
nQ

array([[ 0.19017851,  2.10512433],
       [ 0.17074733,  1.20382666],
       [ 1.93039104,  1.89025261],
       [ 2.38034652,  0.01007074]])

### real matrix

In [82]:
utility_matrix = df.pivot('thesis', 'author', 'vote')

In [73]:
R = np.array(utility_matrix)

N = len(R)
M = len(R[0])
K = 5

P = np.random.rand(N,K)
Q = np.random.rand(M,K)

nP, nQ = matrix_factorization(R, P, Q, K)
nR = np.dot(nP, nQ.T)




In [74]:
data = [
    go.Heatmap(
        z=np.abs(np.nan_to_num(R-nR))
    )
]
iplot(data, filename='basic-heatmap')


In [75]:
np.mean(np.abs(np.nan_to_num(R-nR)))

0.15698569093921358

In [76]:
data = [
    go.Heatmap(
        z=R
    )
]
iplot(data, filename='basic-heatmap')


In [77]:
data = [
    go.Heatmap(
        z=nR
    )
]
iplot(data, filename='basic-heatmap')


In [79]:
data = [
    go.Heatmap(
        z=nP
    )
]
iplot(data, filename='basic-heatmap')


In [80]:
data = [
    go.Heatmap(
        z=nQ
    )
]
iplot(data, filename='basic-heatmap')


### nP als latente Darstellung der Thesen

In [81]:
nP

array([[  6.39591211e-01,   2.39986854e-01,   8.08704502e-01,
          4.38053927e-01,  -4.27957039e-02],
       [  6.20817418e-01,   9.34350972e-01,   3.34119573e-01,
          4.08454857e-01,   1.03028146e+00],
       [  9.20362901e-01,  -4.80256284e-02,   1.17192087e-02,
          1.05745008e+00,   3.11378667e-01],
       [  2.88953082e-01,  -5.72762849e-01,   8.11804231e-01,
          6.81341089e-01,   9.60601249e-02],
       [  8.12632210e-01,   8.06609613e-01,   1.65783958e-01,
          7.56830816e-01,  -1.22527677e-01],
       [  8.97319033e-01,   1.95215578e-01,  -6.67385173e-04,
          6.95171687e-01,   5.31379489e-01],
       [  8.19955375e-01,   2.59813012e-01,   7.41302234e-01,
          1.55355386e-01,   2.25386973e-01],
       [  5.31906119e-01,   2.91207982e-02,   1.01235903e+00,
          2.30491415e-01,   4.10569977e-02],
       [  8.92553741e-01,   4.38158765e-01,   6.31628085e-01,
          3.39725567e-01,   2.12520749e-01],
       [ -4.13920906e-01,  -1.6752849

In [96]:
distance_matrix = pd.DataFrame(pairwise_distances(nP, metric='cosine'), index=utility_matrix.index, columns=utility_matrix.index)
distance_matrix['from'] = distance_matrix.index
dist_df = pd.melt(distance_matrix, id_vars='from', var_name='to', value_name='distance')
dist_dict = {frm:{} for frm in distance_matrix['from']}
def assign(row):
    dist_dict[row['from']][row['to']] = row['distance']
dist_df.apply(assign, axis=1)
dist_dict

{'02fedbe2d6df4de8d3318b6c4f26dc99': {'02fedbe2d6df4de8d3318b6c4f26dc99': 2.220446049250313e-16,
  '042746ea426239e8c011fc06d2353de6': 0.44463749759224624,
  '0762292f1a50b81138e24381c7ca26f8': 0.3708098476220585,
  '0a0be10c91218c29bd33efa7a88abcf0': 0.2992956335525616,
  '11fb72353fae0da47ec23357f4d22682': 0.2557945087272008,
  '16a0aead46b7367fe80d5575ff458250': 0.37973036960817186,
  '16e352d95a9b409b279a8a71f2f974f8': 0.0705105215713896,
  '18006d4d67f93a08417570942857a350': 0.05497859724357512,
  '195f2dc7465127cb5fc266a70b6c943f': 0.07007063368255573,
  '1a3c5d2439ba1ada97975c2d285f9e0f': 1.9539130935356046,
  '20f6449434facb242265d69b02dd8afd': 0.617142806626056,
  '2131b38467ead3f9eaf6d0863a1e63f2': 0.8248357440517224,
  '2342f1805fe407ea67fc415e52f22ce7': 0.13672498313209092,
  '27a346866b1b31d189c96029ff54320b': 1.1961215499453122,
  '2a588e16f2ad7cfeba2e35654ef2f6c1': 0.3581528621021072,
  '2da193b3890ba80c850c7c024f1f8434': 0.6362194642739324,
  '32ad5c00a9c8525fd519210b21

In [97]:
data = [
    go.Heatmap(
        z=np.array(distance_matrix)
    )
]
iplot(data, filename='basic-heatmap')

In [98]:
with open('data/dist_dict.json', 'w') as f:
  f.write(json.dumps(dist_dict))