In [1]:
import numpy as np

In [2]:
B = 0.7
M = np.array([[0.0, 0.0, 0.0],
              [0.5, 0.0, 0.0],
              [0.5, 1.0, 1.0]])

In [3]:
def page_rank(M, tax, n_iter=100, E=None):
    """
    Parameters
    ---------
        M : array of shape=(n, n), connectivity matrix.
        tax : amound of tax, for example 0.8
        E : epsilon, min difference between previous and current rank vector to proceed the next iteration
    """
    r = np.ones(shape=M.shape[0]) / M.shape[0]
    r_new = np.zeros_like(r)
    e = np.ones_like(r) / M.shape[0]
    
    for i in range(n_iter):
        r_new = tax * M@r + (1-tax) * e
        print(f'After iter {i+1}: {r}')
        
        # check convergance
        if E is not None:
            if len(np.nonzero(r - r_new < E)[0]) == len(r):
                print(f'Converged after {i+1} iterations')
                break
        
        r = r_new
        
    return r

In [4]:
r_res = page_rank(M, tax=1.0, n_iter=100)

After iter 1: [0.33333333 0.33333333 0.33333333]
After iter 2: [0.         0.16666667 0.83333333]
After iter 3: [0. 0. 1.]
After iter 4: [0. 0. 1.]
After iter 5: [0. 0. 1.]
After iter 6: [0. 0. 1.]
After iter 7: [0. 0. 1.]
After iter 8: [0. 0. 1.]
After iter 9: [0. 0. 1.]
After iter 10: [0. 0. 1.]
After iter 11: [0. 0. 1.]
After iter 12: [0. 0. 1.]
After iter 13: [0. 0. 1.]
After iter 14: [0. 0. 1.]
After iter 15: [0. 0. 1.]
After iter 16: [0. 0. 1.]
After iter 17: [0. 0. 1.]
After iter 18: [0. 0. 1.]
After iter 19: [0. 0. 1.]
After iter 20: [0. 0. 1.]
After iter 21: [0. 0. 1.]
After iter 22: [0. 0. 1.]
After iter 23: [0. 0. 1.]
After iter 24: [0. 0. 1.]
After iter 25: [0. 0. 1.]
After iter 26: [0. 0. 1.]
After iter 27: [0. 0. 1.]
After iter 28: [0. 0. 1.]
After iter 29: [0. 0. 1.]
After iter 30: [0. 0. 1.]
After iter 31: [0. 0. 1.]
After iter 32: [0. 0. 1.]
After iter 33: [0. 0. 1.]
After iter 34: [0. 0. 1.]
After iter 35: [0. 0. 1.]
After iter 36: [0. 0. 1.]
After iter 37: [0. 0. 1.]

In [5]:
a, b, c = r_res[0], r_res[1], r_res[2]

In [6]:
print(b, 0.575*a + 0.15*c)
print(0.95*a, 0.9*c + 0.05*b)
print(0.85*c, b + 0.575*a)
print(a, c + 0.15*b)

0.0 0.14999999999999997
0.0 0.8999999999999999
0.8499999999999999 0.0
0.0 0.9999999999999999


In [7]:
def ts_page_rank(M, tax, S=None, n_iter=1, E=None):
    """
    Parameters
    ---------
        M : array of shape=(n, n), connectivity matrix.
        tax : amound of tax, 
            that is the probability of random walking. 1-tax is the probability that it jumps to some node in S.
        S : array of shape=(n,), with ones indicating nodes in topic set.
        n_iter : int, number of iterations
        E : epsilon, min difference between previous and current rank vector to proceed the next iteration
    """
    r = np.ones(shape=M.shape[0]) / M.shape[0]
    r_new = np.zeros_like(r)
    
    if S is None:
        S = np.ones(shape=M.shape[0])
    S = S / np.sum(S)
    
    for i in range(n_iter):
        r_new = tax * M@r + (1-tax) * S
        print(f'After iter {i+1}: {r_new}')
        
        # check convergance
        if E is not None:
            if len(np.nonzero(r - r_new < E)[0]) == len(r):
                print(f'Converged after {i+1} iterations')
                break
        
        r = r_new
        
    return r

In [8]:
r_res = ts_page_rank(M, tax=0.85, n_iter=10, E=0.001)

After iter 1: [0.05       0.19166667 0.75833333]
After iter 2: [0.05    0.07125 0.87875]
After iter 3: [0.05    0.07125 0.87875]
Converged after 3 iterations


In [9]:
def hubs_authorities(M, tax, n_iter=1, E=None):
    n = M.shape[0]
    a = np.ones(shape=n)
    h = np.ones(shape=n)
    e = np.ones(shape=n) / n
    # initialize
    a = a / np.sqrt(n)
    h = h / np.sqrt(n)
    A = tax * M + (1-tax) * e
    
    for i in range(n_iter):
        # calc new values
        h_new = A@a
        a_new = A.T@h
        
        # normalize
        h_new = h_new / np.sum(h_new)
        a_new = a_new / np.sum(a_new)
         
        print(f'After iter {i+1}: hub scores: {h_new}, auth scores: {a_new}')
        
        # check convergance
        if E is not None:
            if len(np.nonzero((h - h_new)**2 < E)[0]) == len(h) and len(np.nonzero((a - a_new)**2 < E)[0]) == len(a):
                print(f'Converged after {i+1} iterations')
                break
        
        h = h_new
        a = a_new
        
    return h, a
    

In [10]:
tax = 1.0
M = np.array([[1.0, 1.0, 1.0],
              [1.0, 0.0, 1.0],
              [0.0, 1.0, 0.0]])

hubs_authorities(M, tax=tax, n_iter=3, E=0.01)

After iter 1: hub scores: [0.5        0.33333333 0.16666667], auth scores: [0.33333333 0.33333333 0.33333333]
After iter 2: hub scores: [0.5        0.33333333 0.16666667], auth scores: [0.35714286 0.28571429 0.35714286]
Converged after 2 iterations


(array([0.5       , 0.33333333, 0.16666667]),
 array([0.33333333, 0.33333333, 0.33333333]))

# Pearson

In [11]:
import numpy as np
from scipy.spatial.distance import cosine

def pearson(u, v):
    non_missing_u = np.nonzero(u > 0)[0]
    non_missing_v = np.nonzero(v > 0)[0]
    
    avg_u = np.sum(u) / len(non_missing_u)
    avg_v = np.sum(v) / len(non_missing_v)
    print(f'avg_u: {avg_u}')
    print(f'avg_v: {avg_v}')
    
    u[non_missing_u] = u[non_missing_u] - avg_u
    v[non_missing_v] = v[non_missing_v] - avg_v
    print(f'u: {u}')
    print(f'v: {v}')
    return 1 - cosine(u, v)

In [12]:
a = np.array([4.0, 3.0, 4.0, 0.0, 0.0, 1.0])
b = np.array([4.0, 1.0, 2.0, 2.0, 0.0, 1.0])
pearson(a, b)

avg_u: 3.0
avg_v: 2.0
u: [ 1.  0.  1.  0.  0. -2.]
v: [ 2. -1.  0.  0.  0. -1.]


0.6666666666666666

In [13]:
a = np.array([4.0, 3.0, 4.0, 0.0, 0.0, 1.0])
c = np.array([4.0, 0.0, 0.0, 3.0, 1.0, 0.0])
pearson(a, c)

avg_u: 3.0
avg_v: 2.6666666666666665
u: [ 1.  0.  1.  0.  0. -2.]
v: [ 1.33333333  0.          0.          0.33333333 -1.66666667  0.        ]


0.25197631533948495

# Text mining

TF

In [14]:
TF = np.array([[1, 0, 1, 0, 1, 0, 1, 0, 0],
               [1, 0, 0, 0, 1, 1, 1, 1, 0],
               [0, 0, 0, 0, 1, 1, 1, 0, 1],
               [1, 1, 0, 1, 1, 0, 0, 0, 1]])

q = np.array([0, 0, 0, 0, 0, 1, 1, 0, 0])

In [15]:
def tf_cos(TF, q):
    for i in range(len(TF)):
        print(1 - cosine(TF[i], q))

In [16]:
tf_cos(TF, q)

0.35355339059327373
0.6324555320336758
0.7071067811865476
0.0


TF-IDF

In [28]:
import numpy as np

N = TF.shape[0]
n = np.array([np.sum(TF[:, i]) for i in range(TF.shape[1])])

TF_IDF = np.ones_like(TF, dtype=np.float32)

for i in range(TF.shape[0]):
    for j in range(TF.shape[1]):
        
        tf = TF[i, j]
        idf = np.log(N / n[j])
        
        TF_IDF[i, j]= round(tf * idf, 3)

        
TF_IDF

array([[0.288, 0.   , 1.386, 0.   , 0.   , 0.   , 0.288, 0.   , 0.   ],
       [0.288, 0.   , 0.   , 0.   , 0.   , 0.693, 0.288, 1.386, 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.693, 0.288, 0.   , 0.693],
       [0.288, 1.386, 0.   , 1.386, 0.   , 0.   , 0.   , 0.   , 0.693]],
      dtype=float32)

In [42]:
q_tfidf = np.array([q[i] * np.log(N / n[i]) for i in range(len(q))], dtype=np.float32)
q_tfidf

array([0.       , 0.       , 0.       , 0.       , 0.       , 0.6931472,
       0.2876821, 0.       , 0.       ], dtype=float32)

In [45]:
def tfidf_cos(TF_IDF, q_tfidf):

    for i in range(len(TF_IDF)):
        
        print(1 - cosine(TF_IDF[i], q_tfidf))

In [46]:
tfidf_cos(TF_IDF, q_tfidf)

0.07642218470573425
0.46838653087615967
0.7346736192703247
0.0
