In [None]:
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize

In [None]:
def hostnames_list(filename):
    """Read the second whitespace-separated field of every line in a file.

    Each non-blank line is split on whitespace and its second field is
    collected, in file order.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of str
        The second field of every non-blank line.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than two fields.
    """
    hostnames=[]
    with open(filename, 'r') as file:
        for line in file:
            fields=line.split()
            # Blank lines (e.g. an empty line at the end of the file) carry
            # no record; skipping them avoids an IndexError on fields[1].
            if not fields:
                continue
            hostnames.append(fields[1])
    return hostnames

In [None]:
# Load the hostnames (second field of each line) from the hostnames file.
hostnames=hostnames_list('data/webspam-uk2007-set1-1.0/WEBSPAM-UK2007-hostnames.txt')

In [None]:
def read_graph(filename):
    """Build a sparse transition matrix from a text adjacency file.

    The first line of the file is parsed as an integer ``size``. Every
    following line describes one row (rows are numbered in file order,
    starting at 0) as whitespace-separated tokens; the part of each token
    before the first ``:`` is the integer column index. Anything after the
    ``:`` is ignored.

    The matrix has shape ``(size + 1, size + 1)``; the extra last row/column
    acts as a sink:

    * a row with ``k > 0`` tokens gets ``1 / k`` in each listed column
      (a column listed twice is assigned, not accumulated);
    * a row with no tokens gets a single ``1`` in the last column;
    * the last diagonal entry is set to ``1``.

    Parameters
    ----------
    filename : str
        Path of the adjacency file.

    Returns
    -------
    scipy.sparse CSR matrix
        The assembled matrix, converted from LIL to CSR.
    """
    with open(filename, 'r') as file:
        size = int(file.readline())
        mat = sparse.lil_matrix((size + 1, size + 1))
        for row, raw in enumerate(file):
            entries = raw.split()
            degree = len(entries)
            if degree == 0:
                # No outlinks listed: send the whole row to the sink column.
                mat[row, -1] = 1
                continue
            weight = 1 / degree
            for entry in entries:
                target = int(entry.split(':')[0])
                mat[row, target] = weight
    # The sink keeps everything it receives.
    mat[-1, -1] = 1
    return mat.tocsr()

In [None]:
def PR_iteration(old_pr,R,n,alpha):
    """Apply one PageRank update step.

    Computes ``alpha / n * 1 + (1 - alpha) * R.T @ old_pr``.

    Parameters
    ----------
    old_pr : array of length n
        Current rank vector.
    R : matrix with ``.T`` and ``.dot`` (e.g. a scipy sparse matrix)
        Transition matrix; its transpose is applied to ``old_pr``.
    n : int
        Length of the rank vector.
    alpha : float
        Weight of the uniform term; ``1 - alpha`` weights the propagated term.

    Returns
    -------
    numpy.ndarray
        The updated rank vector.
    """
    # Scale the length-n product instead of the matrix: `(1 - alpha) * R.T`
    # would build a scaled copy of the whole matrix on every call.
    propagated=(1-alpha)*R.T.dot(old_pr)
    new_pr=np.full(n, alpha/n)+propagated
    return new_pr

In [None]:
def compute_PR(alpha,epsilon,R):
    """Iterate ``PR_iteration`` from a random start until the update is small.

    The starting vector is drawn with ``np.random.rand`` and scaled to sum
    to 1. After each step the L1 distance between consecutive vectors is
    printed; iteration stops once that distance is no longer greater than
    ``epsilon``.

    Parameters
    ----------
    alpha : float
        Forwarded to ``PR_iteration``.
    epsilon : float
        Stopping threshold on the L1 change between iterations.
    R : sparse matrix
        Forwarded to ``PR_iteration``; its first dimension gives the vector
        length.

    Returns
    -------
    numpy.ndarray
        The last computed vector.
    """
    n = R.get_shape()[0]

    # Random starting point, rescaled so its entries sum to 1.
    current = np.random.rand(n)
    current /= current.sum()

    err = np.inf
    while err > epsilon:
        updated = PR_iteration(current, R, n, alpha)
        err = np.abs(updated - current).sum()
        print(f"Error:{err}")
        current = updated
    return current

In [None]:
def slice_matrix(matrix, indices):
    """Extract the sub-matrix selected by ``indices`` and L1-normalise its rows.

    Entry ``(a, b)`` of the result comes from
    ``matrix[indices[a], indices[b]]``; each row is then divided by the sum
    of the absolute values of its entries.

    Parameters
    ----------
    matrix : scipy sparse matrix
        Source matrix.
    indices : sequence of int
        Row/column indices to keep, in the order they should appear.

    Returns
    -------
    scipy.sparse CSR matrix
        The ``len(indices) x len(indices)`` row-normalised sub-matrix.
    """
    indices=np.asarray(indices, dtype=np.intp)
    # Direct fancy indexing selects the same entries as multiplying by
    # 0/1 selector matrices on both sides, without building them.
    new_matrix=matrix.tocsr()[indices, :][:, indices]
    # Use the returned matrix: normalize only works in place for some inputs
    # (see its `copy` parameter), so the argument may be left unnormalised.
    return normalize(new_matrix, norm='l1', axis=1, copy=False)

In [None]:
# Build the sparse (CSR) transition matrix from the host graph file.
R=read_graph('data/uk-2007-05.hostgraph_weighted_graph.txt')

In [None]:
# Show the matrix type, then iterate with alpha=0.2 until the L1 change
# between consecutive vectors is at most 0.01.
print(type(R))
PR=compute_PR(0.2,0.01,R)

In [None]:
# Drop the last entry of PR (the extra node appended by read_graph) and
# rescale the remaining values so they sum to 1.
rank = np.asarray(PR).squeeze()[:-1]
rank = rank / rank.sum()
print(rank)

In [None]:
# Number of top-ranked hosts kept for the reduced graph.
n_relevant = 20000

# Plot the rank of every host and report the single best one.
plt.plot(range(len(rank)), rank)
top_idx = np.argmax(rank)
print(top_idx, hostnames[top_idx], np.max(rank))

# Positions of the n_relevant largest ranks, in descending rank order.
indices = np.argsort(-rank)[:n_relevant]
print(indices[1])
relevant_hostnames = [hostnames[index] for index in indices]

# Append index len(rank), i.e. the last row/column of R, before slicing.
indices = np.append(indices, len(rank))
print(indices[-1])
new_R = slice_matrix(R, indices)
print(type(new_R))

In [None]:
def PRM_iteration(old_prm,R,n,alpha):
    """Apply one update step to an ``n x n`` matrix of rank vectors.

    Computes ``alpha / n * I + ((1 - alpha) * R.T @ old_prm.T).T``, where
    ``I`` is the ``n x n`` identity.

    Parameters
    ----------
    old_prm : array of shape (n, n)
        Current matrix.
    R : matrix with ``.T`` (e.g. a scipy sparse matrix)
        Transition matrix.
    n : int
        Size of the identity term.
    alpha : float
        Weight of the identity term; ``1 - alpha`` weights the propagated term.

    Returns
    -------
    array of shape (n, n)
        The updated matrix.
    """
    damped_transpose = (1 - alpha) * R.T
    propagated = damped_transpose.dot(old_prm.T).T
    identity_term = alpha / n * np.eye(n)
    return identity_term + propagated

In [None]:
def compute_PRM(alpha,epsilon,R):
    """Iterate ``PRM_iteration`` from a random start until the update is small.

    Prints the matrix size, then starts from an ``n x n`` matrix drawn with
    ``np.random.rand`` and scaled so that all entries together sum to 1.
    After each step the summed absolute change is printed; iteration stops
    once that change is no longer greater than ``epsilon``.

    Parameters
    ----------
    alpha : float
        Forwarded to ``PRM_iteration``.
    epsilon : float
        Stopping threshold on the summed absolute change.
    R : sparse matrix
        Forwarded to ``PRM_iteration``; its first dimension gives ``n``.

    Returns
    -------
    array
        Column sums (``sum(axis=0)``) of the final matrix.
    """
    n = R.get_shape()[0]
    print(n)

    # Dense n x n random start, rescaled so the total mass is 1.
    current = np.random.rand(n, n)
    current /= current.sum()

    err = np.inf
    while err > epsilon:
        updated = PRM_iteration(current, R, n, alpha)
        err = np.abs(updated - current).sum()
        print(f"Error:{err}")
        current = updated
    return current.sum(axis=0)

In [None]:
# Run the matrix iteration on the reduced graph (alpha=0.2, tolerance 0.01);
# compute_PRM returns the column sums of the final matrix.
PRM=compute_PRM(0.2,0.01,new_R)

In [None]:
# Display the result as the cell output.
PRM

In [None]:
# Convert PRM to an ndarray and drop any singleton dimensions.
new_rank=np.squeeze(np.asarray(PRM))

In [None]:
# Re-derive from PRM rather than from new_rank itself, so that re-running
# this cell does not strip one more trailing element each time.
# Drop the last entry and rescale the rest to sum to 1.
new_rank=np.squeeze(np.asarray(PRM))[:-1]
new_rank=new_rank/new_rank.sum()

In [None]:
# Plot the ranks on the reduced graph and report the best host.
plt.plot(range(0,len(new_rank)), new_rank)
print(relevant_hostnames[np.argmax(new_rank)],np.argmax(new_rank), np.max(new_rank))
print(new_R)
# Positions of the 20 largest ranks, best first.
# NOTE: this rebinds `indices`, which an earlier cell used for the slice.
indices = (-new_rank).argsort()[:20]
# Blank separator line. A bare `print` is a no-op expression in Python 3,
# so the call parentheses are required.
print()
for i in indices:
    print(relevant_hostnames[i])