In [1]:
import urllib.request
import io
import gzip


# Download each gzip-compressed dataset from the SNAP server and store the
# decompressed bytes in the working directory under the dataset's own name.
for file in ['web-NotreDame.txt']:
    print ('Downloading compressed image of', file)
    # The context manager closes the HTTP response as soon as the payload has
    # been read, including when the transfer fails part-way.
    with urllib.request.urlopen("https://snap.stanford.edu/data/" + file + ".gz") as source:
        decompressed = gzip.decompress(source.read())

    # The output file is only opened once decompression has succeeded, so a
    # failed download never leaves a truncated file behind.
    with open(file, 'wb') as outfile:
        outfile.write(decompressed)
    print ('Saved', file)

Downloading compressed image of web-NotreDame.txt
Saved web-NotreDame.txt


In [2]:
import pandas as pd

In [3]:
# Read the tab-separated edge list. The first three lines are skipped and the
# next one is used as the header; ' ' and '#' are trimmed from both ends of
# every column name.
df = pd.read_csv("web-NotreDame.txt", sep='\t', skiprows=[0, 1, 2])
df.columns = [name.strip(' #') for name in df.columns]

# Keep only the edges whose two endpoints both have an id below 10000.
df = df[(df['FromNodeId'] < 10000) & (df['ToNodeId'] < 10000)]

# Number of retained outgoing edges per source node.
df1 = (
    df.groupby(['FromNodeId'])['ToNodeId']
    .count()
    .reset_index(name='count')
    .head(10000)
)

# Each outgoing edge of a node carries the reciprocal of that node's edge count.
df2 = (1 / df1['count']).to_frame('weight')
df1 = df1.assign(weight=df2['weight'])

# Attach the per-source count and weight to every edge.
df = df.merge(df1, on='FromNodeId')

In [4]:
from scipy.sparse import coo_matrix
import numpy as np
# Build the dense 10000 x 10000 matrix used by the ranking step: entry
# (ToNodeId, FromNodeId) holds the weight of that edge, and coo_matrix sums
# the weights of any repeated (row, column) pair.
# Columns are selected by name with explicit dtypes, so node ids stay integers
# rather than being upcast to float by a mixed-dtype `.values` array.
row = df['ToNodeId'].values.astype(np.int64)
column = df['FromNodeId'].values.astype(np.int64)
data = df['weight'].values.astype(np.float64)
M = coo_matrix((data,(row,column)),shape=(10000,10000)).toarray()

In [5]:
def pagerank(M, alpha, num_iter):
    """Run a fixed number of damped power-iteration steps on ``M``.

    Starting from a vector of ones, every step replaces the current vector
    ``pr`` with ``alpha * (M @ pr) + (1 - alpha)``. The vector is not
    renormalised between steps.

    Parameters
    ----------
    M : square 2-D matrix supporting ``@`` with a 1-D vector
        Link matrix; ``M.shape[0]`` sets the length of the score vector.
    alpha : float
        Multiplier applied to the propagated scores; ``1 - alpha`` is added
        to every entry on each step.
    num_iter : int
        Number of update steps. With ``0`` the initial vector of ones is
        returned unchanged.

    Returns
    -------
    numpy.ndarray
        Score vector of length ``M.shape[0]``.
    """
    pr = np.ones(M.shape[0])
    for _ in range(num_iter):
        # `*` and `@` share precedence and associate left to right, so the
        # unparenthesised `alpha * M @ pr` would first build a scaled copy of
        # the whole matrix on every pass. Scaling the product vector instead
        # is mathematically identical and avoids that allocation.
        pr = alpha * (M @ pr) + (1 - alpha)
    return pr

# Step 3.3

In [6]:
# Show the 20 x 20 window of M covering rows and columns 10 through 29.
M[10:30][:, 10:30]

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  

In [7]:
# Score every node: multiplier 0.85, 15 update steps.
pr = pagerank(M, alpha=0.85, num_iter=15)
pr

array([  2.24702638e+02,   2.79759111e+01,   1.14034108e+01, ...,
         1.57231541e-01,   1.57231541e-01,   1.57231541e-01])

In [8]:
# One row per node, in the order of the score vector; the row label is the node's position in `pr`.
max_df = pd.DataFrame({'pagerank': pr})

# The ten highest scores, largest first.
max_df_top10 = max_df.sort_values(by='pagerank', ascending=False).iloc[:10]

# Expose the row label as an explicit 'id' column for display.
max_df_top10.rename_axis('id').reset_index()

Unnamed: 0,id,pagerank
0,0,224.702638
1,1973,189.250314
2,1790,53.438593
3,1828,50.954873
4,1,27.975911
5,238,26.779136
6,140,23.520898
7,14,22.232264
8,16,21.591054
9,162,18.283386
