In [116]:
import os
import random
import re
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn; seaborn.set_style('whitegrid')
from collections import Counter

from pomegranate import *

# Numeric suffix of the corpus directory to crawl (the notebook reads f'corpus{corpus_index}').
corpus_index = 2
# Damping factor: share of each transition that follows one of the current page's links;
# the remaining 1 - DAMPING is spread evenly over every page in the corpus.
DAMPING = 0.85
# Length of the state sequence drawn from the Markov chain when estimating visit frequencies.
SAMPLES = 10000

In [117]:
# Show the kernel's working directory; the relative corpus path used below resolves against it.
os.getcwd()

'/home/philip/Learning/Computer_Science/CS50_Python/Projects/pagerank'

In [118]:

def crawl(directory):
    """
    Build the link graph for a directory of HTML pages.

    Every file in `directory` whose name ends in ".html" becomes a key of the
    returned dictionary. Its value is the set of href targets found in that
    file's <a> tags, restricted to names that are themselves keys (i.e. other
    pages of the corpus) and excluding the page's own name.
    """
    href_pattern = re.compile(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"")

    # First pass: collect every href per page, dropping self-links.
    raw_links = {}
    for entry in os.listdir(directory):
        if entry.endswith(".html"):
            with open(os.path.join(directory, entry)) as handle:
                html = handle.read()
            raw_links[entry] = set(href_pattern.findall(html)) - {entry}

    # Second pass: keep only targets that are pages of this corpus.
    return {
        name: {target for target in targets if target in raw_links}
        for name, targets in raw_links.items()
    }



In [119]:
# Crawl the corpus directory selected by corpus_index and display the page -> linked-pages mapping.
corpus = crawl(f'corpus{corpus_index}')
corpus

{'algorithms.html': {'programming.html', 'recursion.html'},
 'programming.html': {'c.html', 'python.html'},
 'recursion.html': set(),
 'python.html': {'ai.html', 'programming.html'},
 'ai.html': {'algorithms.html', 'inference.html'},
 'logic.html': {'inference.html'},
 'inference.html': {'ai.html'},
 'c.html': {'programming.html'}}

In [120]:
# A page with no outgoing links is treated as linking to every page in the
# corpus (itself included).
every_page = set(corpus)
for page, links in corpus.items():
    if not links:
        # Replacing the value of an existing key is safe while iterating.
        corpus[page] = every_page.copy()
corpus

{'algorithms.html': {'programming.html', 'recursion.html'},
 'programming.html': {'c.html', 'python.html'},
 'recursion.html': {'ai.html',
  'algorithms.html',
  'c.html',
  'inference.html',
  'logic.html',
  'programming.html',
  'python.html',
  'recursion.html'},
 'python.html': {'ai.html', 'programming.html'},
 'ai.html': {'algorithms.html', 'inference.html'},
 'logic.html': {'inference.html'},
 'inference.html': {'ai.html'},
 'c.html': {'programming.html'}}

In [121]:
# Initial distribution: the first state is drawn uniformly over all corpus pages.
uniform_prob = 1/len(corpus)
start = DiscreteDistribution(dict.fromkeys(corpus, uniform_prob))

start

{
    "class" :"Distribution",
    "dtype" :"str",
    "name" :"DiscreteDistribution",
    "parameters" :[
        {
            "algorithms.html" :0.125,
            "programming.html" :0.125,
            "recursion.html" :0.125,
            "python.html" :0.125,
            "ai.html" :0.125,
            "logic.html" :0.125,
            "inference.html" :0.125,
            "c.html" :0.125
        }
    ],
    "frozen" :false
}

In [122]:
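# NOTE: unused experiment, kept for reference. The Markov chain built below is
# constructed from `start` and `transitions` only and does not use these states.
# states = {page: State(DiscreteDistribution({'yes': 0.5, 'no': 0.5}), name=page)
#               for page in corpus}
# states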

In [123]:
# Transition model: one [current_page, next_page, probability] row for every
# ordered pair of pages in the corpus.

lc = len(corpus)
# Share that every page receives regardless of the current page's links.
teleport = (1-DAMPING)/lc

transition_list = []
for page, links in corpus.items():
    if not links:
        # No outgoing links: move to any page with equal probability.
        transition_list.extend([page, target, 1/lc] for target in corpus)
        continue

    # Share received by each page that the current page links to.
    follow = DAMPING/len(links)
    transition_list.extend(
        [page, target, (follow + teleport) if target in links else teleport]
        for target in corpus
    )

# Table conditioned on the previous page, whose distribution is `start`.
transitions = ConditionalProbabilityTable(transition_list, [start])


In [124]:
# Create the Markov chain from the initial page distribution (`start`) and the
# page-to-page conditional table (`transitions`).
model = MarkovChain([start, transitions])

In [125]:
# Draw a sequence of SAMPLES states from the chain and report each page's share
# of visits, most visited first.
# NOTE(review): no random seed is set in the visible cells, so the exact
# frequencies are expected to differ between runs.
visit_counts = Counter(model.sample(SAMPLES))
result = [(name, hits/SAMPLES) for name, hits in visit_counts.most_common()]
result

[('programming.html', 0.2319),
 ('ai.html', 0.1856),
 ('python.html', 0.1259),
 ('inference.html', 0.1255),
 ('c.html', 0.1254),
 ('algorithms.html', 0.1074),
 ('recursion.html', 0.0711),
 ('logic.html', 0.0272)]