<a href="https://colab.research.google.com/github/momo54/large_scale_data_management/blob/main/PyPageRank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=4264f7c94955c95b11877f5c12afc043f3d29f8c3f3d8e97daf283c85a735c95
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
!pip install -q findspark
import findspark
findspark.init()

# SPARK INSTALLED... lets play

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [4]:
!wget -q https://storage.googleapis.com/public_lddm_data/small_page_links.nt
!ls

sample_data  small_page_links.nt


In [7]:
lines = spark.read.text("small_page_links.nt").rdd.map(lambda r: r[0])
lines.take(5)

['<http://dbpedia.org/resource/AfghanistanHistory> <http://dbpedia.org/property/wikilink> <http://dbpedia.org/resource/History_of_Afghanistan> .',
 '<http://dbpedia.org/resource/AfghanistanGeography> <http://dbpedia.org/property/wikilink> <http://dbpedia.org/resource/Geography_of_Afghanistan> .',
 '<http://dbpedia.org/resource/AccessibleComputing> <http://dbpedia.org/property/wikilink> <http://dbpedia.org/resource/Computer_accessibility> .',
 '<http://dbpedia.org/resource/AfghanistanPeople> <http://dbpedia.org/property/wikilink> <http://dbpedia.org/resource/Demography_of_Afghanistan> .',
 '<http://dbpedia.org/resource/AfghanistanCommunications> <http://dbpedia.org/property/wikilink> <http://dbpedia.org/resource/Communications_in_Afghanistan> .']

In [9]:
import re
def computeContribs(urls, rank) :
    """Calculates URL contributions to the rank of other URLs."""
    num_urls = len(urls)
    for url in urls:
        yield (url, rank / num_urls)


def parseNeighbors(urls) :
    """Parses a urls pair string into urls pair."""
    parts = re.split(r'\s+', urls)
    return parts[0], parts[2]

In [11]:
# Loads all URLs from input file and initialize their neighbors.
links = lines.map(lambda urls: parseNeighbors(urls)).distinct().groupByKey().cache()

# Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
ranks = links.map(lambda url_neighbors: (url_neighbors[0], 1.0))


In [12]:
links.take(5)

[('<http://dbpedia.org/resource/AfghanistanHistory>',
  <pyspark.resultiterable.ResultIterable at 0x7eddc267c100>),
 ('<http://dbpedia.org/resource/AfghanistanGeography>',
  <pyspark.resultiterable.ResultIterable at 0x7eddc267f820>),
 ('<http://dbpedia.org/resource/AccessibleComputing>',
  <pyspark.resultiterable.ResultIterable at 0x7eddc267db70>),
 ('<http://dbpedia.org/resource/AfghanistanPeople>',
  <pyspark.resultiterable.ResultIterable at 0x7eddc267cc70>),
 ('<http://dbpedia.org/resource/AfghanistanCommunications>',
  <pyspark.resultiterable.ResultIterable at 0x7eddc267e1d0>)]

In [26]:
#groupByKey makes lists !!
links.map(lambda x: (x[0],list(x[1]))).take(5)

[('<http://dbpedia.org/resource/AfghanistanHistory>',
  ['<http://dbpedia.org/resource/History_of_Afghanistan>']),
 ('<http://dbpedia.org/resource/AfghanistanGeography>',
  ['<http://dbpedia.org/resource/Geography_of_Afghanistan>']),
 ('<http://dbpedia.org/resource/AccessibleComputing>',
  ['<http://dbpedia.org/resource/Computer_accessibility>']),
 ('<http://dbpedia.org/resource/AfghanistanPeople>',
  ['<http://dbpedia.org/resource/Demography_of_Afghanistan>']),
 ('<http://dbpedia.org/resource/AfghanistanCommunications>',
  ['<http://dbpedia.org/resource/Communications_in_Afghanistan>'])]

In [34]:
#groupByKey makes lists !!
links.map(lambda x: (x[0],len(list(x[1])))).sortBy(lambda x:x[1],ascending=False).take(10)

[('<http://dbpedia.org/resource/Academy_Award_for_Best_Art_Direction>', 1105),
 ('<http://dbpedia.org/resource/August_15>', 678),
 ('<http://dbpedia.org/resource/Afghanistan>', 642),
 ('<http://dbpedia.org/resource/Azerbaijan>', 642),
 ('<http://dbpedia.org/resource/Austria>', 638),
 ('<http://dbpedia.org/resource/Amsterdam>', 632),
 ('<http://dbpedia.org/resource/April_1>', 632),
 ('<http://dbpedia.org/resource/August_24>', 628),
 ('<http://dbpedia.org/resource/August_1>', 627),
 ('<http://dbpedia.org/resource/August_19>', 610)]

In [13]:
ranks.take(5)

[('<http://dbpedia.org/resource/AfghanistanHistory>', 1.0),
 ('<http://dbpedia.org/resource/AfghanistanGeography>', 1.0),
 ('<http://dbpedia.org/resource/AccessibleComputing>', 1.0),
 ('<http://dbpedia.org/resource/AfghanistanPeople>', 1.0),
 ('<http://dbpedia.org/resource/AfghanistanCommunications>', 1.0)]

In [28]:
links.join(ranks).take(5)

[('<http://dbpedia.org/resource/Anarchism>',
  (<pyspark.resultiterable.ResultIterable at 0x7edda1525270>,
   0.16135928302772604)),
 ('<http://dbpedia.org/resource/Autism>',
  (<pyspark.resultiterable.ResultIterable at 0x7edda15252d0>,
   0.15942836879432623)),
 ('<http://dbpedia.org/resource/An_American_in_Paris>',
  (<pyspark.resultiterable.ResultIterable at 0x7edda1525240>, 1.0)),
 ('<http://dbpedia.org/resource/Academy_Award_for_Best_Art_Direction>',
  (<pyspark.resultiterable.ResultIterable at 0x7edda1525210>,
   0.15188470066518847)),
 ('<http://dbpedia.org/resource/Academy_Award>',
  (<pyspark.resultiterable.ResultIterable at 0x7edda1525330>,
   1.0306257530637937))]

In [29]:
links.join(ranks).flatMap(lambda url_urls_rank: computeContribs(
            url_urls_rank[1][0], url_urls_rank[1][1]  # type: ignore[arg-type]
        )).take(5)

[('<http://dbpedia.org/resource/Diq:Anar%C5%9Fizm>', 0.0004494687549518831),
 ('<http://dbpedia.org/resource/War:Anarkismo>', 0.0004494687549518831),
 ('<http://dbpedia.org/resource/Fiu-vro:Anarkism>', 0.0004494687549518831),
 ('<http://dbpedia.org/resource/Sco:Anarchism>', 0.0004494687549518831),
 ('<http://dbpedia.org/resource/Sah:%D0%90%D0%BD%D0%B0%D1%80%D1%85%D0%B8%D0%B7%D0%BC>',
  0.0004494687549518831)]

In [15]:
from operator import add
for iteration in range(1):
  # Calculates URL contributions to the rank of other URLs.
  contribs = links.join(ranks).flatMap(lambda url_urls_rank: computeContribs(
            url_urls_rank[1][0], url_urls_rank[1][1]  # type: ignore[arg-type]
        ))

  # Re-calculates URL ranks based on neighbor contributions.
  ranks = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15)

    # Collects all URL ranks and dump them to console.
for (link, rank) in ranks.collect():
  print("%s has rank: %s." % (link, rank))


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
<http://dbpedia.org/resource/Sedimentary> has rank: 0.16164383561643836.
<http://dbpedia.org/resource/Geological> has rank: 0.16164383561643836.
<http://dbpedia.org/resource/Elasticity_%28physics%29> has rank: 0.16164383561643836.
<http://dbpedia.org/resource/Optical_axis> has rank: 0.16164383561643836.
<http://dbpedia.org/resource/Crystal_optics> has rank: 0.16164383561643836.
<http://dbpedia.org/resource/Birefringence> has rank: 0.16164383561643836.
<http://dbpedia.org/resource/Liquid_crystals> has rank: 0.16164383561643836.
<http://dbpedia.org/resource/Magnetic_anisotropy> has rank: 0.16164383561643836.
<http://dbpedia.org/resource/Physicists> has rank: 0.16164383561643836.
<http://dbpedia.org/resource/Cosmic_microwave_background_radiation> has rank: 0.17014383561643837.
<http://dbpedia.org/resource/Proximal> has rank: 0.16164383561643836.
<http://dbpedia.org/resource/Filter_%28chemistry%29> 