<a href="https://colab.research.google.com/github/momo54/large_scale_data_management/blob/main/PageRank_an_PySPARK_SQL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=b1f220259879e64d30e05707df2b7665ade5888904fca399177cd15ac0a4c373
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
!pip install -q findspark
import findspark
findspark.init()

In [None]:
!wget -q https://storage.googleapis.com/public_lddm_data/small_page_links.nt
!ls

sample_data  small_page_links.nt


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Créez une session Spark
spark = SparkSession.builder.appName("PageRankExample").getOrCreate()


In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
schema = StructType([
    StructField("source", StringType(), nullable=True),
    StructField("predicate", StringType(), nullable=True),
    StructField("target", StringType(), nullable=True)
])

In [None]:
# Chargez vos données web en tant que DataFrame
# Supposons que vous ayez un DataFrame avec deux colonnes : 'source' et 'target' représentant les liens entre les pages web
# Par exemple, vous pouvez le charger à partir d'un fichier CSV
data = spark.read.option("delimiter"," ").csv("small_page_links.nt", header=False, schema=schema)
data.show(5,truncate=200)

+-------------------------------------------------------+--------------------------------------+-----------------------------------------------------------+
|                                                 source|                             predicate|                                                     target|
+-------------------------------------------------------+--------------------------------------+-----------------------------------------------------------+
|       <http://dbpedia.org/resource/AfghanistanHistory>|<http://dbpedia.org/property/wikilink>|       <http://dbpedia.org/resource/History_of_Afghanistan>|
|     <http://dbpedia.org/resource/AfghanistanGeography>|<http://dbpedia.org/property/wikilink>|     <http://dbpedia.org/resource/Geography_of_Afghanistan>|
|      <http://dbpedia.org/resource/AccessibleComputing>|<http://dbpedia.org/property/wikilink>|       <http://dbpedia.org/resource/Computer_accessibility>|
|        <http://dbpedia.org/resource/AfghanistanPeople>|<

In [None]:
# So we can really write SQL !!
data.createOrReplaceTempView("SPO")
result=spark.sql("select source from SPO")
result.show(5)

+--------------------+
|              source|
+--------------------+
|<http://dbpedia.o...|
|<http://dbpedia.o...|
|<http://dbpedia.o...|
|<http://dbpedia.o...|
|<http://dbpedia.o...|
+--------------------+
only showing top 5 rows



In [None]:
data.take(1)

[Row(source='<http://dbpedia.org/resource/AfghanistanHistory>', predicate='<http://dbpedia.org/property/wikilink>', target='<http://dbpedia.org/resource/History_of_Afghanistan>')]

In [None]:
# Créez un DataFrame contenant le nombre de liens sortants pour chaque page
outdegrees = data.groupBy("source").count().withColumnRenamed("source", "page").withColumnRenamed("count", "outDegree")

# Définissez le nombre d'itérations pour le calcul du PageRank
max_iterations = 1
damping_factor = 0.85

# Initialisation du PageRank en attribuant à chaque page une valeur de départ
initial_pagerank = 1.0

# Créez un DataFrame contenant les valeurs de PageRank initiales
pagerank = outdegrees.withColumn("pagerank", col("outDegree") / initial_pagerank)

pagerank.show(5)


+--------------------+---------+--------+
|                page|outDegree|pagerank|
+--------------------+---------+--------+
|<http://dbpedia.o...|      197|   197.0|
|<http://dbpedia.o...|        1|     1.0|
|<http://dbpedia.o...|       35|    35.0|
|<http://dbpedia.o...|        1|     1.0|
|<http://dbpedia.o...|        1|     1.0|
+--------------------+---------+--------+
only showing top 5 rows



In [None]:
# Rejoignez le DataFrame pagerank avec le DataFrame data pour calculer la contribution à partir des liens entrants
contrib = data.join(pagerank, data.target == pagerank.page, "left").select("source", "pagerank")

new_pagerank = contrib.groupBy("source").sum("pagerank").withColumnRenamed("source", "page").withColumnRenamed("sum(pagerank)", "pagerank")

# Joignez le DataFrame "new_pagerank" avec le DataFrame "outdegrees" pour obtenir les "outDegree" appropriés
pagerank = new_pagerank.join(outdegrees, new_pagerank.page == outdegrees.page, "left").select(new_pagerank.page, new_pagerank.pagerank, outdegrees.outDegree)

# Appliquez la formule du PageRank
pagerank = pagerank.withColumn("pagerank", (1 - damping_factor) + damping_factor * col("pagerank") / col("outDegree"))


In [None]:
 # Rejoignez le DataFrame pagerank avec le DataFrame data pour calculer la contribution à partir des liens entrants
contrib = data.join(pagerank, data.target == pagerank.page, "left").select("source", "pagerank")
contrib.show(2,truncate=100)

# Calculez le nouveau PageRank
pagerank = contrib.groupBy("source").sum("pagerank").withColumnRenamed("source", "page").withColumnRenamed("sum(pagerank)", "pagerank")
pagerank.show(2)

# Appliquez la formule du PageRank
pagerank = pagerank.withColumn("pagerank", (1 - damping_factor) + damping_factor * col("pagerank") /  pagerank["outDegree"])
pagerank.show(2)

+--------------------------------------------------+--------+
|                                            source|pagerank|
+--------------------------------------------------+--------+
|  <http://dbpedia.org/resource/AfghanistanHistory>|    NULL|
|<http://dbpedia.org/resource/AfghanistanGeography>|    NULL|
+--------------------------------------------------+--------+
only showing top 2 rows

+--------------------+---------+
|                page| pagerank|
+--------------------+---------+
|<http://dbpedia.o...|     NULL|
|<http://dbpedia.o...|3374891.0|
+--------------------+---------+
only showing top 2 rows



AnalysisException: ignored

In [None]:
# Effectuez des itérations pour calculer le PageRank
for iteration in range(max_iterations):
    # Rejoignez le DataFrame pagerank avec le DataFrame data pour calculer la contribution à partir des liens entrants
    contrib = data.join(pagerank, data.target == pagerank.page, "left").select("source", "pagerank")

    # Calculez le nouveau PageRank
    pagerank = contrib.groupBy("source").sum("pagerank").withColumnRenamed("source", "page").withColumnRenamed("sum(pagerank)", "pagerank")

    # Appliquez la formule du PageRank
    #pagerank = pagerank.withColumn("pagerank", (1 - damping_factor) + damping_factor * col("pagerank") / outdegrees["outDegree"])

# Affichez les résultats
pagerank.select("page", "pagerank").show()

# Arrêtez la session Spark
# spark.stop()

+--------------------+----------------+
|                page|        pagerank|
+--------------------+----------------+
|<http://dbpedia.o...|            NULL|
|<http://dbpedia.o...|1.34888295669E11|
|<http://dbpedia.o...|            NULL|
|<http://dbpedia.o...|     3.9950588E9|
|<http://dbpedia.o...|            NULL|
|<http://dbpedia.o...|            NULL|
|<http://dbpedia.o...|            NULL|
|<http://dbpedia.o...| 4.4257114758E11|
|<http://dbpedia.o...|            NULL|
|<http://dbpedia.o...|          3200.0|
|<http://dbpedia.o...|8.65115911539E11|
|<http://dbpedia.o...|3.19355551182E11|
|<http://dbpedia.o...|           103.0|
|<http://dbpedia.o...| 1.5766337839E10|
|<http://dbpedia.o...|2.67332593952E11|
|<http://dbpedia.o...|            NULL|
|<http://dbpedia.o...| 9.1540640697E10|
|<http://dbpedia.o...|9.63015028858E11|
|<http://dbpedia.o...|            NULL|
|<http://dbpedia.o...|            NULL|
+--------------------+----------------+
only showing top 20 rows

