<a href="https://colab.research.google.com/github/momo54/large_scale_data_management/blob/main/PageRank_an_PySPARK_SQL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
!pip install pyspark



In [36]:
!pip install -q findspark
import findspark
findspark.init()

In [37]:
!wget -q https://storage.googleapis.com/public_lddm_data/small_page_links.nt
!ls

sample_data  small_page_links.nt  small_page_links.nt.1


In [38]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Créez une session Spark
spark = SparkSession.builder.appName("PageRankExample").getOrCreate()


In [39]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
schema = StructType([
    StructField("source", StringType(), nullable=True),
    StructField("predicate", StringType(), nullable=True),
    StructField("target", StringType(), nullable=True)
])

In [40]:
# Chargez vos données web en tant que DataFrame
# Supposons que vous ayez un DataFrame avec deux colonnes : 'source' et 'target' représentant les liens entre les pages web
# Par exemple, vous pouvez le charger à partir d'un fichier CSV
data = spark.read.option("delimiter"," ").csv("small_page_links.nt", header=False, schema=schema)
data.show(5,truncate=200)

+-------------------------------------------------------+--------------------------------------+-----------------------------------------------------------+
|                                                 source|                             predicate|                                                     target|
+-------------------------------------------------------+--------------------------------------+-----------------------------------------------------------+
|       <http://dbpedia.org/resource/AfghanistanHistory>|<http://dbpedia.org/property/wikilink>|       <http://dbpedia.org/resource/History_of_Afghanistan>|
|     <http://dbpedia.org/resource/AfghanistanGeography>|<http://dbpedia.org/property/wikilink>|     <http://dbpedia.org/resource/Geography_of_Afghanistan>|
|      <http://dbpedia.org/resource/AccessibleComputing>|<http://dbpedia.org/property/wikilink>|       <http://dbpedia.org/resource/Computer_accessibility>|
|        <http://dbpedia.org/resource/AfghanistanPeople>|<

In [41]:
# So we can really write SQL !!
data.createOrReplaceTempView("SPO")
result=spark.sql("select source from SPO")
result.show(5)

+--------------------+
|              source|
+--------------------+
|<http://dbpedia.o...|
|<http://dbpedia.o...|
|<http://dbpedia.o...|
|<http://dbpedia.o...|
|<http://dbpedia.o...|
+--------------------+
only showing top 5 rows



In [None]:
data.take(1)

[Row(source='<http://dbpedia.org/resource/AfghanistanHistory>', predicate='<http://dbpedia.org/property/wikilink>', target='<http://dbpedia.org/resource/History_of_Afghanistan>')]

In [50]:
# Créez un DataFrame contenant le nombre de liens sortants pour chaque page
outdegrees = data.groupBy("source").count().withColumnRenamed("source", "page").withColumnRenamed("count", "outDegree")

# Définissez le nombre d'itérations pour le calcul du PageRank
max_iterations = 3
damping_factor = 0.85

# Initialisation du PageRank en attribuant à chaque page une valeur de départ
initial_pagerank = 1.0

# Créez un DataFrame contenant les valeurs de PageRank initiales
pagerank = outdegrees.withColumn("pagerank", col("outDegree") / initial_pagerank)

pagerank.show(5,truncate=100)


+-----------------------------------------------------+---------+--------+
|                                                 page|outDegree|pagerank|
+-----------------------------------------------------+---------+--------+
|         <http://dbpedia.org/resource/Actinopterygii>|      197|   197.0|
|<http://dbpedia.org/resource/AtlasShruggedCharacters>|        1|     1.0|
|             <http://dbpedia.org/resource/Allegiance>|       35|    35.0|
|                  <http://dbpedia.org/resource/AbboT>|        1|     1.0|
|        <http://dbpedia.org/resource/AcademicElitism>|        1|     1.0|
+-----------------------------------------------------+---------+--------+
only showing top 5 rows



In [49]:
# Rejoignez le DataFrame pagerank avec le DataFrame data pour calculer la contribution à partir des liens entrants
contrib = data.join(pagerank, data.target == pagerank.page, "left").select("source", "pagerank")

new_pagerank = contrib.groupBy("source").sum("pagerank").withColumnRenamed("source", "page").withColumnRenamed("sum(pagerank)", "pagerank")

# Joignez le DataFrame "new_pagerank" avec le DataFrame "outdegrees" pour obtenir les "outDegree" appropriés
pagerank = new_pagerank.join(outdegrees, new_pagerank.page == outdegrees.page, "left").select(new_pagerank.page, new_pagerank.pagerank, outdegrees.outDegree)

# Appliquez la formule du PageRank
pagerank = pagerank.withColumn("pagerank", (1 - damping_factor) + damping_factor * col("pagerank") / col("outDegree"))
pagerank.show(5)

+--------------------+-----------------+---------+
|                page|         pagerank|outDegree|
+--------------------+-----------------+---------+
|<http://dbpedia.o...|             NULL|      197|
|<http://dbpedia.o...|39973.37172131147|        1|
|<http://dbpedia.o...|             NULL|       35|
|<http://dbpedia.o...|810.6272849462365|        1|
|<http://dbpedia.o...|             NULL|        1|
+--------------------+-----------------+---------+
only showing top 5 rows



In [47]:
# quand le chat dit des bétises...
 # Rejoignez le DataFrame pagerank avec le DataFrame data pour calculer la contribution à partir des liens entrants
contrib = data.join(pagerank, data.target == pagerank.page, "left").select("source", "pagerank")
contrib.show(2,truncate=100)

# Calculez le nouveau PageRank
pagerank = contrib.groupBy("source").sum("pagerank").withColumnRenamed("source", "page").withColumnRenamed("sum(pagerank)", "pagerank")
pagerank.show(2)

# Appliquez la formule du PageRank
pagerank = pagerank.withColumn("pagerank", (1 - damping_factor) + damping_factor * col("pagerank") /  pagerank["outDegree"])
pagerank.show(2)

+--------------------------------------------------+--------+
|                                            source|pagerank|
+--------------------------------------------------+--------+
|  <http://dbpedia.org/resource/AfghanistanHistory>|    NULL|
|<http://dbpedia.org/resource/AfghanistanGeography>|    NULL|
+--------------------------------------------------+--------+
only showing top 2 rows

+--------------------+--------+
|                page|pagerank|
+--------------------+--------+
|<http://dbpedia.o...|    NULL|
|<http://dbpedia.o...| 50594.0|
+--------------------+--------+
only showing top 2 rows



AnalysisException: ignored

In [53]:
import time
max_iterations = 10
debut = time.time()

# Effectuez des itérations pour calculer le PageRank

for iteration in range(max_iterations):
  # Rejoignez le DataFrame pagerank avec le DataFrame data pour calculer la contribution à partir des liens entrants
  contrib = data.join(pagerank, data.target == pagerank.page, "left").select("source", "pagerank")

  new_pagerank = contrib.groupBy("source").sum("pagerank").withColumnRenamed("source", "page").withColumnRenamed("sum(pagerank)", "pagerank")

  # Joignez le DataFrame "new_pagerank" avec le DataFrame "outdegrees" pour obtenir les "outDegree" appropriés
  pagerank = new_pagerank.join(outdegrees, new_pagerank.page == outdegrees.page, "left").select(new_pagerank.page, new_pagerank.pagerank, outdegrees.outDegree)

  # Appliquez la formule du PageRank
  pagerank = pagerank.withColumn("pagerank", (1 - damping_factor) + damping_factor * col("pagerank") / col("outDegree"))


# Affichez les résultats
pagerank.select("page", "pagerank").show()
fin = time.time()
print(f"Temps d'exécution : {fin-debut} secondes")
# Arrêtez la session Spark
# spark.stop()

+--------------------+-------------------+
|                page|           pagerank|
+--------------------+-------------------+
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|0.28111708845019723|
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|0.27989073867732917|
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|0.15577653203305658|
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|0.15589785108855592|
|<http://dbpedia.o...|0.15595562103322655|
|<http://dbpedia.o...|0.15521238381856947|
|<http://dbpedia.o...| 0.1512481644640235|
|<http://dbpedia.o...|0.15288306779854458|
|<http://dbpedia.o...|0.15363072921406307|
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|0.15071076181142143|
|<http://dbpedia.o...|0.15159132688549384|
|<http://dbpedia.o...|               NULL|
|<http://dbpedia.o...|               NULL|
+----------