In [31]:
from pyspark.sql import SparkSession

In [32]:
# Bare expression: echoes the imported SparkSession class to confirm the import resolved.
SparkSession

pyspark.sql.session.SparkSession

In [33]:
# Spark initialisation: make the local Spark installation importable, then
# open (or reuse) a session against the standalone cluster master.
import findspark
import json
import os

findspark.init()  # adds the Spark installation's Python libraries to sys.path

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("NotebookTest")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

print("Session Spark connectée :", spark.version)


Session Spark connectée : 3.5.7


In [34]:
# Load the raw export from HDFS; the first CSV line holds the column names.
# No schema is given, so every column is read as a string at this stage.
base_nosql = (
    spark.read
    .option("header", "true")
    .csv("hdfs://namenode:8020/data_collection/base_nosql.csv")
)

                                                                                

In [35]:
# Preview the loaded rows to check the CSV was parsed as expected.
base_nosql.show()

[Stage 3:>                                                          (0 + 1) / 1]

+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+
| nom| prenom|age|sexe|      ville|     maladie|             hopital|date_consultation|   medecin|
+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+
|Diop|    Awa| 32|   F|      Dakar|     Diabète|   Hôpital Principal|       2025-10-10|Dr. Ndiaye|
|Fall|Mamadou| 45|   M|      Thiès|Hypertension|Hôpital Régional ...|       2025-09-22|Dr. Diallo|
| Sow|  Fatou| 28|   F|Saint-Louis|   Paludisme|Centre de Santé d...|       2025-10-02|    Dr. Sy|
|Diop|    Awa| 32|   F|      Dakar|     Diabète|   Hôpital Principal|       2025-10-10|Dr. Ndiaye|
|Fall|Mamadou| 45|   M|      Thiès|Hypertension|Hôpital Régional ...|       2025-09-22|Dr. Diallo|
| Sow|  Fatou| 28|   F|Saint-Louis|   Paludisme|Centre de Santé d...|       2025-10-02|    Dr. Sy|
+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+



                                                                                

In [36]:
# Inspect the column names and the types Spark assigned when reading the CSV.
base_nosql.printSchema()



root
 |-- nom: string (nullable = true)
 |-- prenom: string (nullable = true)
 |-- age: string (nullable = true)
 |-- sexe: string (nullable = true)
 |-- ville: string (nullable = true)
 |-- maladie: string (nullable = true)
 |-- hopital: string (nullable = true)
 |-- date_consultation: string (nullable = true)
 |-- medecin: string (nullable = true)



In [37]:
from pyspark.sql.functions import col, to_date
from pyspark.sql.types import IntegerType

# Give the string columns their real types in one chained expression:
#   age               -> integer
#   date_consultation -> date, parsed with the yyyy-MM-dd pattern
base_nosql = (
    base_nosql
    .withColumn("age", col("age").cast(IntegerType()))
    .withColumn("date_consultation", to_date(col("date_consultation"), "yyyy-MM-dd"))
)

base_nosql.printSchema()

root
 |-- nom: string (nullable = true)
 |-- prenom: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- sexe: string (nullable = true)
 |-- ville: string (nullable = true)
 |-- maladie: string (nullable = true)
 |-- hopital: string (nullable = true)
 |-- date_consultation: date (nullable = true)
 |-- medecin: string (nullable = true)



In [38]:
# Count the total number of rows (before de-duplication).
base_nosql.count()

6

In [39]:
# Keep a single copy of rows that are identical across every column, then preview.
# NOTE(review): the comparison is exact and runs before the whitespace trimming
# step, so rows differing only by stray spaces would not be merged here.
base_nosql = base_nosql.distinct()
base_nosql.show()



+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+
| nom| prenom|age|sexe|      ville|     maladie|             hopital|date_consultation|   medecin|
+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+
|Diop|    Awa| 32|   F|      Dakar|     Diabète|   Hôpital Principal|       2025-10-10|Dr. Ndiaye|
| Sow|  Fatou| 28|   F|Saint-Louis|   Paludisme|Centre de Santé d...|       2025-10-02|    Dr. Sy|
|Fall|Mamadou| 45|   M|      Thiès|Hypertension|Hôpital Régional ...|       2025-09-22|Dr. Diallo|
+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+



In [40]:
# Strip leading/trailing whitespace from the text columns only.
# trim() always yields a string, so applying it to every column would silently
# turn the integer `age` and date `date_consultation` columns (cast in an earlier
# cell) back into strings. Non-string columns are therefore passed through as-is.
# A single select() is used instead of a withColumn() loop to keep the plan small.
from pyspark.sql.functions import col, trim

base_nosql = base_nosql.select(
    [
        trim(col(name)).alias(name) if dtype == "string" else col(name)
        for name, dtype in base_nosql.dtypes
    ]
)
base_nosql.show()


+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+
| nom| prenom|age|sexe|      ville|     maladie|             hopital|date_consultation|   medecin|
+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+
|Diop|    Awa| 32|   F|      Dakar|     Diabète|   Hôpital Principal|       2025-10-10|Dr. Ndiaye|
| Sow|  Fatou| 28|   F|Saint-Louis|   Paludisme|Centre de Santé d...|       2025-10-02|    Dr. Sy|
|Fall|Mamadou| 45|   M|      Thiès|Hypertension|Hôpital Régional ...|       2025-09-22|Dr. Diallo|
+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+



In [41]:
# Repeat of the previous trimming cell (trim is idempotent, so re-running is harmless).
# Only string-typed columns are trimmed: trim() always yields a string, so applying
# it to every column would convert typed columns such as `age` (integer) and
# `date_consultation` (date) back into strings.
from pyspark.sql.functions import col, trim

base_nosql = base_nosql.select(
    [
        trim(col(name)).alias(name) if dtype == "string" else col(name)
        for name, dtype in base_nosql.dtypes
    ]
)
base_nosql.show()


+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+
| nom| prenom|age|sexe|      ville|     maladie|             hopital|date_consultation|   medecin|
+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+
|Diop|    Awa| 32|   F|      Dakar|     Diabète|   Hôpital Principal|       2025-10-10|Dr. Ndiaye|
| Sow|  Fatou| 28|   F|Saint-Louis|   Paludisme|Centre de Santé d...|       2025-10-02|    Dr. Sy|
|Fall|Mamadou| 45|   M|      Thiès|Hypertension|Hôpital Régional ...|       2025-09-22|Dr. Diallo|
+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+



In [42]:
# Title-case the text columns (nom, prenom, ville, maladie, hopital, medecin, sexe).
# NOTE(review): initcap() upper-cases the first letter of each space-separated word
# and lower-cases the rest, so "Saint-Louis" becomes "Saint-louis" and
# "Centre de Santé" becomes "Centre De Santé" (see the output below) - confirm
# this normalisation is intended.
from pyspark.sql.functions import initcap

cols_to_format = ["nom", "prenom", "ville", "maladie", "hopital", "medecin", "sexe"]

base_nosql = base_nosql.withColumns(
    {name: initcap(col(name)) for name in cols_to_format}
)

base_nosql.show()


+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+
| nom| prenom|age|sexe|      ville|     maladie|             hopital|date_consultation|   medecin|
+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+
|Diop|    Awa| 32|   F|      Dakar|     Diabète|   Hôpital Principal|       2025-10-10|Dr. Ndiaye|
| Sow|  Fatou| 28|   F|Saint-louis|   Paludisme|Centre De Santé D...|       2025-10-02|    Dr. Sy|
|Fall|Mamadou| 45|   M|      Thiès|Hypertension|Hôpital Régional ...|       2025-09-22|Dr. Diallo|
+----+-------+---+----+-----------+------------+--------------------+-----------------+----------+



In [43]:
# Load the SQL export from HDFS (header row = column names, all columns read
# as strings) and preview it.
base_sql = (
    spark.read
    .option("header", "true")
    .csv("hdfs://namenode:8020/data_collection/base_sql.csv")
)
base_sql.show()

+---+-----------------+---+----+---------------+-----------------+
| id|              nom|age|sexe|        maladie|date_consultation|
+---+-----------------+---+----+---------------+-----------------+
|  1|Catherine Michaud| 73|   F|         Asthme|       2024-10-28|
|  2|  Richard Thierry| 84|   M|    Tuberculose|       2024-07-25|
|  3|    Aurore Brunet| 26|   F|            VIH|       2024-09-10|
|  4|   Tristan Renaud| 31|   F|         Asthme|       2025-11-02|
|  5|     Marine Neveu| 38|   F|            VIH|       2025-04-29|
|  6| Marthe Letellier| 60|   M|        Diabète|       2024-04-22|
|  7|    Astrid Perret|  4|   F|Gastro-entérite|       2025-05-02|
|  8|   Marcelle Costa| 40|   F|       Covid-19|       2024-03-20|
|  9|   Nathalie David| 34|   F|        Choléra|       2024-08-27|
| 10|    Victor Menard| 55|   M|    Tuberculose|       2025-05-09|
| 11|  Édouard Vasseur| 61|   M|   Hypertension|       2024-01-01|
| 12|       David Blot| 38|   F|         Grippe|       2025-01

In [44]:
# Inspect the column names and the types Spark assigned when reading the CSV.
base_sql.printSchema()

root
 |-- id: string (nullable = true)
 |-- nom: string (nullable = true)
 |-- age: string (nullable = true)
 |-- sexe: string (nullable = true)
 |-- maladie: string (nullable = true)
 |-- date_consultation: string (nullable = true)



In [45]:
from pyspark.sql.functions import col, to_date
from pyspark.sql.types import IntegerType

# Give the string columns their real types in one chained expression:
#   age, id           -> integer
#   date_consultation -> date, parsed with the yyyy-MM-dd pattern
base_sql = (
    base_sql
    .withColumn("age", col("age").cast(IntegerType()))
    .withColumn("id", col("id").cast(IntegerType()))
    .withColumn("date_consultation", to_date(col("date_consultation"), "yyyy-MM-dd"))
)

base_sql.printSchema()

root
 |-- id: integer (nullable = true)
 |-- nom: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- sexe: string (nullable = true)
 |-- maladie: string (nullable = true)
 |-- date_consultation: date (nullable = true)



In [46]:
# Count the total number of rows (before de-duplication).
base_sql.count()

200

In [47]:
# Keep a single copy of rows that are identical across every column, then preview.
# NOTE(review): `id` takes part in the comparison, so two records that differ
# only by id are not treated as duplicates.
base_sql = base_sql.distinct()
base_sql.show()

+---+-----------------+---+----+---------------+-----------------+
| id|              nom|age|sexe|        maladie|date_consultation|
+---+-----------------+---+----+---------------+-----------------+
|172|      Pierre Blin| 12|   M|        Diabète|       2025-05-06|
| 51|      René Briand| 62|   M|        Diabète|       2024-11-18|
| 88|  Astrid Delannoy| 79|   F|        Choléra|       2024-06-14|
|190|     Xavier Caron| 77|   F|        Diabète|       2025-04-15|
|168| Pénélope Guillot| 62|   F|        Diabète|       2024-05-25|
|174| Matthieu Regnier|  7|   F|Gastro-entérite|       2024-08-04|
|123|      Jean Fleury|  7|   F|         Anémie|       2024-09-27|
|  5|     Marine Neveu| 38|   F|            VIH|       2025-04-29|
|120| Juliette Michaud| 65|   M|         Grippe|       2025-03-28|
|  1|Catherine Michaud| 73|   F|         Asthme|       2024-10-28|
| 20|   Gilles Bonneau| 37|   F|        Diabète|       2025-01-24|
|160|    Andrée Briand| 55|   F|      Paludisme|       2024-12

In [48]:
# Row count after de-duplication, to compare with the count taken before it.
base_sql.count()

200

In [49]:
# Strip leading/trailing whitespace from the text columns only.
# trim() always yields a string, so applying it to every column would silently
# turn the integer `id`/`age` and date `date_consultation` columns (cast in an
# earlier cell) back into strings. Non-string columns are passed through as-is.
# A single select() is used instead of a withColumn() loop to keep the plan small.
from pyspark.sql.functions import col, trim

base_sql = base_sql.select(
    [
        trim(col(name)).alias(name) if dtype == "string" else col(name)
        for name, dtype in base_sql.dtypes
    ]
)
base_sql.show()


+---+-----------------+---+----+---------------+-----------------+
| id|              nom|age|sexe|        maladie|date_consultation|
+---+-----------------+---+----+---------------+-----------------+
|172|      Pierre Blin| 12|   M|        Diabète|       2025-05-06|
| 51|      René Briand| 62|   M|        Diabète|       2024-11-18|
| 88|  Astrid Delannoy| 79|   F|        Choléra|       2024-06-14|
|190|     Xavier Caron| 77|   F|        Diabète|       2025-04-15|
|168| Pénélope Guillot| 62|   F|        Diabète|       2024-05-25|
|174| Matthieu Regnier|  7|   F|Gastro-entérite|       2024-08-04|
|123|      Jean Fleury|  7|   F|         Anémie|       2024-09-27|
|  5|     Marine Neveu| 38|   F|            VIH|       2025-04-29|
|120| Juliette Michaud| 65|   M|         Grippe|       2025-03-28|
|  1|Catherine Michaud| 73|   F|         Asthme|       2024-10-28|
| 20|   Gilles Bonneau| 37|   F|        Diabète|       2025-01-24|
|160|    Andrée Briand| 55|   F|      Paludisme|       2024-12

In [50]:
# Title-case the text columns of base_sql (nom, maladie, sexe).
# NOTE(review): initcap() lower-cases everything after the first letter of each
# word, so the acronym "VIH" becomes "Vih" (see the output below) - confirm
# this normalisation is intended.
from pyspark.sql.functions import initcap

cols_to_format = ["nom", "maladie", "sexe"]

base_sql = base_sql.withColumns(
    {name: initcap(col(name)) for name in cols_to_format}
)

base_sql.show()

+---+-----------------+---+----+---------------+-----------------+
| id|              nom|age|sexe|        maladie|date_consultation|
+---+-----------------+---+----+---------------+-----------------+
|172|      Pierre Blin| 12|   M|        Diabète|       2025-05-06|
| 51|      René Briand| 62|   M|        Diabète|       2024-11-18|
| 88|  Astrid Delannoy| 79|   F|        Choléra|       2024-06-14|
|190|     Xavier Caron| 77|   F|        Diabète|       2025-04-15|
|168| Pénélope Guillot| 62|   F|        Diabète|       2024-05-25|
|174| Matthieu Regnier|  7|   F|Gastro-entérite|       2024-08-04|
|123|      Jean Fleury|  7|   F|         Anémie|       2024-09-27|
|  5|     Marine Neveu| 38|   F|            Vih|       2025-04-29|
|120| Juliette Michaud| 65|   M|         Grippe|       2025-03-28|
|  1|Catherine Michaud| 73|   F|         Asthme|       2024-10-28|
| 20|   Gilles Bonneau| 37|   F|        Diabète|       2025-01-24|
|160|    Andrée Briand| 55|   F|      Paludisme|       2024-12

In [51]:
# Load the health-records CSV from HDFS (header row = column names, all columns
# read as strings) and preview it.
donnee_sante = (
    spark.read
    .option("header", "true")
    .csv("hdfs://namenode:8020/data_collection/donnee_sante.csv")
)
donnee_sante.show()

+---+-----------+--------------------+----+---+------------+-----------------+-------------+-----------------+
| id|     region|             hopital|sexe|age|     maladie|date_consultation|   traitement|cout_consultation|
+---+-----------+--------------------+----+---+------------+-----------------+-------------+-----------------+
|  1| Ziguinchor|Hôpital Aristide ...|   M|  5|     Diabète|       2024-04-22|     Quinimax|            13727|
|  2|      Dakar|   Hôpital Principal|   F| 12|Hypertension|       2024-08-28|     Quinimax|             2920|
|  3|Tambacounda|Hôpital de Ziguin...|   M| 85|   Paludisme|       2024-01-08|  Paracétamol|             4863|
|  4|      Dakar|Hôpital Aristide ...|   F| 27|   Paludisme|       2024-02-07|Antibiotiques|            17624|
|  5|      Thiès|    Hôpital Régional|   M| 40| Tuberculose|       2024-08-21|Antibiotiques|             7540|
|  6|      Dakar|Centre de Santé T...|   M| 21|Hypertension|       2024-02-04|     Quinimax|             9731|
|

In [52]:
# Inspect the column names and the types Spark assigned when reading the CSV.
donnee_sante.printSchema()

root
 |-- id: string (nullable = true)
 |-- region: string (nullable = true)
 |-- hopital: string (nullable = true)
 |-- sexe: string (nullable = true)
 |-- age: string (nullable = true)
 |-- maladie: string (nullable = true)
 |-- date_consultation: string (nullable = true)
 |-- traitement: string (nullable = true)
 |-- cout_consultation: string (nullable = true)



In [53]:
from pyspark.sql.functions import col, to_date
from pyspark.sql.types import IntegerType

# Give the string columns their real types in one chained expression:
#   age, id, cout_consultation -> integer
#   date_consultation          -> date, parsed with the yyyy-MM-dd pattern
donnee_sante = (
    donnee_sante
    .withColumn("age", col("age").cast(IntegerType()))
    .withColumn("id", col("id").cast(IntegerType()))
    .withColumn("cout_consultation", col("cout_consultation").cast(IntegerType()))
    .withColumn("date_consultation", to_date(col("date_consultation"), "yyyy-MM-dd"))
)

donnee_sante.printSchema()

root
 |-- id: integer (nullable = true)
 |-- region: string (nullable = true)
 |-- hopital: string (nullable = true)
 |-- sexe: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- maladie: string (nullable = true)
 |-- date_consultation: date (nullable = true)
 |-- traitement: string (nullable = true)
 |-- cout_consultation: integer (nullable = true)



In [54]:
# Keep a single copy of rows that are identical across every column, then preview.
# NOTE(review): `id` takes part in the comparison, so two records that differ
# only by id are not treated as duplicates.
donnee_sante = donnee_sante.distinct()
donnee_sante.show()

+---+-----------+--------------------+----+---+------------+-----------------+-------------+-----------------+
| id|     region|             hopital|sexe|age|     maladie|date_consultation|   traitement|cout_consultation|
+---+-----------+--------------------+----+---+------------+-----------------+-------------+-----------------+
| 92|Saint-Louis|   Hôpital Principal|   F| 16|      Grippe|       2024-10-01|  Paracétamol|            12485|
| 97|      Dakar|Hôpital Aristide ...|   M| 46|     Diabète|       2024-09-24|        Repos|            17703|
| 22|    Kaolack|    Hôpital Régional|   M| 30|Hypertension|       2024-07-04|     Quinimax|             3894|
| 90|    Kaolack|    Hôpital Régional|   F| 11|    Covid-19|       2024-09-10|  Chloroquine|            16685|
|  6|      Dakar|Centre de Santé T...|   M| 21|Hypertension|       2024-02-04|     Quinimax|             9731|
| 59|      Dakar|   Hôpital Principal|   M|  8| Tuberculose|       2024-06-05|Antibiotiques|             4484|
|

In [55]:
# Row count after de-duplication.
donnee_sante.count()

100

In [56]:
# Strip leading/trailing whitespace from the text columns only.
# trim() always yields a string, so applying it to every column would silently
# turn the integer `id`/`age`/`cout_consultation` and date `date_consultation`
# columns (cast in an earlier cell) back into strings. Non-string columns are
# passed through as-is.
# A single select() is used instead of a withColumn() loop to keep the plan small.
from pyspark.sql.functions import col, trim

donnee_sante = donnee_sante.select(
    [
        trim(col(name)).alias(name) if dtype == "string" else col(name)
        for name, dtype in donnee_sante.dtypes
    ]
)
donnee_sante.show()

+---+-----------+--------------------+----+---+------------+-----------------+-------------+-----------------+
| id|     region|             hopital|sexe|age|     maladie|date_consultation|   traitement|cout_consultation|
+---+-----------+--------------------+----+---+------------+-----------------+-------------+-----------------+
| 92|Saint-Louis|   Hôpital Principal|   F| 16|      Grippe|       2024-10-01|  Paracétamol|            12485|
| 97|      Dakar|Hôpital Aristide ...|   M| 46|     Diabète|       2024-09-24|        Repos|            17703|
| 22|    Kaolack|    Hôpital Régional|   M| 30|Hypertension|       2024-07-04|     Quinimax|             3894|
| 90|    Kaolack|    Hôpital Régional|   F| 11|    Covid-19|       2024-09-10|  Chloroquine|            16685|
|  6|      Dakar|Centre de Santé T...|   M| 21|Hypertension|       2024-02-04|     Quinimax|             9731|
| 59|      Dakar|   Hôpital Principal|   M|  8| Tuberculose|       2024-06-05|Antibiotiques|             4484|
|

In [57]:
# Title-case the text columns of donnee_sante (region, hopital, sexe, maladie, traitement).
# NOTE(review): initcap() upper-cases the first letter of each space-separated word
# and lower-cases the rest, so "Saint-Louis" becomes "Saint-louis" and
# "Centre de Santé" becomes "Centre De Santé" (see the output below) - confirm
# this normalisation is intended.
from pyspark.sql.functions import initcap

cols_to_format = ["region", "hopital", "sexe", "maladie", "traitement"]

donnee_sante = donnee_sante.withColumns(
    {name: initcap(col(name)) for name in cols_to_format}
)

donnee_sante.show()

+---+-----------+--------------------+----+---+------------+-----------------+-------------+-----------------+
| id|     region|             hopital|sexe|age|     maladie|date_consultation|   traitement|cout_consultation|
+---+-----------+--------------------+----+---+------------+-----------------+-------------+-----------------+
| 92|Saint-louis|   Hôpital Principal|   F| 16|      Grippe|       2024-10-01|  Paracétamol|            12485|
| 97|      Dakar|Hôpital Aristide ...|   M| 46|     Diabète|       2024-09-24|        Repos|            17703|
| 22|    Kaolack|    Hôpital Régional|   M| 30|Hypertension|       2024-07-04|     Quinimax|             3894|
| 90|    Kaolack|    Hôpital Régional|   F| 11|    Covid-19|       2024-09-10|  Chloroquine|            16685|
|  6|      Dakar|Centre De Santé T...|   M| 21|Hypertension|       2024-02-04|     Quinimax|             9731|
| 59|      Dakar|   Hôpital Principal|   M|  8| Tuberculose|       2024-06-05|Antibiotiques|             4484|
|

In [58]:
# Load the API extract from HDFS (header row = column names, all columns read
# as strings) and preview it.
donnees_sanitaires_api = (
    spark.read
    .option("header", "true")
    .csv("hdfs://namenode:8020/data_collection/donnees_sanitaires_api.csv")
)
donnees_sanitaires_api.show()

+-------------+-------------------+--------------------+--------+----------+------+-----------+---------+--------------+-------+--------+------------------+-------------------+---------+------------------+----------+-----------------+----------------+-----------------+----------------+-------------------+----------------------+---------------------+
|      updated|            country|         countryInfo|   cases|todayCases|deaths|todayDeaths|recovered|todayRecovered| active|critical|casesPerOneMillion|deathsPerOneMillion|    tests|testsPerOneMillion|population|        continent|oneCasePerPeople|oneDeathPerPeople|oneTestPerPeople|activePerOneMillion|recoveredPerOneMillion|criticalPerOneMillion|
+-------------+-------------------+--------------------+--------+----------+------+-----------+---------+--------------+-------+--------+------------------+-------------------+---------+------------------+----------+-----------------+----------------+-----------------+----------------+----------

In [59]:
# Inspect the column names and the types Spark assigned when reading the CSV.
donnees_sanitaires_api.printSchema()

root
 |-- updated: string (nullable = true)
 |-- country: string (nullable = true)
 |-- countryInfo: string (nullable = true)
 |-- cases: string (nullable = true)
 |-- todayCases: string (nullable = true)
 |-- deaths: string (nullable = true)
 |-- todayDeaths: string (nullable = true)
 |-- recovered: string (nullable = true)
 |-- todayRecovered: string (nullable = true)
 |-- active: string (nullable = true)
 |-- critical: string (nullable = true)
 |-- casesPerOneMillion: string (nullable = true)
 |-- deathsPerOneMillion: string (nullable = true)
 |-- tests: string (nullable = true)
 |-- testsPerOneMillion: string (nullable = true)
 |-- population: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- oneCasePerPeople: string (nullable = true)
 |-- oneDeathPerPeople: string (nullable = true)
 |-- oneTestPerPeople: string (nullable = true)
 |-- activePerOneMillion: string (nullable = true)
 |-- recoveredPerOneMillion: string (nullable = true)
 |-- criticalPerOneMillion: 

In [60]:
from pyspark.sql.functions import col

# Whole-number columns (cast to int).
# NOTE(review): "int" is 32-bit; confirm no count (e.g. tests, population) can
# exceed 2,147,483,647, otherwise "long" would be the safer target type.
cols_int = [
    "cases", "todayCases", "deaths", "todayDeaths", "recovered", "todayRecovered",
    "active", "critical", "tests", "population", "oneCasePerPeople",
    "oneDeathPerPeople", "oneTestPerPeople",
]

# Fractional columns (cast to double).
cols_double = [
    "casesPerOneMillion", "deathsPerOneMillion", "testsPerOneMillion",
    "activePerOneMillion", "recoveredPerOneMillion", "criticalPerOneMillion",
]

# Ordered (column, target type) plan: the `updated` timestamp value first (long),
# then the int group, then the double group.
cast_plan = [("updated", "long")]
cast_plan += [(name, "int") for name in cols_int]
cast_plan += [(name, "double") for name in cols_double]

for name, spark_type in cast_plan:
    donnees_sanitaires_api = donnees_sanitaires_api.withColumn(
        name, col(name).cast(spark_type)
    )

donnees_sanitaires_api.printSchema()


root
 |-- updated: long (nullable = true)
 |-- country: string (nullable = true)
 |-- countryInfo: string (nullable = true)
 |-- cases: integer (nullable = true)
 |-- todayCases: integer (nullable = true)
 |-- deaths: integer (nullable = true)
 |-- todayDeaths: integer (nullable = true)
 |-- recovered: integer (nullable = true)
 |-- todayRecovered: integer (nullable = true)
 |-- active: integer (nullable = true)
 |-- critical: integer (nullable = true)
 |-- casesPerOneMillion: double (nullable = true)
 |-- deathsPerOneMillion: double (nullable = true)
 |-- tests: integer (nullable = true)
 |-- testsPerOneMillion: double (nullable = true)
 |-- population: integer (nullable = true)
 |-- continent: string (nullable = true)
 |-- oneCasePerPeople: integer (nullable = true)
 |-- oneDeathPerPeople: integer (nullable = true)
 |-- oneTestPerPeople: integer (nullable = true)
 |-- activePerOneMillion: double (nullable = true)
 |-- recoveredPerOneMillion: double (nullable = true)
 |-- criticalPerO

In [61]:
# Keep a single copy of rows that are identical across every column, then preview.
donnees_sanitaires_api = donnees_sanitaires_api.distinct()
donnees_sanitaires_api.show()

+-------------+--------------------+--------------------+--------+----------+------+-----------+---------+--------------+------+--------+------------------+-------------------+--------+------------------+----------+-------------+----------------+-----------------+----------------+-------------------+----------------------+---------------------+
|      updated|             country|         countryInfo|   cases|todayCases|deaths|todayDeaths|recovered|todayRecovered|active|critical|casesPerOneMillion|deathsPerOneMillion|   tests|testsPerOneMillion|population|    continent|oneCasePerPeople|oneDeathPerPeople|oneTestPerPeople|activePerOneMillion|recoveredPerOneMillion|criticalPerOneMillion|
+-------------+--------------------+--------------------+--------+----------+------+-----------+---------+--------------+------+--------+------------------+-------------------+--------+------------------+----------+-------------+----------------+-----------------+----------------+-------------------+-----

In [62]:
# Row count after de-duplication.
donnees_sanitaires_api.count()

231

In [63]:
# Strip leading/trailing whitespace from the text columns only.
# trim() always yields a string, so applying it to every column would silently
# undo the long/int/double casts done in an earlier cell, and the JDBC write
# further down would then create a table made only of text columns.
# Non-string columns are passed through as-is.
# A single select() is used instead of a withColumn() loop to keep the plan small.
from pyspark.sql.functions import col, trim

donnees_sanitaires_api = donnees_sanitaires_api.select(
    [
        trim(col(name)).alias(name) if dtype == "string" else col(name)
        for name, dtype in donnees_sanitaires_api.dtypes
    ]
)
donnees_sanitaires_api.show()

+-------------+--------------------+--------------------+--------+----------+------+-----------+---------+--------------+------+--------+------------------+-------------------+--------+------------------+----------+-------------+----------------+-----------------+----------------+-------------------+----------------------+---------------------+
|      updated|             country|         countryInfo|   cases|todayCases|deaths|todayDeaths|recovered|todayRecovered|active|critical|casesPerOneMillion|deathsPerOneMillion|   tests|testsPerOneMillion|population|    continent|oneCasePerPeople|oneDeathPerPeople|oneTestPerPeople|activePerOneMillion|recoveredPerOneMillion|criticalPerOneMillion|
+-------------+--------------------+--------------------+--------+----------+------+-----------+---------+--------------+------+--------+------------------+-------------------+--------+------------------+----------+-------------+----------------+-----------------+----------------+-------------------+-----

In [64]:
# PostgreSQL JDBC connection parameters.
# Credentials must not live in the notebook: the password is read from the
# POSTGRES_PASSWORD environment variable and, when that is unset or empty, asked
# for interactively (getpass does not echo it or store it in the notebook).
# The URL and user can be overridden through POSTGRES_JDBC_URL / POSTGRES_USER
# and otherwise keep the values previously used by this notebook.
import getpass
import os

url = os.environ.get("POSTGRES_JDBC_URL", "jdbc:postgresql://postgres:5432/post_db_collect")
user = os.environ.get("POSTGRES_USER", "admin")
password = os.environ.get("POSTGRES_PASSWORD") or getpass.getpass("PostgreSQL password: ")
driver = "org.postgresql.Driver"

In [65]:
# Connectivity check: read PostgreSQL's catalog view information_schema.tables
# over JDBC and display its first five rows.
df_pg = (
    spark.read
    .format("jdbc")
    .options(
        url=url,
        dbtable="information_schema.tables",
        user=user,
        password=password,
        driver=driver,
    )
    .load()
)

df_pg.show(5)


+---------------+------------+----------------+----------+----------------------------+--------------------+-------------------------+------------------------+----------------------+------------------+--------+-------------+
|  table_catalog|table_schema|      table_name|table_type|self_referencing_column_name|reference_generation|user_defined_type_catalog|user_defined_type_schema|user_defined_type_name|is_insertable_into|is_typed|commit_action|
+---------------+------------+----------------+----------+----------------------------+--------------------+-------------------------+------------------------+----------------------+------------------+--------+-------------+
|post_db_collect|  pg_catalog|    pg_statistic|BASE TABLE|                        NULL|                NULL|                     NULL|                    NULL|                  NULL|               YES|      NO|         NULL|
|post_db_collect|  pg_catalog|         pg_type|BASE TABLE|                        NULL|             

In [67]:
# Persist the cleaned API dataset to the PostgreSQL table "sante".
# mode("overwrite") replaces whatever the table currently holds.
(
    donnees_sanitaires_api.write
    .format("jdbc")
    .options(
        url=url,
        dbtable="sante",
        user=user,
        password=password,
        driver=driver,
    )
    .mode("overwrite")
    .save()
)

25/11/10 12:28:30 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                