In [5]:
from pyspark.sql import SparkSession

# Get (or create) the Spark session used throughout the notebook.
spark = (
    SparkSession.builder
    .appName("SparkDemo")
    .getOrCreate()
)

# Semicolon-separated CSV whose first line holds the column names.
df_test = (
    spark.read
    .option('header', True)
    .option('sep', ';')
    .csv('data/test.csv')
)
df_test.show(5)

+----+------+
| nom|prenom|
+----+------+
|test|  asdf|
|test|  qwer|
|test|  zxcv|
+----+------+



In [6]:
# Print the column names, types and nullability of the CSV DataFrame.
df_test.printSchema()

root
 |-- nom: string (nullable = true)
 |-- prenom: string (nullable = true)



Récupérer les informations de Spark

In [7]:
# Spark version, application name and master URL, one per line.
print(
    spark.version,
    spark.conf.get("spark.app.name"),
    spark.sparkContext.master,
    sep="\n",
)

3.4.1
SparkDemo
local[*]


In [8]:
# First three rows, returned to the driver as a list of Row objects.
df_test.take(3)

[Row(nom='test', prenom='asdf'),
 Row(nom='test', prenom='qwer'),
 Row(nom='test', prenom='zxcv')]

In [9]:
# Project the DataFrame down to the single `prenom` column.
df_test.select('prenom').show()

+------+
|prenom|
+------+
|  asdf|
|  qwer|
|  zxcv|
+------+



In [10]:
# List of (column name, type name) pairs for the DataFrame.
df_test.dtypes

[('nom', 'string'), ('prenom', 'string')]

In [11]:
# Summary statistics (count, mean, stddev, min, max) for every column;
# mean and stddev come back null here because both columns are strings.
df_test.describe().show()

+-------+----+------+
|summary| nom|prenom|
+-------+----+------+
|  count|   3|     3|
|   mean|null|  null|
| stddev|null|  null|
|    min|test|  asdf|
|    max|test|  zxcv|
+-------+----+------+



In [12]:
from pyspark.sql import functions as F

# Add a `Fullname` column built as "<prenom> <nom>".
df_test = df_test.withColumn(
    "Fullname",
    F.concat(F.col('prenom'), F.lit(' '), F.col('nom')),
)
df_test.show(5)

+----+------+---------+
| nom|prenom| Fullname|
+----+------+---------+
|test|  asdf|asdf test|
|test|  qwer|qwer test|
|test|  zxcv|zxcv test|
+----+------+---------+



In [13]:
# Read persons.json with the default (line-by-line) JSON reader. Lines that
# cannot be parsed on their own show up in a `_corrupt_record` column.
df_persons = spark.read.format('json').load('data/persons.json')
df_persons.show(n=10, truncate=False)

+---------------+----+----------+----+---------+
|_corrupt_record|age |city      |id  |name     |
+---------------+----+----------+----+---------+
|[              |null|null      |null|null     |
|null           |30  |Paris     |1   |Alice    |
|null           |25  |Lyon      |2   |Bob      |
|null           |35  |Marseille |3   |Céline   |
|null           |28  |Paris     |4   |David    |
|null           |40  |Bordeaux  |5   |Emma     |
|null           |22  |Nice      |6   |François |
|null           |31  |Strasbourg|7   |Gabrielle|
|null           |27  |Lille     |8   |Hugo     |
|null           |29  |Nantes    |9   |Inès     |
+---------------+----+----------+----+---------+
only showing top 10 rows



In [14]:
# Same read against people_no_corrupt_data.json, which parses without
# producing a `_corrupt_record` column.
df_persons = spark.read.format('json').load('data/people_no_corrupt_data.json')
df_persons.show(n=10, truncate=False)

+---+----------+---+---------+
|age|city      |id |name     |
+---+----------+---+---------+
|30 |Paris     |1  |Alice    |
|25 |Lyon      |2  |Bob      |
|35 |Marseille |3  |Céline   |
|28 |Paris     |4  |David    |
|40 |Bordeaux  |5  |Emma     |
|22 |Nice      |6  |François |
|31 |Strasbourg|7  |Gabrielle|
|27 |Lille     |8  |Hugo     |
|29 |Nantes    |9  |Inès     |
|33 |Toulouse  |10 |Julien   |
+---+----------+---+---------+
only showing top 10 rows



# Nettoyage des données corrompues

In [15]:
# Enable multi-line parsing so the reader no longer treats every physical
# line of persons.json as a separate JSON record.
df_persons = spark.read.json('data/persons.json', multiLine=True)
df_persons.show(n=10, truncate=False)

+---+----------+---+---------+
|age|city      |id |name     |
+---+----------+---+---------+
|30 |Paris     |1  |Alice    |
|25 |Lyon      |2  |Bob      |
|35 |Marseille |3  |Céline   |
|28 |Paris     |4  |David    |
|40 |Bordeaux  |5  |Emma     |
|22 |Nice      |6  |François |
|31 |Strasbourg|7  |Gabrielle|
|27 |Lille     |8  |Hugo     |
|29 |Nantes    |9  |Inès     |
|33 |Toulouse  |10 |Julien   |
+---+----------+---+---------+
only showing top 10 rows



In [16]:
# Clean-up after a default (line-by-line) read: remove the
# `_corrupt_record` column, then drop the rows left with every column null.
df_persons = (
    spark.read.json('data/persons.json')
    .drop('_corrupt_record')
    .dropna(how='all')
)

df_persons.show(10, truncate=False)

+---+----------+---+---------+
|age|city      |id |name     |
+---+----------+---+---------+
|30 |Paris     |1  |Alice    |
|25 |Lyon      |2  |Bob      |
|35 |Marseille |3  |Céline   |
|28 |Paris     |4  |David    |
|40 |Bordeaux  |5  |Emma     |
|22 |Nice      |6  |François |
|31 |Strasbourg|7  |Gabrielle|
|27 |Lille     |8  |Hugo     |
|29 |Nantes    |9  |Inès     |
|33 |Toulouse  |10 |Julien   |
+---+----------+---+---------+
only showing top 10 rows



# Réordonner les colonnes

In [17]:
# Show id, name and age in that order; `city` is not selected, so it is
# absent from the result.
df_persons.select(['id', 'name', 'age']).show(n=10, truncate=False)

+---+---------+---+
|id |name     |age|
+---+---------+---+
|1  |Alice    |30 |
|2  |Bob      |25 |
|3  |Céline   |35 |
|4  |David    |28 |
|5  |Emma     |40 |
|6  |François |22 |
|7  |Gabrielle|31 |
|8  |Hugo     |27 |
|9  |Inès     |29 |
|10 |Julien   |33 |
+---+---------+---+
only showing top 10 rows



In [18]:
from pyspark.sql import functions as F

# Highest age in the dataset, computed as a whole-DataFrame aggregate.
df_persons.agg(F.max('age')).show(n=10, truncate=False)

+--------+
|max(age)|
+--------+
|45      |
+--------+



In [19]:
# Number of persons per distinct age (row order is not sorted).
df_persons.groupby('age').count().show()

+---+-----+
|age|count|
+---+-----+
| 29|    3|
| 26|    3|
| 22|    2|
| 34|    2|
| 32|    2|
| 31|    2|
| 39|    2|
| 25|    2|
| 27|    4|
| 41|    2|
| 28|    3|
| 33|    3|
| 37|    2|
| 35|    2|
| 36|    2|
| 21|    1|
| 38|    2|
| 30|    3|
| 42|    1|
| 23|    2|
+---+-----+
only showing top 20 rows



In [20]:
# Ten youngest persons: sort ascending by age and show the first rows.
df_persons.sort('age').show(n=10, truncate=False)

+---+----------+---+--------+
|age|city      |id |name    |
+---+----------+---+--------+
|21 |Bordeaux  |25 |Yasmine |
|22 |Nice      |6  |François|
|22 |Paris     |32 |Fanny   |
|23 |Strasbourg|17 |Quentin |
|23 |Nantes    |39 |Marc    |
|24 |Lyon      |13 |Mathieu |
|24 |Marseille |44 |Salomé  |
|25 |Lyon      |2  |Bob     |
|25 |Bordeaux  |35 |Ismaël  |
|26 |Nantes    |29 |Charles |
+---+----------+---+--------+
only showing top 10 rows



In [21]:
from pyspark.sql import functions as F

# Keep only the persons whose name starts with "A" or "a".
pers = df_persons.where(F.col('name').rlike('^[Aa]'))

pers.show()

+---+----------+---+------+
|age|      city| id|  name|
+---+----------+---+------+
| 30|     Paris|  1| Alice|
| 33|Strasbourg| 27|Adrien|
+---+----------+---+------+



# Jointures

In [1]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

# Get (or create) the Spark session for the join section.
spark = SparkSession.builder.appName("SparkDemo").getOrCreate()


def _read_json_records(path):
    """Read a JSON file line by line and discard the unparseable lines.

    The `_corrupt_record` column (if any) is removed, then only rows in
    which *every* remaining column is null are dropped. Using
    ``how='all'`` instead of the default ``how='any'`` keeps valid records
    that merely have one missing field.

    NOTE(review): assumes the files put one JSON object per line, possibly
    wrapped in `[` / `]` lines -- confirm against the data files.

    Parameters
    ----------
    path : str
        Location of the JSON file to load.

    Returns
    -------
    pyspark.sql.DataFrame
        The parsed records without the `_corrupt_record` column.
    """
    return spark.read.json(path).drop('_corrupt_record').dropna(how='all')


admirals = _read_json_records("data/admirals.json")
battles = _read_json_records("data/napoleonic_battles.json")

admirals.show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/08 11:56:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/08 11:56:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


+--------------------+--------------------+---+-----------+
|             admiral|             battles| id|nationality|
+--------------------+--------------------+---+-----------+
|      Horatio Nelson|[Battle of Cape S...|  1|    British|
|Pierre-Charles Vi...|[Battle of the Ni...|  2|     French|
|    Federico Gravina|[Battle of Trafal...|  3|    Spanish|
|Cuthbert Collingwood|[Battle of Trafal...|  4|    British|
|   Thomas Troubridge|[Battle of Cape S...|  5|    British|
|Louis-René Levass...|[Battle of Cape N...|  6|     French|
|      James Saumarez|[Battle of Algeci...|  7|    British|
|       Edward Pellew|[Battle of Basque...|  8|    British|
|         Samuel Hood|[Battle of Toulon...|  9|    British|
|Pierre Jean Vanst...|[Battle of Camper...| 10|     French|
|         John Jervis|[Battle of Cape S...| 11|    British|
|         Adam Duncan|[Battle of Camper...| 12|    British|
|Jean-Baptiste Perrée|[Battle of the Ni...| 13|     French|
|     Thomas Cochrane|[Battle of Basque.

In [2]:
# Preview the battles DataFrame (long values are truncated in the display).
battles.show()

+--------------------+---+--------------------+--------------------+----------+--------------+----+
|        belligerents| id|            location|                name|   outcome|        winner|year|
+--------------------+---+--------------------+--------------------+----------+--------------+----+
|France vs Britain...|  1|      Toulon, France|    Battle of Toulon|   Victory|        France|1793|
|   Britain vs France|  2|      Atlantic Ocean|Glorious First of...|   Victory|       Britain|1794|
|   Britain vs France|  3|       Bay of Biscay|     Battle of Groix|   Victory|       Britain|1795|
|    Britain vs Spain|  4|        off Portugal|Battle of Cape St...|   Victory|       Britain|1797|
|Britain vs Nether...|  5|     off Netherlands|Battle of Camperdown|   Victory|       Britain|1797|
|   Britain vs France|  6|   Nile Delta, Egypt|  Battle of the Nile|   Victory|       Britain|1798|
|   Britain vs France|  7|         off Ireland|Battle of Tory Is...|   Victory|       Britain|1798|


In [6]:
# One output row per (admiral, battle) pair: unpack the `battles` array into
# a scalar `battle` column, and rename `id` to `admiral_id` so it stays
# distinct from the `id` column of the battles DataFrame.
admirals_exp = (
    admirals
    .withColumn("battle", F.explode("battles"))
    .withColumnRenamed("id", "admiral_id")
)

admirals_exp.show()

+--------------------+--------------------+----------+-----------+--------------------+
|             admiral|             battles|admiral_id|nationality|              battle|
+--------------------+--------------------+----------+-----------+--------------------+
|      Horatio Nelson|[Battle of Cape S...|         1|    British|Battle of Cape St...|
|      Horatio Nelson|[Battle of Cape S...|         1|    British|  Battle of the Nile|
|      Horatio Nelson|[Battle of Cape S...|         1|    British|Battle of Copenhagen|
|      Horatio Nelson|[Battle of Cape S...|         1|    British| Battle of Trafalgar|
|Pierre-Charles Vi...|[Battle of the Ni...|         2|     French|  Battle of the Nile|
|Pierre-Charles Vi...|[Battle of the Ni...|         2|     French| Battle of Trafalgar|
|    Federico Gravina|[Battle of Trafal...|         3|    Spanish| Battle of Trafalgar|
|Cuthbert Collingwood|[Battle of Trafal...|         4|    British| Battle of Trafalgar|
|Cuthbert Collingwood|[Battle of

In [7]:
# Inner join: keep only the battle/admiral pairs whose battle names match
# exactly.
joined = battles.join(
    admirals_exp,
    on=battles["name"] == admirals_exp["battle"],
    how="inner",
)

joined.show()

+--------------------+---+-----------------+--------------------+-------+------------+----+--------------------+--------------------+----------+-----------+--------------------+
|        belligerents| id|         location|                name|outcome|      winner|year|             admiral|             battles|admiral_id|nationality|              battle|
+--------------------+---+-----------------+--------------------+-------+------------+----+--------------------+--------------------+----------+-----------+--------------------+
|France vs Britain...|  1|   Toulon, France|    Battle of Toulon|Victory|      France|1793|William Sidney Smith|[Battle of Toulon...|        19|    British|    Battle of Toulon|
|France vs Britain...|  1|   Toulon, France|    Battle of Toulon|Victory|      France|1793|         Samuel Hood|[Battle of Toulon...|         9|    British|    Battle of Toulon|
|   Britain vs France|  2|   Atlantic Ocean|Glorious First of...|Victory|     Britain|1794|Jean-François Ren..

In [8]:
# Persist the joined result to HDFS as Parquet. Overwrite any previous export
# so the cell can be re-run: the default save mode raises an error when the
# target path already exists.
joined.write.format("parquet").mode("overwrite").save("hdfs://namenode:9000/user/spark/joined_admirals_battles")

                                                                                