In [3]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start one
# named "SparkDemo".
spark = (
    SparkSession.builder
    .appName("SparkDemo")
    .getOrCreate()
)

# Load the semicolon-delimited CSV, taking column names from its first line.
df_test = spark.read.options(header=True, sep=';').csv('data/test.csv')

# Preview at most five rows.
df_test.show(n=5)

+----+------+
| nom|prenom|
+----+------+
|test|  asdf|
|test|  qwer|
|test|  zxcv|
+----+------+



In [4]:
# Print the DataFrame schema as a tree: one line per column with its type
# and nullability.
df_test.printSchema()

root
 |-- nom: string (nullable = true)
 |-- prenom: string (nullable = true)



Récupérer les informations de Spark

In [6]:
# Report the Spark version, the configured application name and the master
# URL, one value per line.
print(
    spark.version,
    spark.conf.get("spark.app.name"),
    spark.sparkContext.master,
    sep="\n",
)

3.4.1
SparkDemo
local[*]


In [7]:
# Fetch the first three rows as a list of Row objects.
df_test.take(3)

[Row(nom='test', prenom='asdf'),
 Row(nom='test', prenom='qwer'),
 Row(nom='test', prenom='zxcv')]

In [8]:
# Keep only the `prenom` column and print it.
df_test.select('prenom').show()

+------+
|prenom|
+------+
|  asdf|
|  qwer|
|  zxcv|
+------+



In [9]:
# List every column as a (name, type) pair.
df_test.dtypes

[('nom', 'string'), ('prenom', 'string')]

In [10]:
# Summary statistics per column (count, mean, stddev, min, max). The columns
# here are strings, so mean and stddev come back null.
df_test.describe().show()

+-------+----+------+
|summary| nom|prenom|
+-------+----+------+
|  count|   3|     3|
|   mean|null|  null|
| stddev|null|  null|
|    min|test|  asdf|
|    max|test|  zxcv|
+-------+----+------+



In [19]:
from pyspark.sql import functions as F

# Add a "Fullname" column: first name, one space, then last name.
# Note: this rebinds `df_test`, so earlier previews no longer show every column.
df_test = df_test.withColumn(
    "Fullname",
    F.concat(F.col('prenom'), F.lit(' '), F.col('nom')),
)
df_test.show(n=5)

+----+------+---------+
| nom|prenom| Fullname|
+----+------+---------+
|test|  asdf|asdf test|
|test|  qwer|qwer test|
|test|  zxcv|zxcv test|
+----+------+---------+



In [25]:
# Read persons.json with the reader's default settings.
# NOTE(review): the result has a `_corrupt_record` column whose first row
# holds "[", so the file is presumably a JSON array spread over several lines
# rather than one object per line - confirm against the file.
df_persons = spark.read.json(path='data/persons.json')
df_persons.show(n=10, truncate=False)

+---------------+----+----------+----+---------+
|_corrupt_record|age |city      |id  |name     |
+---------------+----+----------+----+---------+
|[              |null|null      |null|null     |
|null           |30  |Paris     |1   |Alice    |
|null           |25  |Lyon      |2   |Bob      |
|null           |35  |Marseille |3   |Céline   |
|null           |28  |Paris     |4   |David    |
|null           |40  |Bordeaux  |5   |Emma     |
|null           |22  |Nice      |6   |François |
|null           |31  |Strasbourg|7   |Gabrielle|
|null           |27  |Lille     |8   |Hugo     |
|null           |29  |Nantes    |9   |Inès     |
+---------------+----+----------+----+---------+
only showing top 10 rows



In [26]:
# Same default read, this time against people_no_corrupt_data.json; the
# result has no `_corrupt_record` column.
df_persons = spark.read.json(path='data/people_no_corrupt_data.json')
df_persons.show(n=10, truncate=False)

+---+----------+---+---------+
|age|city      |id |name     |
+---+----------+---+---------+
|30 |Paris     |1  |Alice    |
|25 |Lyon      |2  |Bob      |
|35 |Marseille |3  |Céline   |
|28 |Paris     |4  |David    |
|40 |Bordeaux  |5  |Emma     |
|22 |Nice      |6  |François |
|31 |Strasbourg|7  |Gabrielle|
|27 |Lille     |8  |Hugo     |
|29 |Nantes    |9  |Inès     |
|33 |Toulouse  |10 |Julien   |
+---+----------+---+---------+
only showing top 10 rows



# Nettoyage des données corrompues

In [30]:
# Re-read persons.json in multi-line mode, so a JSON value may span several
# lines; the `_corrupt_record` column no longer appears.
df_persons = spark.read.json('data/persons.json', multiLine=True)
df_persons.show(n=10, truncate=False)

+---+----------+---+---------+
|age|city      |id |name     |
+---+----------+---+---------+
|30 |Paris     |1  |Alice    |
|25 |Lyon      |2  |Bob      |
|35 |Marseille |3  |Céline   |
|28 |Paris     |4  |David    |
|40 |Bordeaux  |5  |Emma     |
|22 |Nice      |6  |François |
|31 |Strasbourg|7  |Gabrielle|
|27 |Lille     |8  |Hugo     |
|29 |Nantes    |9  |Inès     |
|33 |Toulouse  |10 |Julien   |
+---+----------+---+---------+
only showing top 10 rows



In [32]:
# Alternative clean-up after a default read: discard the `_corrupt_record`
# column, then remove rows in which every remaining column is null.
df_persons = (
    spark.read.json('data/persons.json')
    .drop('_corrupt_record')
    .dropna(how='all')
)

df_persons.show(n=10, truncate=False)

+---+----------+---+---------+
|age|city      |id |name     |
+---+----------+---+---------+
|30 |Paris     |1  |Alice    |
|25 |Lyon      |2  |Bob      |
|35 |Marseille |3  |Céline   |
|28 |Paris     |4  |David    |
|40 |Bordeaux  |5  |Emma     |
|22 |Nice      |6  |François |
|31 |Strasbourg|7  |Gabrielle|
|27 |Lille     |8  |Hugo     |
|29 |Nantes    |9  |Inès     |
|33 |Toulouse  |10 |Julien   |
+---+----------+---+---------+
only showing top 10 rows



# Réordonner les colonnes

In [33]:
# Show id, name and age in that order; `city` is not selected, so it is
# left out of the result.
df_persons.select(['id', 'name', 'age']).show(n=10, truncate=False)

+---+---------+---+
|id |name     |age|
+---+---------+---+
|1  |Alice    |30 |
|2  |Bob      |25 |
|3  |Céline   |35 |
|4  |David    |28 |
|5  |Emma     |40 |
|6  |François |22 |
|7  |Gabrielle|31 |
|8  |Hugo     |27 |
|9  |Inès     |29 |
|10 |Julien   |33 |
+---+---------+---+
only showing top 10 rows



In [35]:
# `F` is also imported in an earlier cell; the import is repeated here so
# this cell runs on its own.
from pyspark.sql import functions as F

# Largest value in the `age` column, shown as a one-row DataFrame.
df_persons.select(F.max(F.col('age'))).show(n=10, truncate=False)

+--------+
|max(age)|
+--------+
|45      |
+--------+



In [38]:
# Number of rows for each distinct age. No sort is applied, so rows appear
# in whatever order Spark returns them; at most 20 rows are printed.
df_persons.groupBy('age').count().show(n=20)

+---+-----+
|age|count|
+---+-----+
| 29|    3|
| 26|    3|
| 22|    2|
| 34|    2|
| 32|    2|
| 31|    2|
| 39|    2|
| 25|    2|
| 27|    4|
| 41|    2|
| 28|    3|
| 33|    3|
| 37|    2|
| 35|    2|
| 36|    2|
| 21|    1|
| 38|    2|
| 30|    3|
| 42|    1|
| 23|    2|
+---+-----+
only showing top 20 rows



In [39]:
# Sort by age in ascending order and print the first ten rows in full.
df_persons.sort('age').show(n=10, truncate=False)

+---+----------+---+--------+
|age|city      |id |name    |
+---+----------+---+--------+
|21 |Bordeaux  |25 |Yasmine |
|22 |Nice      |6  |François|
|22 |Paris     |32 |Fanny   |
|23 |Strasbourg|17 |Quentin |
|23 |Nantes    |39 |Marc    |
|24 |Lyon      |13 |Mathieu |
|24 |Marseille |44 |Salomé  |
|25 |Lyon      |2  |Bob     |
|25 |Bordeaux  |35 |Ismaël  |
|26 |Nantes    |29 |Charles |
+---+----------+---+--------+
only showing top 10 rows

