In [1]:
from pyspark.sql import (
    functions as f,
    SparkSession,
    types as t
)

In [2]:
# Start (or reuse) the Spark session used by the missing-data examples.
spark = (
    SparkSession.builder
    .appName("df_missing_data")
    .getOrCreate()
)

In [3]:
# Load the sample CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("file:///home/jovyan/work/sample/null_data.csv")
)

In [4]:
# Preview the raw rows; the sample has nulls in both `occupation` and `salary`.
df.show()

+----+----------+------+
|  id|occupation|salary|
+----+----------+------+
|1000|  engineer|100000|
|2000|      null|  null|
|3000|      null| 50000|
|4000|   teacher| 80000|
|5000|    banker|120000|
+----+----------+------+



In [5]:
# Drop every row that contains a null in any column.
df.dropna(how="any").show()

+----+----------+------+
|  id|occupation|salary|
+----+----------+------+
|1000|  engineer|100000|
|4000|   teacher| 80000|
|5000|    banker|120000|
+----+----------+------+



In [7]:
# Keep only rows with at least two non-null values: the row whose sole
# non-null field is `id` is removed, the row missing just `occupation` stays.
df.dropna(thresh=2).show()

+----+----------+------+
|  id|occupation|salary|
+----+----------+------+
|1000|  engineer|100000|
|3000|      null| 50000|
|4000|   teacher| 80000|
|5000|    banker|120000|
+----+----------+------+



In [9]:
# Drop rows only when `salary` is null; nulls in other columns are tolerated.
df.dropna(subset=["salary"]).show()

+----+----------+------+
|  id|occupation|salary|
+----+----------+------+
|1000|  engineer|100000|
|3000|      null| 50000|
|4000|   teacher| 80000|
|5000|    banker|120000|
+----+----------+------+



In [10]:
# Inspect the inferred column types before filling: which columns a fill value
# touches depends on whether the column is a string or a number.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- occupation: string (nullable = true)
 |-- salary: integer (nullable = true)



In [11]:
# A string fill value is applied to string columns only, so the null in the
# integer `salary` column is left as-is.
df.fillna("engineer").show()

+----+----------+------+
|  id|occupation|salary|
+----+----------+------+
|1000|  engineer|100000|
|2000|  engineer|  null|
|3000|  engineer| 50000|
|4000|   teacher| 80000|
|5000|    banker|120000|
+----+----------+------+



In [12]:
# A numeric fill value is applied to numeric columns only, so the nulls in
# the string `occupation` column are left as-is.
df.fillna(0).show()

+----+----------+------+
|  id|occupation|salary|
+----+----------+------+
|1000|  engineer|100000|
|2000|      null|     0|
|3000|      null| 50000|
|4000|   teacher| 80000|
|5000|    banker|120000|
+----+----------+------+



In [13]:
# Restrict the fill to `occupation`; `salary` nulls are not touched.
df.fillna("NA", subset=["occupation"]).show()

+----+----------+------+
|  id|occupation|salary|
+----+----------+------+
|1000|  engineer|100000|
|2000|        NA|  null|
|3000|        NA| 50000|
|4000|   teacher| 80000|
|5000|    banker|120000|
+----+----------+------+



In [23]:
# Average of the non-null salaries. `collect()` returns a list of rows, so
# the scalar itself lives at mean_value[0][0].
mean_value = df.agg(f.mean("salary")).collect()
print(mean_value[0][0])

87500.0


In [24]:
# Impute missing salaries with the mean computed earlier (mean_value[0][0]).
# The mean is a float but `salary` is an integer column, so the filled value
# is displayed as an integer.
df.fillna(mean_value[0][0], subset=["salary"]).show()

+----+----------+------+
|  id|occupation|salary|
+----+----------+------+
|1000|  engineer|100000|
|2000|      null| 87500|
|3000|      null| 50000|
|4000|   teacher| 80000|
|5000|    banker|120000|
+----+----------+------+



In [25]:
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists, so if the earlier session is still alive this app name presumably
# does not replace the original one — confirm if the name matters.
spark = (
    SparkSession.builder
    .appName("df_manage_date")
    .getOrCreate()
)

# Rebinds `df` to the date sample; the missing-data frame is no longer
# reachable under this name from here on.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("file:///home/jovyan/work/sample/date_parsing.csv")
)

In [26]:
# Preview the date sample (`date`, `number`); the output is cut off after the
# first 20 rows.
df.show()

+----------+------+
|      date|number|
+----------+------+
|2022-01-27|  2723|
|2021-12-29|  1460|
|2022-01-22|  3411|
|2022-01-06|  1527|
|2022-04-21|  3978|
|2022-10-23|  3443|
|2021-12-23|  1641|
|2022-05-31|  1633|
|2021-12-29|  1072|
|2021-12-30|  2936|
|2022-05-04|  2494|
|2022-06-22|  2019|
|2022-04-23|  3804|
|2022-08-04|  1619|
|2022-01-26|  1306|
|2022-09-23|  3918|
|2022-05-27|  3209|
|2022-09-20|  2333|
|2022-07-05|  1861|
|2022-07-18|  3404|
+----------+------+
only showing top 20 rows



In [28]:
# Extract the calendar year of each `date` value.
df.select(
    f.year(f.col("date"))
).show()

+----------+
|year(date)|
+----------+
|      2022|
|      2021|
|      2022|
|      2022|
|      2022|
|      2022|
|      2021|
|      2022|
|      2021|
|      2021|
|      2022|
|      2022|
|      2022|
|      2022|
|      2022|
|      2022|
|      2022|
|      2022|
|      2022|
|      2022|
+----------+
only showing top 20 rows



In [30]:
# Extract the month number (1-12) of each `date` value.
df.select(
    f.month(f.col("date"))
).show()

+-----------+
|month(date)|
+-----------+
|          1|
|         12|
|          1|
|          1|
|          4|
|         10|
|         12|
|          5|
|         12|
|         12|
|          5|
|          6|
|          4|
|          8|
|          1|
|          9|
|          5|
|          9|
|          7|
|          7|
+-----------+
only showing top 20 rows



In [36]:
# Day within the month, exposed under the shorter column name `day`.
df.select(
    f.dayofmonth(f.col("date")).alias("day")
).show()

+---+
|day|
+---+
| 27|
| 29|
| 22|
|  6|
| 21|
| 23|
| 23|
| 31|
| 29|
| 30|
|  4|
| 22|
| 23|
|  4|
| 26|
| 23|
| 27|
| 20|
|  5|
| 18|
+---+
only showing top 20 rows



In [37]:
# Day within the year (e.g. 2021-12-29 -> 363), also aliased to `day`.
df.select(
    f.dayofyear(f.col("date")).alias("day")
).show()

+---+
|day|
+---+
| 27|
|363|
| 22|
|  6|
|111|
|296|
|357|
|151|
|363|
|364|
|124|
|173|
|113|
|216|
| 26|
|266|
|147|
|263|
|186|
|199|
+---+
only showing top 20 rows



In [38]:
# Append a derived `year` column next to the original columns. The result is
# only displayed; `df` itself is not modified.
df.withColumn(
    "year",
    f.year(f.col("date")),
).show()

+----------+------+----+
|      date|number|year|
+----------+------+----+
|2022-01-27|  2723|2022|
|2021-12-29|  1460|2021|
|2022-01-22|  3411|2022|
|2022-01-06|  1527|2022|
|2022-04-21|  3978|2022|
|2022-10-23|  3443|2022|
|2021-12-23|  1641|2021|
|2022-05-31|  1633|2022|
|2021-12-29|  1072|2021|
|2021-12-30|  2936|2021|
|2022-05-04|  2494|2022|
|2022-06-22|  2019|2022|
|2022-04-23|  3804|2022|
|2022-08-04|  1619|2022|
|2022-01-26|  1306|2022|
|2022-09-23|  3918|2022|
|2022-05-27|  3209|2022|
|2022-09-20|  2333|2022|
|2022-07-05|  1861|2022|
|2022-07-18|  3404|2022|
+----------+------+----+
only showing top 20 rows



In [39]:
# Average `number` per calendar year; the aggregate column keeps Spark's
# generated name `avg(number)`.
(
    df.withColumn("year", f.year("date"))
    .groupBy("year")
    .agg(f.mean("number"))
    .show()
)

+----+------------------+
|year|       avg(number)|
+----+------------------+
|2022|2540.6652806652805|
|2021| 2195.684210526316|
+----+------------------+



In [43]:
# Per-year average of `number`, named `avg` directly through an alias.
# NOTE: this rebinds `df` to the aggregated frame (`year`, `avg`), so the
# cell cannot be re-run without first reloading the date sample: the `date`
# and `number` columns it reads no longer exist afterwards.
df = (
    df.withColumn("year", f.year("date"))
    .groupBy("year")
    .agg(f.mean("number").alias("avg"))
)

In [44]:
# `df` now holds the aggregated result: one row per year with its `avg`.
df.show()

+----+------------------+
|year|               avg|
+----+------------------+
|2022|2540.6652806652805|
|2021| 2195.684210526316|
+----+------------------+



In [46]:
# Render `avg` for display with two decimals and a thousands separator
# (e.g. 2,540.67), keeping the column name `avg`.
df.select(
    "year",
    f.format_number("avg", 2).alias("avg"),
).show()

+----+--------+
|year|     avg|
+----+--------+
|2022|2,540.67|
|2021|2,195.68|
+----+--------+

