In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

In [3]:
# sc = SparkContext().getOrCreate()

In [4]:
spark = SparkSession.builder.appName('Basicspyspark').getOrCreate()

In [None]:
df = spark.createDataFrame([(1, 4), (2, 5), (3, 6)], ["A", "B"])

In [None]:
df.count()

In [None]:
df.show()

In [None]:
# Returns Column Object
df.A

In [10]:
df.select('A').show()

+---+
|  A|
+---+
|  1|
|  2|
|  3|
+---+



In [11]:
# Adding a new column
df.withColumn('C', df.A+1).show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  1|  4|  2|
|  2|  5|  3|
|  3|  6|  4|
+---+---+---+



In [12]:
from pyspark.sql.functions import lit
df.withColumn('C', lit(5)).show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  1|  4|  5|
|  2|  5|  5|
|  3|  6|  5|
+---+---+---+



In [13]:
df = df.withColumn('C', lit(5))

In [14]:
df.show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  1|  4|  5|
|  2|  5|  5|
|  3|  6|  5|
+---+---+---+



In [15]:
df.select('A', (df.A>2).alias('State')).show()

+---+-----+
|  A|State|
+---+-----+
|  1|false|
|  2|false|
|  3| true|
+---+-----+



In [16]:
df[(df.A>2)].show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  3|  6|  5|
+---+---+---+



### GroupBy

In [17]:
df = spark.createDataFrame([('a', 33), ('b', 11), ('a', 22)], ['names', 'age'])

In [18]:
gdf = df.groupBy(df.names)

In [19]:
gdf.agg({"*":"count"}).collect()

[Row(names='b', count(1)=1), Row(names='a', count(1)=2)]

In [20]:
from pyspark.sql import functions as F

In [21]:
df = spark.createDataFrame([('a', 33), ('b', 11), ('a', 22)], ['names', 'age'])
gdf = df.groupBy(df.names)

sorted(gdf.agg(F.min(df.age)).collect())

[Row(names='a', min(age)=22), Row(names='b', min(age)=11)]

In [22]:
g2df = df.groupBy(df.names)
g2df.min('age').collect()

[Row(names='b', min(age)=11), Row(names='a', min(age)=22)]

In [35]:
# spark.stop()

### Generate your own DataFrame
Create stringRDD RDD and then convert it into a DataFrame when we're reading stringJSONRDD using spark.read.json.

In [24]:
# Generate our own JSON data 
stringJSONRDD = sc.parallelize((""" 
  { "id": "123",
    "name": "Katie",
    "age": 19,
    "eyeColor": "brown"
  }""",
   """{
    "id": "234",
    "name": "Michael",
    "age": 22,
    "eyeColor": "green"
  }""", 
  """{
    "id": "345",
    "name": "Simone",
    "age": 23,
    "eyeColor": "blue"
  }""")
)

In [25]:
# Create DataFrame
swimmersJSON = spark.read.json(stringJSONRDD)

In [27]:
# Create temporary table
swimmersJSON.createOrReplaceTempView("swimmersJSON")

In [28]:
swimmersJSON.show()

+---+--------+---+-------+
|age|eyeColor| id|   name|
+---+--------+---+-------+
| 19|   brown|123|  Katie|
| 22|   green|234|Michael|
| 23|    blue|345| Simone|
+---+--------+---+-------+



In [29]:
# SQL Query
spark.sql("select * from swimmersJSON").collect()

[Row(age=19, eyeColor='brown', id='123', name='Katie'),
 Row(age=22, eyeColor='green', id='234', name='Michael'),
 Row(age=23, eyeColor='blue', id='345', name='Simone')]

In [30]:
spark.sql("select * from swimmersJSON")

DataFrame[age: bigint, eyeColor: string, id: string, name: string]

### Inferring the Schema Using Reflection
Note that Apache Spark is inferring the schema using reflection; i.e. it automaticlaly determines the schema of the data based on reviewing the JSON data.


In [31]:
# Print the scheme
swimmersJSON.printSchema()

root
 |-- age: long (nullable = true)
 |-- eyeColor: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



Notice that Spark was able to determine infer the schema (when reviewing the schema using .printSchema).

But what if we want to programmatically specify the schema?

#### Programmatically Specifying the Schema

In this case, let's specify the schema for a CSV text file.

In [33]:
from pyspark.sql.types import *

stringCSVRDD = sc.parallelize([(123, 'Katie', 19, 'brown'), (234, 'Michael', 22, 'green'), (345, 'Simone', 23, 'blue')])

In [35]:
# The schema is encoded in a string, using StructType we define the schema using various pyspark.sql.types

schemaString = "id name age eyeColor"
schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyeColor", StringType(), True)
])

In [36]:
# Apply the schema to the RDD and Create DataFrame
swimmers = spark.createDataFrame(stringCSVRDD, schema)

In [38]:
# Create a temporary view using the DataFrame
swimmers.createOrReplaceTempView("swimmers")

In [39]:
# Print the schema
#   Notice that we have redefined id as Long (instead of String)
swimmers.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- eyeColor: string (nullable = true)



In [40]:
spark.sql("select * from swimmers")

DataFrame[id: bigint, name: string, age: bigint, eyeColor: string]

As you can see from above, we can programmatically apply the schema instead of allowing the Spark engine to infer the schema via reflection.

Additional Resources include:

- [PySpark API Reference](https://spark.apache.org/docs/2.0.0/api/python/pyspark.sql.html)
- [Spark SQL, DataFrames, and Datasets Guide](https://spark.apache.org/docs/latest/sql-programming-guide.html#programmatically-specifying-the-schema): This is in reference to Programmatically Specifying the Schema using a CSV file.

### Querying with dataframe

In [41]:
# Query id and age for swimmers with age = 22 via DataFrame API
swimmers.select("id", "age").filter("age = 22").show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [43]:
# Query id and age for swimmers with age = 22 via DataFrame API in another ways
swimmers.select(swimmers.id, swimmers.age).filter(swimmers.age == 22).show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [45]:
# Query id and age for swimmer with age = 22 in SQL
spark.sql("select id, age \
          from swimmers \
          where age =22").show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [47]:
spark.sql("select id, age \
           from swimmers \
           where age = 22")

DataFrame[id: bigint, age: bigint]

In [48]:
# Query name and eye color for swimmer with eye color starting with the letter 'b'
spark.sql("select name, eyeColor \
            from swimmers \
            where eyeColor \
            like 'b%'").show()

+------+--------+
|  name|eyeColor|
+------+--------+
| Katie|   brown|
|Simone|    blue|
+------+--------+



In [50]:
spark.sql("select name, eyeColor\
            from swimmers \
            where eyeColor like 'b%'")

DataFrame[name: string, eyeColor: string]

### Querying with the DataFrame API
With DataFrames, you can start writing your queries using the DataFrame API

In [51]:
# Show the values
swimmers.show()

+---+-------+---+--------+
| id|   name|age|eyeColor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



In [52]:
# Get count of rows
swimmers.count()

3

In [53]:
# Get the id, age where age = 22
swimmers.select("id", "age").filter("age = 22").show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [54]:
# Get the name, eyeColor where eyeColor like 'b%'
swimmers.select("name", "eyeColor").filter("eyeColor like 'b%'").show()

+------+--------+
|  name|eyeColor|
+------+--------+
| Katie|   brown|
|Simone|    blue|
+------+--------+



### DataFrame Queries
- Understanding explode, selectExpr

In [86]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [88]:


# Create the Departments
department1 = Row(id='123456', name='Computer Science')
department2 = Row(id='789012', name='Mechanical Engineering')
department3 = Row(id='345678', name='Theater and Drama')
department4 = Row(id='901234', name='Indoor Recreation')

# Create the employees
Employee = Row("firstName", "lastName", "email", "salary")
employee1 = Employee('michael', 'armbrust', 'no-reply@berkeley.edu', 100000)
employee11 = Employee('michael', 'armbrust', 'no-reply@berkeley.edu', 200000)
employee2 = Employee('xiangrui', 'meng', 'no-reply@stanford.edu', 120000)
employee3 = Employee('matei', None, 'no-reply@waterloo.edu', 140000)
employee31 = Employee('matei', None, 'no-reply@waterloo.edu', 180000)
employee4 = Employee(None, 'wendell', 'no-reply@berkeley.edu', 160000)


# Create the DepartmentWithEmployees instances from Departments and Employees
departmentWithEmployees1 = Row(department=department1, employees=[employee1, employee2])
departmentWithEmployees2 = Row(department=department2, employees=[employee3, employee4, employee11])
departmentWithEmployees3 = Row(department=department3, employees=[employee1, employee4, employee31])
departmentWithEmployees4 = Row(department=department4, employees=[employee2, employee3])

In [90]:
departmentsWithEmployeesSeq1= [departmentWithEmployees1, departmentWithEmployees2]
df1 = sqlContext.createDataFrame(departmentsWithEmployeesSeq1)

departmentsWithEmployeesSeq2 = [departmentWithEmployees3, departmentWithEmployees4]
df2 = sqlContext.createDataFrame(departmentsWithEmployeesSeq2)

In [92]:
df1.show()

+--------------------+--------------------+
|          department|           employees|
+--------------------+--------------------+
|[123456, Computer...|[[michael, armbru...|
|[789012, Mechanic...|[[matei,, no-repl...|
+--------------------+--------------------+



In [93]:
df2.show()

+--------------------+--------------------+
|          department|           employees|
+--------------------+--------------------+
|[345678, Theater ...|[[michael, armbru...|
|[901234, Indoor R...|[[xiangrui, meng,...|
+--------------------+--------------------+



In [95]:
unionDF = df1.unionAll(df2)

In [96]:
unionDF.show()

+--------------------+--------------------+
|          department|           employees|
+--------------------+--------------------+
|[123456, Computer...|[[michael, armbru...|
|[789012, Mechanic...|[[matei,, no-repl...|
|[345678, Theater ...|[[michael, armbru...|
|[901234, Indoor R...|[[xiangrui, meng,...|
+--------------------+--------------------+



In [97]:
from pyspark.sql.functions import explode

df = unionDF.select("department", explode("employees").alias("e"))

In [98]:
df.show()

+--------------------+--------------------+
|          department|                   e|
+--------------------+--------------------+
|[123456, Computer...|[michael, armbrus...|
|[123456, Computer...|[xiangrui, meng, ...|
|[789012, Mechanic...|[matei,, no-reply...|
|[789012, Mechanic...|[, wendell, no-re...|
|[789012, Mechanic...|[michael, armbrus...|
|[345678, Theater ...|[michael, armbrus...|
|[345678, Theater ...|[, wendell, no-re...|
|[345678, Theater ...|[matei,, no-reply...|
|[901234, Indoor R...|[xiangrui, meng, ...|
|[901234, Indoor R...|[matei,, no-reply...|
+--------------------+--------------------+



In [99]:
df.collect()

[Row(department=Row(id='123456', name='Computer Science'), e=Row(firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=100000)),
 Row(department=Row(id='123456', name='Computer Science'), e=Row(firstName='xiangrui', lastName='meng', email='no-reply@stanford.edu', salary=120000)),
 Row(department=Row(id='789012', name='Mechanical Engineering'), e=Row(firstName='matei', lastName=None, email='no-reply@waterloo.edu', salary=140000)),
 Row(department=Row(id='789012', name='Mechanical Engineering'), e=Row(firstName=None, lastName='wendell', email='no-reply@berkeley.edu', salary=160000)),
 Row(department=Row(id='789012', name='Mechanical Engineering'), e=Row(firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=200000)),
 Row(department=Row(id='345678', name='Theater and Drama'), e=Row(firstName='michael', lastName='armbrust', email='no-reply@berkeley.edu', salary=100000)),
 Row(department=Row(id='345678', name='Theater and Drama'), e=Row(fi

In [100]:
df.selectExpr("department.id", "department.name", "e.firstName", "e.lastName", "e.email", "e.salary").show()

+------+--------------------+---------+--------+--------------------+------+
|    id|                name|firstName|lastName|               email|salary|
+------+--------------------+---------+--------+--------------------+------+
|123456|    Computer Science|  michael|armbrust|no-reply@berkeley...|100000|
|123456|    Computer Science| xiangrui|    meng|no-reply@stanford...|120000|
|789012|Mechanical Engine...|    matei|    null|no-reply@waterloo...|140000|
|789012|Mechanical Engine...|     null| wendell|no-reply@berkeley...|160000|
|789012|Mechanical Engine...|  michael|armbrust|no-reply@berkeley...|200000|
|345678|   Theater and Drama|  michael|armbrust|no-reply@berkeley...|100000|
|345678|   Theater and Drama|     null| wendell|no-reply@berkeley...|160000|
|345678|   Theater and Drama|    matei|    null|no-reply@waterloo...|180000|
|901234|   Indoor Recreation| xiangrui|    meng|no-reply@stanford...|120000|
|901234|   Indoor Recreation|    matei|    null|no-reply@waterloo...|140000|

In [101]:
explodeDF = df.selectExpr("e.firstName", "e.lastName", "e.email", "e.salary")
explodeDF.show()

+---------+--------+--------------------+------+
|firstName|lastName|               email|salary|
+---------+--------+--------------------+------+
|  michael|armbrust|no-reply@berkeley...|100000|
| xiangrui|    meng|no-reply@stanford...|120000|
|    matei|    null|no-reply@waterloo...|140000|
|     null| wendell|no-reply@berkeley...|160000|
|  michael|armbrust|no-reply@berkeley...|200000|
|  michael|armbrust|no-reply@berkeley...|100000|
|     null| wendell|no-reply@berkeley...|160000|
|    matei|    null|no-reply@waterloo...|180000|
| xiangrui|    meng|no-reply@stanford...|120000|
|    matei|    null|no-reply@waterloo...|140000|
+---------+--------+--------------------+------+



In [102]:
filterDF = explodeDF.filter(explodeDF.firstName == 'xiangrui').sort(explodeDF.salary)

In [103]:
filterDF.show()

+---------+--------+--------------------+------+
|firstName|lastName|               email|salary|
+---------+--------+--------------------+------+
| xiangrui|    meng|no-reply@stanford...|120000|
| xiangrui|    meng|no-reply@stanford...|120000|
+---------+--------+--------------------+------+



In [105]:
from pyspark.sql.functions import col, asc, desc

In [106]:
filterDF = explodeDF.filter((col("firstName") == "xiangrui") | (col("firstName") == "michael")).sort(desc("lastName"))

In [107]:
filterDF.show()

+---------+--------+--------------------+------+
|firstName|lastName|               email|salary|
+---------+--------+--------------------+------+
| xiangrui|    meng|no-reply@stanford...|120000|
| xiangrui|    meng|no-reply@stanford...|120000|
|  michael|armbrust|no-reply@berkeley...|100000|
|  michael|armbrust|no-reply@berkeley...|100000|
|  michael|armbrust|no-reply@berkeley...|200000|
+---------+--------+--------------------+------+



In [108]:
filterDF = explodeDF.filter((col("firstName") == "xiangrui") | (col("firstName") == "michael")).sort(asc("lastName"))

In [109]:
filterDF.show()

+---------+--------+--------------------+------+
|firstName|lastName|               email|salary|
+---------+--------+--------------------+------+
|  michael|armbrust|no-reply@berkeley...|200000|
|  michael|armbrust|no-reply@berkeley...|100000|
|  michael|armbrust|no-reply@berkeley...|100000|
| xiangrui|    meng|no-reply@stanford...|120000|
| xiangrui|    meng|no-reply@stanford...|120000|
+---------+--------+--------------------+------+



### Handling Missing Data

In [110]:
from pyspark.sql.functions import col, asc, desc
filterNonNullDF = explodeDF.filter(col("firstName").isNotNull()).filter(col("lastName").isNotNull()).sort("email")

In [111]:
filterNonNullDF.show()

+---------+--------+--------------------+------+
|firstName|lastName|               email|salary|
+---------+--------+--------------------+------+
|  michael|armbrust|no-reply@berkeley...|200000|
|  michael|armbrust|no-reply@berkeley...|100000|
|  michael|armbrust|no-reply@berkeley...|100000|
| xiangrui|    meng|no-reply@stanford...|120000|
| xiangrui|    meng|no-reply@stanford...|120000|
+---------+--------+--------------------+------+



In [112]:
from pyspark.sql.functions import countDistinct, count

countDistinctDF = explodeDF.select("firstName", "lastName") \
                            .groupBy("firstName", "lastName") \
                            .agg(countDistinct("firstName"))

countDistinctDF.show()

+---------+--------+----------------+
|firstName|lastName|count(firstName)|
+---------+--------+----------------+
|     null| wendell|               0|
|    matei|    null|               1|
| xiangrui|    meng|               1|
|  michael|armbrust|               1|
+---------+--------+----------------+



In [113]:
countDistinctDF = explodeDF.select("firstName", "lastName") \
                            .groupBy("firstName", "lastName") \
                            .agg(count("*"))

countDistinctDF.show()

+---------+--------+--------+
|firstName|lastName|count(1)|
+---------+--------+--------+
|     null| wendell|       2|
| xiangrui|    meng|       2|
|  michael|armbrust|       3|
|    matei|    null|       3|
+---------+--------+--------+



In [115]:
explodeDF.describe("salary").show()

+-------+----------------+
|summary|          salary|
+-------+----------------+
|  count|              10|
|   mean|        142000.0|
| stddev|33266.5998663324|
|    min|          100000|
|    max|          200000|
+-------+----------------+



For more information, please refer to:

- [Spark SQL, DataFrames and Datasets Guide](http://spark.apache.org/docs/latest/sql-programming-guide.html#sql)
- PySpark SQL Module: DataFrame
- PySpark SQL Functions Module

### DropDuplicates

In [116]:
df = spark.createDataFrame([
        (1, 144.5, 5.9, 33, 'M'),
        (2, 167.2, 5.4, 45, 'M'),
        (3, 124.1, 5.2, 23, 'F'),
        (4, 144.5, 5.9, 33, 'M'),
        (5, 133.2, 5.7, 54, 'F'),
        (3, 124.1, 5.2, 23, 'F'),
        (5, 129.2, 5.3, 42, 'M'),
    ], ['id', 'weight', 'height', 'age', 'gender'])
df.show()

+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
|  1| 144.5|   5.9| 33|     M|
|  2| 167.2|   5.4| 45|     M|
|  3| 124.1|   5.2| 23|     F|
|  4| 144.5|   5.9| 33|     M|
|  5| 133.2|   5.7| 54|     F|
|  3| 124.1|   5.2| 23|     F|
|  5| 129.2|   5.3| 42|     M|
+---+------+------+---+------+



In [117]:
df = df.dropDuplicates()

In [118]:
df.show()

+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
|  5| 133.2|   5.7| 54|     F|
|  5| 129.2|   5.3| 42|     M|
|  1| 144.5|   5.9| 33|     M|
|  4| 144.5|   5.9| 33|     M|
|  2| 167.2|   5.4| 45|     M|
|  3| 124.1|   5.2| 23|     F|
+---+------+------+---+------+



In [119]:
df.count()

6

In [121]:
# Duplicates except for id column
df = df.dropDuplicates(subset=[c for c in df.columns if c != 'id'])

In [122]:
df.show()

+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
|  5| 133.2|   5.7| 54|     F|
|  1| 144.5|   5.9| 33|     M|
|  2| 167.2|   5.4| 45|     M|
|  3| 124.1|   5.2| 23|     F|
|  5| 129.2|   5.3| 42|     M|
+---+------+------+---+------+



In [123]:
[c for c in df.columns if c != 'id']

['weight', 'height', 'age', 'gender']

### Aggregation

In [124]:
import pyspark.sql.functions as F
df.agg(
    F.count('id').alias('count'),
    F.countDistinct('id').alias('distinct')
).show()

+-----+--------+
|count|distinct|
+-----+--------+
|    5|       4|
+-----+--------+



### More on Handling Missing Data

In [125]:
df_miss = spark.createDataFrame([
        (1, 143.5, 5.6, 28,   'M',  100000),
        (2, 167.2, 5.4, 45,   'M',  None),
        (3, None , 5.2, None, None, None),
        (4, 144.5, 5.9, 33,   'M',  None),
        (5, 133.2, 5.7, 54,   'F',  None),
        (6, 124.1, 5.2, None, 'F',  None),
        (7, 129.2, 5.3, 42,   'M',  76000),
    ], ['id', 'weight', 'height', 'age', 'gender', 'income'])

In [126]:
df_miss.show()

+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
|  1| 143.5|   5.6|  28|     M|100000|
|  2| 167.2|   5.4|  45|     M|  null|
|  3|  null|   5.2|null|  null|  null|
|  4| 144.5|   5.9|  33|     M|  null|
|  5| 133.2|   5.7|  54|     F|  null|
|  6| 124.1|   5.2|null|     F|  null|
|  7| 129.2|   5.3|  42|     M| 76000|
+---+------+------+----+------+------+



In [127]:
df_miss.printSchema()

root
 |-- id: long (nullable = true)
 |-- weight: double (nullable = true)
 |-- height: double (nullable = true)
 |-- age: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- income: long (nullable = true)



In [128]:
df_miss.describe()

DataFrame[summary: string, id: string, weight: string, height: string, age: string, gender: string, income: string]

In [129]:
# Calculate missing columns for each row
df_miss.rdd.collect()

[Row(id=1, weight=143.5, height=5.6, age=28, gender='M', income=100000),
 Row(id=2, weight=167.2, height=5.4, age=45, gender='M', income=None),
 Row(id=3, weight=None, height=5.2, age=None, gender=None, income=None),
 Row(id=4, weight=144.5, height=5.9, age=33, gender='M', income=None),
 Row(id=5, weight=133.2, height=5.7, age=54, gender='F', income=None),
 Row(id=6, weight=124.1, height=5.2, age=None, gender='F', income=None),
 Row(id=7, weight=129.2, height=5.3, age=42, gender='M', income=76000)]

In [130]:
df_miss.rdd.map(lambda r: (r['id'], sum([c == None for c in r]))).collect()

[(1, 0), (2, 1), (3, 4), (4, 1), (5, 1), (6, 2), (7, 0)]

In [131]:
df_miss.where('id == 3').show()

+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
|  3|  null|   5.2|null|  null|  null|
+---+------+------+----+------+------+



In [132]:
import pyspark.sql.functions as F

In [133]:
df_miss.agg(
    F.count('weight'), F.count('height'), F.count('age'), F.count('gender'), F.count('income'),
    F.count('*')
).show()

+-------------+-------------+----------+-------------+-------------+--------+
|count(weight)|count(height)|count(age)|count(gender)|count(income)|count(1)|
+-------------+-------------+----------+-------------+-------------+--------+
|            6|            7|         5|            6|            2|       7|
+-------------+-------------+----------+-------------+-------------+--------+



In [138]:
df_miss.agg(
    *[F.count(c) for c in df.columns]
).show()

+---------+-------------+-------------+----------+-------------+
|count(id)|count(weight)|count(height)|count(age)|count(gender)|
+---------+-------------+-------------+----------+-------------+
|        7|            6|            7|         5|            6|
+---------+-------------+-------------+----------+-------------+



In [147]:
df_miss.dropna(thresh=3).show()

+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
|  1| 143.5|   5.6|  28|     M|100000|
|  2| 167.2|   5.4|  45|     M|  null|
|  4| 144.5|   5.9|  33|     M|  null|
|  5| 133.2|   5.7|  54|     F|  null|
|  6| 124.1|   5.2|null|     F|  null|
|  7| 129.2|   5.3|  42|     M| 76000|
+---+------+------+----+------+------+

