### ![Spark Logo Tiny](https://files.training.databricks.com/images/105/logo_spark_tiny.png) Filtrar filas por condiciones sobre columnas (filter / where)

In [None]:
from pyspark.sql.functions import lit, concat, col

# Sample employee rows; each tuple follows the column order of employee_schema.
# Employee 30 has gender=None so the null-handling filters below have a row to match.
employee_data = [
    (10, "Raj", "Kumar", "1999", "100", "M", 2000),
    (20, "Rahul", "Rajan", "2002", "200", "F", 3000),
    (30, "Raghav", "Manish", "2010", "100", None, 5000),
    (40, "Raja", "Singh", "2004", "100", "F", 1000),
    (50, "Rama", "Krish", "2008", "400", "M", 8000),
    (60, "Rasul", "Kutty", "2014", "500", "M", 7000),
    (70, "Kumar", "Chand", "2004", "600", "M", 7000),
]

# Only column names are given, so Spark infers every column type from the values
# (doj and employee_dept_id are quoted above and therefore become string columns).
employee_schema = [
    "employee_id",
    "first_name",
    "last_name",
    "doj",
    "employee_dept_id",
    "gender",
    "salary",
]

df = spark.createDataFrame(employee_data, employee_schema)
df.printSchema()

root
 |-- employee_id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- doj: string (nullable = true)
 |-- employee_dept_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [None]:
# Print the unfiltered DataFrame without truncating cell values
# (n=20 is show()'s default row limit, spelled out here).
df.show(n=20, truncate=False)

+-----------+----------+---------+----+----------------+------+------+
|employee_id|first_name|last_name|doj |employee_dept_id|gender|salary|
+-----------+----------+---------+----+----------------+------+------+
|10         |Raj       |Kumar    |1999|100             |M     |2000  |
|20         |Rahul     |Rajan    |2002|200             |F     |3000  |
|30         |Raghav    |Manish   |2010|100             |null  |5000  |
|40         |Raja      |Singh    |2004|100             |F     |1000  |
|50         |Rama      |Krish    |2008|400             |M     |8000  |
|60         |Rasul     |Kutty    |2014|500             |M     |7000  |
|70         |Kumar     |Chand    |2004|600             |M     |7000  |
+-----------+----------+---------+----+----------------+------+------+



#### Forma 1 (Errónea)

In [None]:
# Intentionally wrong: 'salary' is a plain Python string here, not a Spark column.
# Python evaluates 'salary' >= 1000 itself (str vs int) before filter() is ever
# called, which raises the TypeError shown in the output below.
df_modif = df.filter('salary' >= 1000)

# Never reached: the line above raises first.
df_modif.show(truncate=False)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File <command-804623631765907>:1
----> 1 df_modif = df.filter('salary' >= 1000)
      3 df_modif.show(truncate=False)

TypeError: '>=' not supported between instances of 'str' and 'int'

#### Forma 2

In [None]:
# The whole condition is passed as one SQL expression string, so the comparison
# is evaluated by Spark rather than by Python.
# An equality test is written the same way, e.g. df.filter('salary = 5000').
salary_condition_sql = 'salary >= 5000'
df_modif = df.filter(salary_condition_sql)

df_modif.show(truncate=False)

+-----------+----------+---------+----+----------------+------+------+
|employee_id|first_name|last_name|doj |employee_dept_id|gender|salary|
+-----------+----------+---------+----+----------------+------+------+
|30         |Raghav    |Manish   |2010|100             |null  |5000  |
|50         |Rama      |Krish    |2008|400             |M     |8000  |
|60         |Rasul     |Kutty    |2014|500             |M     |7000  |
|70         |Kumar     |Chand    |2004|600             |M     |7000  |
+-----------+----------+---------+----+----------------+------+------+



#### Forma 3

In [None]:
# Build the predicate as a Column expression with col(), then filter on it.
salary_pred_col = col('salary') >= 5000
df_modif = df.filter(salary_pred_col)

df_modif.show(truncate=False)

+-----------+----------+---------+----+----------------+------+------+
|employee_id|first_name|last_name|doj |employee_dept_id|gender|salary|
+-----------+----------+---------+----+----------------+------+------+
|30         |Raghav    |Manish   |2010|100             |null  |5000  |
|50         |Rama      |Krish    |2008|400             |M     |8000  |
|60         |Rasul     |Kutty    |2014|500             |M     |7000  |
|70         |Kumar     |Chand    |2004|600             |M     |7000  |
+-----------+----------+---------+----+----------------+------+------+



#### Forma 4

In [None]:
# Same predicate, with the column reached through attribute access on the DataFrame.
salary_pred_attr = df.salary >= 5000
df_modif = df.filter(salary_pred_attr)

df_modif.show(truncate=False)

+-----------+----------+---------+----+----------------+------+------+
|employee_id|first_name|last_name|doj |employee_dept_id|gender|salary|
+-----------+----------+---------+----+----------------+------+------+
|30         |Raghav    |Manish   |2010|100             |null  |5000  |
|50         |Rama      |Krish    |2008|400             |M     |8000  |
|60         |Rasul     |Kutty    |2014|500             |M     |7000  |
|70         |Kumar     |Chand    |2004|600             |M     |7000  |
+-----------+----------+---------+----+----------------+------+------+



#### Forma 5

In [None]:
# Same predicate, with the column reached through bracket (item) access.
salary_pred_item = df["salary"] >= 5000
df_modif = df.filter(salary_pred_item)

df_modif.show(truncate=False)

+-----------+----------+---------+----+----------------+------+------+
|employee_id|first_name|last_name|doj |employee_dept_id|gender|salary|
+-----------+----------+---------+----+----------------+------+------+
|30         |Raghav    |Manish   |2010|100             |null  |5000  |
|50         |Rama      |Krish    |2008|400             |M     |8000  |
|60         |Rasul     |Kutty    |2014|500             |M     |7000  |
|70         |Kumar     |Chand    |2004|600             |M     |7000  |
+-----------+----------+---------+----+----------------+------+------+



#### Forma 6

In [None]:
# where() is used in place of filter(); the result matches the previous forms.
salary_pred_where = df["salary"] >= 5000
df_modif = df.where(salary_pred_where)

df_modif.show(truncate=False)

+-----------+----------+---------+----+----------------+------+------+
|employee_id|first_name|last_name|doj |employee_dept_id|gender|salary|
+-----------+----------+---------+----+----------------+------+------+
|30         |Raghav    |Manish   |2010|100             |null  |5000  |
|50         |Rama      |Krish    |2008|400             |M     |8000  |
|60         |Rasul     |Kutty    |2014|500             |M     |7000  |
|70         |Kumar     |Chand    |2004|600             |M     |7000  |
+-----------+----------+---------+----+----------------+------+------+



#### Ejemplos varios

In [None]:
# Exact-match filter on a numeric column, rendered with display().
salary_is_3000 = df.salary == 3000
display(df.filter(salary_is_3000))

employee_id,first_name,last_name,doj,employee_dept_id,gender,salary
20,Rahul,Rajan,2002,200,F,3000


In [None]:
# Equality filter on a string column, written as a col() expression.
gender_is_f = col('gender') == 'F'
df.filter(gender_is_f).show(truncate=False)

+-----------+----------+---------+----+----------------+------+------+
|employee_id|first_name|last_name|doj |employee_dept_id|gender|salary|
+-----------+----------+---------+----+----------------+------+------+
|20         |Rahul     |Rajan    |2002|200             |F     |3000  |
|40         |Raja      |Singh    |2004|100             |F     |1000  |
+-----------+----------+---------+----+----------------+------+------+



In [None]:
# Two Column predicates combined with &. In Python & binds tighter than ==,
# so each comparison is built on its own before being combined.
# doj is a string column; comparing it to the integer 2004 relies on Spark's implicit cast.
gender_is_f = col('gender') == 'F'
doj_is_2004 = col('doj') == 2004
df.filter(gender_is_f & doj_is_2004).show(truncate=False)

+-----------+----------+---------+----+----------------+------+------+
|employee_id|first_name|last_name|doj |employee_dept_id|gender|salary|
+-----------+----------+---------+----+----------------+------+------+
|40         |Raja      |Singh    |2004|100             |F     |1000  |
+-----------+----------+---------+----+----------------+------+------+



In [None]:
# Same combined condition, with the columns reached through attribute access.
# doj is a string column; comparing it to the integer 2004 relies on Spark's implicit cast.
gender_is_f = df.gender == 'F'
doj_is_2004 = df.doj == 2004
display(df.filter(gender_is_f & doj_is_2004))

employee_id,first_name,last_name,doj,employee_dept_id,gender,salary
40,Raja,Singh,2004,100,F,1000


In [None]:
# Same combined condition, written as a single SQL expression string.
female_2004_sql = "gender == 'F' and doj == 2004"
df.filter(female_2004_sql).show(truncate=False)

+-----------+----------+---------+----+----------------+------+------+
|employee_id|first_name|last_name|doj |employee_dept_id|gender|salary|
+-----------+----------+---------+----+----------------+------+------+
|40         |Raja      |Singh    |2004|100             |F     |1000  |
+-----------+----------+---------+----+----------------+------+------+



In [None]:
# Same combined condition, using where() and bracket access to the columns.
gender_is_f = df["gender"] == 'F'
doj_is_2004 = df["doj"] == 2004
df.where(gender_is_f & doj_is_2004).show(truncate=False)

+-----------+----------+---------+----+----------------+------+------+
|employee_id|first_name|last_name|doj |employee_dept_id|gender|salary|
+-----------+----------+---------+----+----------------+------+------+
|40         |Raja      |Singh    |2004|100             |F     |1000  |
+-----------+----------+---------+----+----------------+------+------+



##### Utilizando “startswith”

In [None]:
# Keep the rows whose first_name begins with 'Raj' (column via attribute access).
name_starts_with_raj = df.first_name.startswith('Raj')
display(df.filter(name_starts_with_raj))

employee_id,first_name,last_name,doj,employee_dept_id,gender,salary
10,Raj,Kumar,1999,100,M,2000
40,Raja,Singh,2004,100,F,1000


In [None]:
# Same prefix filter, with the column referenced through col().
name_starts_with_raj = col('first_name').startswith('Raj')
display(df.filter(name_starts_with_raj))

employee_id,first_name,last_name,doj,employee_dept_id,gender,salary
10,Raj,Kumar,1999,100,M,2000
40,Raja,Singh,2004,100,F,1000


##### Utilizando “endswith”

In [None]:
# Keep the rows whose first_name ends with 'a'.
name_ends_with_a = col('first_name').endswith('a')
display(df.filter(name_ends_with_a))

employee_id,first_name,last_name,doj,employee_dept_id,gender,salary
40,Raja,Singh,2004,100,F,1000
50,Rama,Krish,2008,400,M,8000


##### Utilizando “contains”

In [None]:
# Keep the rows whose first_name contains the substring 'Kumar'.
name_contains_kumar = col('first_name').contains('Kumar')
display(df.filter(name_contains_kumar))

employee_id,first_name,last_name,doj,employee_dept_id,gender,salary
70,Kumar,Chand,2004,600,M,7000


##### Utilizando “isNull”

In [None]:
# Keep only the rows where gender is missing (null).
gender_missing = col('gender').isNull()
display(df.filter(gender_missing))

employee_id,first_name,last_name,doj,employee_dept_id,gender,salary
30,Raghav,Manish,2010,100,,5000


##### Utilizando “isNotNull”

In [None]:
# Keep only the rows where gender has a value.
gender_present = col('gender').isNotNull()
display(df.filter(gender_present))

employee_id,first_name,last_name,doj,employee_dept_id,gender,salary
10,Raj,Kumar,1999,100,M,2000
20,Rahul,Rajan,2002,200,F,3000
40,Raja,Singh,2004,100,F,1000
50,Rama,Krish,2008,400,M,8000
60,Rasul,Kutty,2014,500,M,7000
70,Kumar,Chand,2004,600,M,7000


##### Utilizando “isin”

In [None]:
# Keep the rows whose department id is one of the listed values.
# employee_dept_id is a string column, so matching the integers 200 and 400
# relies on Spark's implicit cast.
dept_in_list = col('employee_dept_id').isin(200, 400)
display(df.filter(dept_in_list))

employee_id,first_name,last_name,doj,employee_dept_id,gender,salary
20,Rahul,Rajan,2002,200,F,3000
50,Rama,Krish,2008,400,M,8000


(El operador `~` niega la condición: equivale a usar `NOT IN` en SQL, es decir, devuelve las filas cuyo valor no se encuentra en la lista.)

In [None]:
# ~ negates the isin() predicate: keep the rows whose department id is NOT in the list.
dept_in_list = col('employee_dept_id').isin(200, 400)
display(df.filter(~dept_in_list))

employee_id,first_name,last_name,doj,employee_dept_id,gender,salary
10,Raj,Kumar,1999,100,M,2000
30,Raghav,Manish,2010,100,,5000
40,Raja,Singh,2004,100,F,1000
60,Rasul,Kutty,2014,500,M,7000
70,Kumar,Chand,2004,600,M,7000


##### Utilizando “like”

In [None]:
# SQL LIKE pattern match: % stands for any run of characters on either side of 'Kumar'.
name_like_kumar = col('first_name').like('%Kumar%')
display(df.filter(name_like_kumar))

employee_id,first_name,last_name,doj,employee_dept_id,gender,salary
70,Kumar,Chand,2004,600,M,7000
