Skip to content

Commit f8bfa30

Browse files
authored
Update II. Manipulating data.py
1 parent e1717f6 commit f8bfa30

File tree

1 file changed

+20
-0
lines changed

1 file changed

+20
-0
lines changed

Introduction to PySpark/II. Manipulating data.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@
1212
- .filter() -> like the WHERE clause in SQL: keeps only the rows that satisfy a condition
1313
> flights.filter("air_time > 120").show() # condition passed as a SQL string (what would follow WHERE)
1414
> flights.filter(flights.air_time > 120).show() # condition passed as a boolean Column expression
15+
16+
- .select() -> returns only the columns you specify
17+
| > selectNoStr = flights.select(flights.origin, flights.dest, flights.carrier)
18+
| > selectStr = flights.select("tailnum", "origin", "dest")
19+
|
20+
- .withColumn() -> returns all existing columns plus the newly defined (or replaced) column.
1521
"""
1622
#|
1723
#|
@@ -52,3 +58,17 @@
5258
#|
5359
#|
5460
### Selecting
61+
# Select the first set of columns, passing the column names as strings
62+
selected1 = flights.select("tailnum", "origin", "dest")
63+
64+
# Select the second set of columns, passing them as flights.<column> attributes
65+
temp = flights.select(flights.origin, flights.dest, flights.carrier)
66+
67+
# Define first filter: origin equals "SEA"
68+
filterA = flights.origin == "SEA"
69+
70+
# Define second filter: dest equals "PDX"
71+
filterB = flights.dest == "PDX"
72+
73+
# Filter temp (which keeps origin and dest), first by filterA then by filterB
74+
selected2 = temp.filter(filterA).filter(filterB)

0 commit comments

Comments
 (0)