In [50]:
import pandas as pd
import numpy as np

In [3]:
# Build a small pandas DataFrame with two integer columns, A and B.
# DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0, so
# the plain constructor is used instead; `columns=` pins the A, B order
# regardless of Python / pandas version.
pdf = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, columns=['A', 'B'])
pdf

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [4]:
# Attribute-style access: returns column A of the pandas frame as a Series.
pdf.A

0    1
1    2
2    3
Name: A, dtype: int64

In [6]:
# Obtain the session explicitly so this cell also works on a fresh, plain
# Python kernel; getOrCreate() hands back the already-running session when
# the kernel (e.g. a pyspark shell) has pre-defined `spark`.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Spark counterpart of the pandas frame: rows are given as tuples and the
# column names as a separate list.
df = spark.createDataFrame([(1, 4), (2, 5), (3, 6)], ["A", "B"])
df.show()

+---+---+
|  A|  B|
+---+---+
|  1|  4|
|  2|  5|
|  3|  6|
+---+---+



In [10]:
# Attribute access and bracket access both work on a pandas frame and on a
# Spark frame. pandas prints the Series values; Spark prints only a Column
# reference. One print call with a newline separator emits the same four
# outputs, one after another.
print(pdf.A, pdf['A'], df.A, df['A'], sep='\n')

0    1
1    2
2    3
Name: A, dtype: int64
0    1
1    2
2    3
Name: A, dtype: int64
Column<b'A'>
Column<b'A'>


In [11]:
# Assigning a scalar to a new key adds column C to pdf, filled with 0 in
# every row.
pdf['C'] = 0
pdf

Unnamed: 0,A,B,C
0,1,4,0
1,2,5,0
2,3,6,0


In [14]:
# Intentional failure: withColumn() needs a Column expression, so a bare
# Python literal is rejected. The error is caught and printed so the notebook
# still survives "Restart Kernel -> Run All"; df is left unchanged.
# Older PySpark raises AssertionError here, newer releases raise a TypeError.
try:
    df = df.withColumn('C', 0)
except (AssertionError, TypeError) as err:
    print("{}: {}".format(type(err).__name__, err))

AssertionError: col should be Column

In [13]:
# F.lit(0) supplies the constant as a Column expression, which withColumn()
# accepts where a bare 0 is rejected with "col should be Column".
from pyspark.sql import functions as F
# Rebind df so that later cells see the new constant column C.
df = df.withColumn('C', F.lit(0))
df.show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  1|  4|  0|
|  2|  5|  0|
|  3|  6|  0|
+---+---+---+



In [15]:
# A comparison on a Column yields a boolean Column. The result replaces C in
# the displayed frame only; df itself is not reassigned here.
df.withColumn('C', df.B > 0).show()

+---+---+----+
|  A|  B|   C|
+---+---+----+
|  1|  4|true|
|  2|  5|true|
|  3|  6|true|
+---+---+----+



In [16]:
# Arithmetic between two Columns is applied row by row: D = A * B.
df.withColumn('D', df.A * df.B).show()

+---+---+---+---+
|  A|  B|  C|  D|
+---+---+---+---+
|  1|  4|  0|  4|
|  2|  5|  0| 10|
|  3|  6|  0| 18|
+---+---+---+---+



In [19]:
# select() accepts a Column expression; the output column is auto-named
# "(C > 0)". Every row is false because C is 0 everywhere.
df.select(df.C > 0).show()

+-------+
|(C > 0)|
+-------+
|  false|
|  false|
|  false|
+-------+



In [21]:
# Intentional failure: 'B' is a plain Python string, so 'B' > 0 is evaluated
# by Python itself (str vs int) and raises before select() is ever called.
# The TypeError is caught and printed so the notebook can still be run from
# top to bottom.
try:
    df.select('B' > 0).show()
except TypeError as err:
    print("{}: {}".format(type(err).__name__, err))

TypeError: unorderable types: str() > int()

In [22]:
# Working form of the comparison: df['B'] is a Column, so `> 0` builds a
# Column expression that select() can evaluate.
df.select(df['B'] > 0).show()

+-------+
|(B > 0)|
+-------+
|   true|
|   true|
|   true|
+-------+



In [25]:
# alias() gives the computed column a readable name instead of the
# auto-generated "(B > 0)".
df.select((df.B > 0).alias("is_positive")).show()

+-----------+
|is_positive|
+-----------+
|       true|
|       true|
|       true|
+-----------+



In [27]:
# pandas row filter: combine boolean masks with & (each comparison must be
# parenthesised) and index the frame with the result.
pdf[(pdf.A > 1) & (pdf.B > 4)]

Unnamed: 0,A,B,C
1,2,5,0
2,3,6,0


In [30]:
# Spark equivalent of the pandas mask filter: pass the combined boolean
# Column expression to filter().
df.filter((df.A > 1) & (df.B > 4)).show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  2|  5|  0|
|  3|  6|  0|
+---+---+---+



In [33]:
# Bracket indexing with a boolean Column also works on a Spark DataFrame and
# selects the same rows as filter().
df[(df.A > 1) & (df.B > 4)].show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  2|  5|  0|
|  3|  6|  0|
+---+---+---+



In [34]:
# A pandas DataFrame can be passed straight to createDataFrame(); the column
# names A, B, C carry over.
spark.createDataFrame(pdf).show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  1|  4|  0|
|  2|  5|  0|
|  3|  6|  0|
+---+---+---+



In [36]:
# Mean of B within each group of A; Spark names the result column "avg(B)".
(
    df.groupBy('A')
      .avg('B')
      .show()
)

+---+------+
|  A|avg(B)|
+---+------+
|  1|   4.0|
|  3|   6.0|
|  2|   5.0|
+---+------+



In [42]:
# pandas counterpart: mean of B per value of A. Selecting ['B'] before the
# aggregation makes the result a Series indexed by A.
pdf.groupby('A')['B'].mean()

A
1    4
2    5
3    6
Name: B, dtype: int64

In [48]:
# Summary statistics (count, mean, stddev, min, max) over the per-group
# averages. Spelled groupBy for consistency with the other Spark cells;
# groupby is an alias of the same method.
df.groupBy('A').avg('B').describe().show()

+-------+---+------+
|summary|  A|avg(B)|
+-------+---+------+
|  count|  3|     3|
|   mean|2.0|   5.0|
| stddev|1.0|   1.0|
|    min|  1|   4.0|
|    max|  3|   6.0|
+-------+---+------+



In [49]:
# Several aggregates at once: agg() takes one Column expression per output
# column, and each output is named after the function applied
# (count(B), max(B), min(C)).
(
    df.groupBy('A')
      .agg(F.count('B'), F.max('B'), F.min('C'))
      .show()
)

+---+--------+------+------+
|  A|count(B)|max(B)|min(C)|
+---+--------+------+------+
|  1|       1|     4|     0|
|  3|       1|     6|     0|
|  2|       1|     5|     0|
+---+--------+------+------+



In [60]:
# pandas counterpart: a list of function names applied to one column yields
# one output column per function.
(
    pdf.groupby('A')['B']
       .agg(['count', 'max', 'mean'])
)

Unnamed: 0_level_0,count,max,mean
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,4,4
2,1,5,5
3,1,6,6


In [64]:
# A dict maps each column to its own aggregate: count of B and min of C,
# computed per value of A.
(
    pdf.groupby('A')
       .agg({'B': 'count', 'C': 'min'})
)

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,1,0
3,1,0
