In [1]:
import pandas as pd
import numpy as np

# Fix the global NumPy seed so the random sample below is identical on every run.
np.random.seed(13)

# Keyword arguments are evaluated left to right, so the columns are drawn from
# the random stream in the order n, group, abool.
pandas_dataframe = pd.DataFrame(
    dict(
        n=np.random.randn(20),
        group=np.random.choice(["x", "y", "z"], 20),
        abool=np.random.choice([True, False], 20),
    )
)

In [2]:
import pyspark

# `spark` is the session handle used below to build DataFrames.
# getOrCreate() returns an existing session if there is one, else starts a new one.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# Convert the local pandas frame into a Spark DataFrame.
df = spark.createDataFrame(data=pandas_dataframe)

In [7]:
# Print the first 3 rows.
df.show(n=3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



In [8]:
# Print the first 7 rows.
df.show(n=7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



In [9]:
# Summary statistics (count, mean, stddev, min, max). The output covers n and
# group; the boolean column abool does not appear in the summary.
df.describe().show()

+-------+------------------+-----+
|summary|                 n|group|
+-------+------------------+-----+
|  count|                20|   20|
|   mean|0.3664026449885217| null|
| stddev|0.8905322898155363| null|
|    min|-1.261605945319069|    x|
|    max|2.1503829673811126|    z|
+-------+------------------+-----+



In [10]:
# Project only the group and abool columns (show() prints up to 20 rows by default).
df.select(['group', 'abool']).show()

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
|    y|false|
|    z|false|
|    x|false|
|    z| true|
|    y| true|
|    x|false|
|    y|false|
|    y| true|
|    y| true|
|    y|false|
|    x|false|
|    z|false|
|    y| true|
|    x|false|
|    x| true|
+-----+-----+



In [11]:
# List the DataFrame's column names, in order.
df.columns

['n', 'group', 'abool']

In [12]:
# Rename abool in the projection only; df itself is unchanged.
df.select('group', df['abool'].alias('a_boolean_value')).show(n=3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



In [13]:
# Rename n in the projection only; df itself is unchanged.
df.select('group', df['n'].alias('a_numeric_value')).show(n=8)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
|    z|  1.3501878997225267|
|    x|  0.8612113741693206|
+-----+--------------------+
only showing top 8 rows



In [15]:
from pyspark.sql.functions import col
# Same select as the previous cell, but the column is referenced by name with
# col() instead of as an attribute of df -- two ways to do the same thing.
df.select('group', col('n').alias('a_numeric_value')).show(n=8)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
|    z|  1.3501878997225267|
|    x|  0.8612113741693206|
+-----+--------------------+
only showing top 8 rows



In [20]:
# Column arithmetic: add a constant to every value of n.
df.select('n', df['n'] + 4).show(n=5)

+--------------------+------------------+
|                   n|           (n + 4)|
+--------------------+------------------+
|  -0.712390662050588|3.2876093379494122|
|   0.753766378659703| 4.753766378659703|
|-0.04450307833805...|3.9554969216619464|
| 0.45181233874578974|  4.45181233874579|
|  1.3451017084510097|5.3451017084510095|
+--------------------+------------------+
only showing top 5 rows



In [21]:
# Column arithmetic: subtract a constant from every value of n.
df.select('n', df['n'] - 5).show(n=5)

+--------------------+-------------------+
|                   n|            (n - 5)|
+--------------------+-------------------+
|  -0.712390662050588| -5.712390662050588|
|   0.753766378659703| -4.246233621340297|
|-0.04450307833805...| -5.044503078338053|
| 0.45181233874578974|  -4.54818766125421|
|  1.3451017084510097|-3.6548982915489905|
+--------------------+-------------------+
only showing top 5 rows



In [22]:
# Column arithmetic: multiply every value of n by a constant.
df.select('n', df['n'] * 2).show(n=2)

+------------------+------------------+
|                 n|           (n * 2)|
+------------------+------------------+
|-0.712390662050588|-1.424781324101176|
| 0.753766378659703| 1.507532757319406|
+------------------+------------------+
only showing top 2 rows



In [23]:
# Column arithmetic: divide every value of n by a constant.
df.select('n', df['n'] / 2).show(n=2)

+------------------+------------------+
|                 n|           (n / 2)|
+------------------+------------------+
|-0.712390662050588|-0.356195331025294|
| 0.753766378659703|0.3768831893298515|
+------------------+------------------+
only showing top 2 rows



In [24]:
# Add n2 = -n as a new column and rebind df to the result.
# withColumn replaces an existing column of the same name, so re-running this
# cell keeps a single n2 column instead of appending a duplicate, which
# select('*', ...) would do.
df = df.withColumn('n2', df.n * -1)

In [25]:
# Print the first 4 rows, now including n2.
df.show(n=4)

+--------------------+-----+-----+--------------------+
|                   n|group|abool|                  n2|
+--------------------+-----+-----+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|
|   0.753766378659703|    x|false|  -0.753766378659703|
|-0.04450307833805...|    z|false|0.044503078338053455|
| 0.45181233874578974|    y|false|-0.45181233874578974|
+--------------------+-----+-----+--------------------+
only showing top 4 rows



In [26]:
# Add n3 = n * n as a new column and rebind df to the result.
# withColumn replaces an existing column of the same name, so re-running this
# cell keeps a single n3 column instead of appending a duplicate, which
# select('*', ...) would do.
df = df.withColumn('n3', df.n * df.n)
df.show(n=5)

+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|    x|false|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|    z|false|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|    y|false|-0.45181233874578974| 0.20413438944294027|
|  1.3451017084510097|    z|false| -1.3451017084510097|  1.8092986060778251|
+--------------------+-----+-----+--------------------+--------------------+
only showing top 5 rows



In [28]:
# Adding two columns outside of a select only builds a Column object that
# describes the expression; the cell output is that Column.
df['group'] + df['abool']

Column<'(group + abool)'>

In [29]:
# Selecting the expression makes Spark analyze it, and the analysis fails with
# a data type mismatch between the group and abool operands.
# The error is the point of this cell, so catch it and print it: the message
# stays visible, and the cells below still run on Restart & Run All.
from pyspark.sql.utils import AnalysisException

try:
    df.select(df.group + df.abool)
except AnalysisException as analysis_error:
    print(f"{type(analysis_error).__name__}: {analysis_error}")

AnalysisException: cannot resolve '(CAST(group AS DOUBLE) + abool)' due to data type mismatch: differing types in '(CAST(group AS DOUBLE) + abool)' (double and boolean).;
'Project [unresolvedalias((cast(group#1 as double) + abool#2), Some(org.apache.spark.sql.Column$$Lambda$3285/0x0000000801268840@2ea17cc8))]
+- Project [n#0, group#1, abool#2, n2#310, (n#0 * n#0) AS n3#332]
   +- Project [n#0, group#1, abool#2, (n#0 * cast(-1 as double)) AS n2#310]
      +- LogicalRDD [n#0, group#1, abool#2], false


In [30]:
# The select above produces an error about the incompatible types. Unlike the bare
# column expression in the cell before it, this expression is used inside a .select,
# so even though no values are produced yet (no action has been invoked), Spark
# analyzes the query and detects that the types are incompatible.

In [31]:
# Print each column's name, data type and nullability, including the derived
# columns n2 and n3.
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)
 |-- n2: double (nullable = true)
 |-- n3: double (nullable = true)

