In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, IntegerType, StructType
# Start (or reuse) the Spark session used by every cell in this notebook.
spark = SparkSession.builder.appName('Basics').getOrCreate()

# Locate the Spark installation. SPARK_HOME is honoured when it is set and
# non-empty; otherwise the original local install directory is used, so the
# notebook is no longer tied to a single machine's absolute path.
SPARK_HOME = os.environ.get('SPARK_HOME') or '/Users/lisaac/spark-2.2.0-bin-hadoop2.7'
PEOPLE_JSON_PATH = os.path.join(
    SPARK_HOME, 'examples', 'src', 'main', 'resources', 'people.json'
)

# Explicit schema: an integer 'age' and a string 'name', both declared
# nullable (third StructField argument is True).
data_schema = [
    StructField('age', IntegerType(), True),
    StructField('name', StringType(), True)
]
final_structure = StructType(fields=data_schema)

# Load the JSON file with the explicit schema instead of relying on inference.
df = spark.read.json(PEOPLE_JSON_PATH, schema=final_structure)

In [5]:
# Indexing the DataFrame with bracket (list-style) notation yields a Column object
age_column = df['age']
type(age_column)

pyspark.sql.column.Column

In [6]:
# The select method returns the column wrapped in a DataFrame, which is easier to work with
age_df = df.select('age')
type(age_df)

pyspark.sql.dataframe.DataFrame

In [8]:
# head(n) limits the result to the first n rows and returns them as a list of Row objects.
# /!\ The result is not a DataFrame. /!\
df.head(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [11]:
# Pass several column names to select to keep more than one column
age_and_name = df.select('age', 'name')
age_and_name.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [13]:
# withColumn adds a new column, or replaces an existing one that has the same name
doubled_age = df['age'] * 2
df_with_double_age = df.withColumn('double_age', doubled_age)
df_with_double_age.show()

+----+-------+----------+
| age|   name|double_age|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        60|
|  19| Justin|        38|
+----+-------+----------+



In [14]:
# withColumn does not persist its changes: df itself is left unchanged.
# To keep the new column, assign the returned DataFrame to a variable.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [17]:
# withColumnRenamed gives an existing column a new name. As with withColumn, df is not
# modified, so the returned DataFrame must be assigned to a variable to keep the rename.
df_age_renamed = df.withColumnRenamed('age', 'age_new_name')
df_age_renamed.show()

+------------+-------+
|age_new_name|   name|
+------------+-------+
|        null|Michael|
|          30|   Andy|
|          19| Justin|
+------------+-------+



In [18]:
# The data can also be queried through SQL. First, register the DataFrame as a
# temporary view named 'people'.
df.createOrReplaceTempView('people')

In [21]:
# Next, run the SQL query against the 'people' view with spark.sql and display the result
people_query = 'SELECT * FROM people'
results = spark.sql(people_query)
results.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

