In [1]:
from pyspark.sql import SparkSession, Row

# Build (or reuse an already running) SparkSession for this example.
spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

# Everything that uses the session runs under try/finally so the session is
# stopped even when a step raises (missing input file, a line whose second
# field is not an integer, a SQL error, ...). Without this, a failure would
# skip spark.stop() and leave the session running.
try:
    sc = spark.sparkContext

    # Load a text file and convert each line to a Row.
    lines = sc.textFile("/tools/spark-2.4.5-bin-hadoop2.7/examples/src/main/resources/people.txt")

    # Split each comma-separated line into its raw string fields.
    parts = lines.map(lambda l: l.split(","))
    print(parts.collect())

    # The second field keeps the space that follows the comma; int() accepts
    # surrounding whitespace, so no explicit strip is needed for the age.
    people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
    print(people.collect())

    # Infer the schema, and register the DataFrame as a table.
    schemaPeople = spark.createDataFrame(people)
    schemaPeople.createOrReplaceTempView("people")

    # SQL can be run over DataFrames that have been registered as a table.
    teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")

    # The results of SQL queries are DataFrame objects.
    # rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.
    teenNames = teenagers.rdd.map(lambda p: "Name: " + p.name).collect()
    for name in teenNames:
        print(name)

    # Same rows fetched through the DataFrame API: collect() returns Row
    # objects, and index 0 is the only selected column ("name").
    teenNames = teenagers.select(teenagers["name"]).collect()
    for name in teenNames:
        print(name[0])
finally:
    # Always release the session, whether the steps above succeeded or not.
    spark.stop()



[['Michael', ' 29'], ['Andy', ' 30'], ['Justin', ' 19']]
[Row(age=29, name='Michael'), Row(age=30, name='Andy'), Row(age=19, name='Justin')]
Name: Justin
Justin
