In [8]:
import findspark
findspark.init()
import pyspark, collections
from pyspark.sql import SparkSession, Row

# Obtain the notebook's SparkSession: getOrCreate() reuses an existing session
# if there is one, otherwise it builds a new one named "FriendsByAge".
# NOTE(review): the warehouse directory is a hard-coded Windows-style file URI;
# confirm it is appropriate for the machine this notebook runs on.
spark = (
    SparkSession.builder
    .config("spark.sql.warehouse.dir", "file:////C:/temp")
    .appName("FriendsByAge")
    .getOrCreate()
)

def mapper(line):
    """Parse one comma-separated text line into a ``Row``.

    The line is split on ``','`` and its first four fields are read by
    position:

    * field 0 -> ``ID`` (converted with ``int``)
    * field 1 -> ``name`` (kept as the raw string)
    * field 2 -> ``age`` (converted with ``int``)
    * field 3 -> ``numFriends`` (converted with ``int``)

    Any fields after the fourth are ignored.

    Raises:
        IndexError: if the line has fewer than four comma-separated fields.
        ValueError: if a numeric field cannot be converted by ``int``.
    """
    parts = line.split(',')
    # Convert in positional order so a malformed line fails on its first bad field.
    person_id = int(parts[0])
    person_name = parts[1]
    person_age = int(parts[2])
    friend_count = int(parts[3])
    return Row(
        ID=person_id,
        name=person_name,
        age=person_age,
        numFriends=friend_count,
    )

# Read the CSV as plain text lines, then parse every line into a Row with mapper().
lines = spark.sparkContext.textFile("fakefriends.csv")
people = lines.map(mapper)

# Build a DataFrame from the Rows (Spark infers the schema) and cache it,
# since it backs both the SQL query and the aggregation below.
schemaPeople = spark.createDataFrame(people).cache()

# Register the DataFrame as a temporary view named "people" so SQL can query it.
schemaPeople.createOrReplaceTempView("people")

# SQL can be run over a DataFrame that has been registered as a view:
# keep only the people aged 13 through 19 inclusive.
teenagers = spark.sql("SELECT * FROM people WHERE age >=13 AND age<=19")

# collect() returns the matching rows to the driver so each one can be printed.
for teen in teenagers.collect():
    print(teen)

# DataFrame methods can be used instead of SQL: count people per age, ordered by age.
(
    schemaPeople
    .groupBy("age")
    .count()
    .orderBy('age')
    .show()
)





Row(ID=21, age=19, name='Miles', numFriends=268)
Row(ID=52, age=19, name='Beverly', numFriends=269)
Row(ID=54, age=19, name='Brunt', numFriends=5)
Row(ID=106, age=18, name='Beverly', numFriends=499)
Row(ID=115, age=18, name='Dukat', numFriends=397)
Row(ID=133, age=19, name='Quark', numFriends=265)
Row(ID=136, age=19, name='Will', numFriends=335)
Row(ID=225, age=19, name='Elim', numFriends=106)
Row(ID=304, age=19, name='Will', numFriends=404)
Row(ID=341, age=18, name='Data', numFriends=326)
Row(ID=366, age=19, name='Keiko', numFriends=119)
Row(ID=373, age=19, name='Quark', numFriends=272)
Row(ID=377, age=18, name='Beverly', numFriends=418)
Row(ID=404, age=18, name='Kasidy', numFriends=24)
Row(ID=409, age=19, name='Nog', numFriends=267)
Row(ID=439, age=18, name='Data', numFriends=417)
Row(ID=444, age=18, name='Keiko', numFriends=472)
Row(ID=492, age=19, name='Dukat', numFriends=36)
Row(ID=494, age=18, name='Kasidy', numFriends=194)
+---+-----+
|age|count|
+---+-----+
| 18|    8|
| 19|   

In [9]:
# Evaluating the bare name as the cell's last expression displays the DataFrame's
# repr (column names and types, as in the output below) rather than its rows.
teenagers

DataFrame[ID: bigint, age: bigint, name: string, numFriends: bigint]