In [1]:
import findspark
import collections
from pyspark import SparkContext
from pyspark.sql import SparkSession

findspark.init()

spark = SparkSession.builder \
    .appName("JupyterLocalSpark") \
    .master("local[*]") \
    .getOrCreate()

sc: SparkContext = spark.sparkContext

In [2]:
lines = sc.textFile("data/fakefriends.csv")
lines.take(5)

['0,Will,33,385',
 '1,Jean-Luc,26,2',
 '2,Hugh,55,221',
 '3,Deanna,40,465',
 '4,Quark,68,21']

In [3]:
from pyspark.sql import Row

def transform(line:str):
    line = line.split(',')
    return Row(
        ID=int(line[0]),
        name=str(line[1].encode("utf-8")),
        age=int(line[2]),
        numFriends=int(line[3])
    )
    
rdd = lines.map(transform)
rdd.take(5)

[Row(ID=0, name="b'Will'", age=33, numFriends=385),
 Row(ID=1, name="b'Jean-Luc'", age=26, numFriends=2),
 Row(ID=2, name="b'Hugh'", age=55, numFriends=221),
 Row(ID=3, name="b'Deanna'", age=40, numFriends=465),
 Row(ID=4, name="b'Quark'", age=68, numFriends=21)]

In [4]:
schemaPeople = spark.createDataFrame(rdd).cache()
schemaPeople.createOrReplaceTempView("people")

In [5]:
teenagers = spark.sql(
    """
        SELECT * 
        FROM people 
        WHERE age >= 13 AND age <= 19
    """)

teenagers.take(5)

[Row(ID=21, name="b'Miles'", age=19, numFriends=268),
 Row(ID=52, name="b'Beverly'", age=19, numFriends=269),
 Row(ID=54, name="b'Brunt'", age=19, numFriends=5),
 Row(ID=106, name="b'Beverly'", age=18, numFriends=499),
 Row(ID=115, name="b'Dukat'", age=18, numFriends=397)]