In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Entry point for the DataFrame API: build a session named "learnspark",
# or pick up the session that is already running in this kernel.
spark = (
    SparkSession.builder
    .appName("learnspark")
    .getOrCreate()
)

In [4]:
# Schema for the blog data, written as a Spark DDL string.
# Adjacent literals are concatenated by Python into one string, one column per line.
schema = (
    "id INT, "
    "first STRING, "
    "last STRING, "
    "url STRING, "
    "published STRING, "
    "hits INT, "
    "campaigns ARRAY<STRING>"
)

In [6]:
# Static sample rows; each row follows the column order of `schema`:
# id, first, last, url, published, hits, campaigns
data = [
    [1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
    [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
    [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web", "twitter", "FB", "LinkedIn"]],
    [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
    [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
    [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]],
]

In [7]:
# Build the blogs DataFrame from the static rows, typed by the DDL schema,
# then print it as a text table.
blogs_df = spark.createDataFrame(data=data, schema=schema)
blogs_df.show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| id|    first|   last|              url|published| hits|           campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [8]:
# printSchema() writes the schema tree to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
blogs_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- url: string (nullable = true)
 |-- published: string (nullable = true)
 |-- hits: integer (nullable = true)
 |-- campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)

None


In [9]:
# Show id, first name and campaigns for the blogs with more than 1000 hits.
(
    blogs_df
    .select('id', 'first', 'campaigns')
    .where(blogs_df.hits > 1000)
    .show()
)

+---+---------+--------------------+
| id|    first|           campaigns|
+---+---------+--------------------+
|  1|    Jules| [twitter, LinkedIn]|
|  2|   Brooke| [twitter, LinkedIn]|
|  3|    Denny|[web, twitter, FB...|
|  4|Tathagata|       [twitter, FB]|
|  5|    Matei|[web, twitter, FB...|
|  6|  Reynold| [twitter, LinkedIn]|
+---+---------+--------------------+



In [10]:
# Display the blogs with an extra derived column: hits divided by 5.
# withColumn returns a new DataFrame; blogs_df itself is left unchanged.
(
    blogs_df
    .withColumn('hits2', blogs_df.hits / 5)
    .show()
)

+---+---------+-------+-----------------+---------+-----+--------------------+------+
| id|    first|   last|              url|published| hits|           campaigns| hits2|
+---+---------+-------+-----------------+---------+-----+--------------------+------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]| 907.0|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|1781.6|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|1531.8|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|2113.6|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|8115.6|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|5113.6|
+---+---------+-------+-----------------+---------+-----+--------------------+------+



In [11]:
# A single-row DataFrame that reuses the schema of blogs_df.
newrow = spark.createDataFrame(
    data=[
        (7, 'moris', 'mumo', 'www.mumo.com', '7/7/2015', 30000, ['facebook', 'twitter']),
    ],
    schema=blogs_df.schema,
)


In [12]:
# Append the new row to the blogs. union() returns a new DataFrame, whereas
# show() only prints and returns None -- so the result is assigned first and
# displayed in a separate step, keeping newdf usable by later cells.
newdf = blogs_df.union(newrow)
newdf.show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| id|    first|   last|              url|published| hits|           campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
|  7|    moris|   mumo|     www.mumo.com| 7/7/2015|30000| [facebook, twitter]|
+---+---------+-------+-----------------+---------+-----+--------------------+



#IMPORTING LARGE DATA: Consider THREE options:
1. inferSchema = True, samplingRatio = 0.001
#Spark can infer the schema from a small sample of the data at a lower cost than scanning all of it
2. Programmatically define the schema yourself, keeping full control over how it is specified. It's cheaper and saves computing resources because Spark does not have to read the data to work out the types. PREFERRED FOR complex & custom data types; used in production-grade code
3. spark.createDataFrame() with a column list: USE THIS FOR SIMPLE DATA, e.g. initial exploration and prototyping

In [17]:
#let's practice option 2
from pyspark.sql.types import *

In [18]:
# Explicit schema for the fire-calls data, built one field at a time.
# Each add() call appends a StructField(name, type, nullable); every column is nullable.
fire_schema = (
    StructType()
    .add('CallNumber', IntegerType(), True)
    .add('UnitID', StringType(), True)
    .add('IncidentNumber', IntegerType(), True)
    .add('CallType', StringType(), True)
    .add('CallDate', StringType(), True)
    .add('WatchDate', StringType(), True)
    .add('CallFinalDisposition', StringType(), True)
    .add('AvailableDtTm', StringType(), True)
    .add('Address', StringType(), True)
    .add('City', StringType(), True)
    .add('Zipcode', IntegerType(), True)
    .add('Battalion', StringType(), True)
    .add('StationArea', StringType(), True)
    .add('Box', StringType(), True)
    .add('OriginalPriority', StringType(), True)
    .add('Priority', StringType(), True)
    .add('FinalPriority', IntegerType(), True)
    .add('ALSUnit', BooleanType(), True)
    .add('CallTypeGroup', StringType(), True)
    .add('NumAlarms', IntegerType(), True)
    .add('UnitType', StringType(), True)
    .add('UnitSequenceInCallDispatch', IntegerType(), True)
    .add('FirePreventionDistrict', StringType(), True)
    .add('SupervisorDistrict', StringType(), True)
    .add('Neighborhood', StringType(), True)
    .add('Location', StringType(), True)
    .add('RowID', StringType(), True)
    .add('Delay', FloatType(), True)
)

In [19]:
# Location of the fire-calls CSV that is loaded further down.
sf_fire_file = "/data/sf-fire-calls.csv"

In [22]:
# Load the CSV with the hand-written schema (no inference pass over the data);
# the first line of the file is treated as the header row.
fire_df = (
    spark.read
    .schema(fire_schema)
    .option('header', True)
    .csv(sf_fire_file)
)