In [1]:
#Spark Imports and Get Context

#https://spark.apache.org/docs/latest/sql-getting-started.html
from pyspark import SparkContext, SparkConf, SQLContext
from os import getcwd

conf = SparkConf().setAppName('SparkS1')
sc = SparkContext(conf=conf).getOrCreate()
spark = SQLContext.getOrCreate(sc)

In [3]:
#Import JSON 
movieJDF = spark.read.json('../datasets/movie/movies_en_1_0.txt')
movieJDF.show(5)

+--------------------+-------+--------------------+---------------+-------+--------------------+-------------+----+
|              actors|country|            director|          genre|     id|             summary|        title|year|
+--------------------+-------+--------------------+---------------+-------+--------------------+-------------+----+
|[{artist:15, John...|    USA|{Alfred, artist:3...|          Drama|movie:1|A retired San Fra...|      Vertigo|1958|
|[{artist:5, Ripley}]|    USA|{Ridley, artist:4...|Science-Fiction|movie:2|The commercial ve...|        Alien|1979|
|[{artist:109, Ros...|    USA|{James, artist:6,...|          Drama|movie:3|A seventeen-year-...|      Titanic|1997|
|                  []|     FR|{Andrei, artist:9...|          drama|movie:4|                null|The Sacrifice|1986|
|[{artist:11, Sean...|    USA|{John, artist:10,...|         Action|movie:5|To foil an extort...|     Face/Off|1997|
+--------------------+-------+--------------------+---------------+-----

In [5]:
from pyspark.sql.functions import explode,col

movieJDF1 = movieJDF.withColumn('actor',explode(col('actors'))).select('id','title','actor')
movieJDF2 = movieJDF1.select('actor.id','actor.role')
movieJDF1.show(5)
movieJDF2.show(5)

+-------+-------+--------------------+
|     id|  title|               actor|
+-------+-------+--------------------+
|movie:1|Vertigo|{artist:15, John ...|
|movie:1|Vertigo|{artist:16, Madel...|
|movie:2|  Alien|  {artist:5, Ripley}|
|movie:3|Titanic|{artist:109, Rose...|
|movie:3|Titanic|{artist:110, Jack...|
+-------+-------+--------------------+
only showing top 5 rows

+----------+-------------------+
|        id|               role|
+----------+-------------------+
| artist:15|      John Ferguson|
| artist:16|   Madeleine Elster|
|  artist:5|             Ripley|
|artist:109|Rose DeWitt Bukater|
|artist:110|        Jack Dawson|
+----------+-------------------+
only showing top 5 rows



In [6]:
#streaming employee csv file and joining with office table
from pyspark.sql.types import *

empschema = StructType([
  StructField('employeeNumber', IntegerType()),
  StructField('lastName', StringType()),
  StructField('firstName', StringType()),
  StructField('extention', StringType()),
  StructField('email', StringType()),
  StructField('officeCode', IntegerType()),
  StructField('reportsTo', IntegerType()),
  StructField('jobTitle', StringType())
]

)


empDataFrame1 = (spark.readStream.format('csv').option('header','true').schema(empschema).option('maxFilesPerTrigger',1).load('../datasets/stream/')
)

In [7]:
officeDF1 = spark.read.csv(
'../datasets/classicmodels/office.csv',
header=True,
inferSchema = True,
nullValue='null'
#maxFilesPerTrigger = 1
)

empOfficeDF = empDataFrame1.join(officeDF1,'officeCode','inner').select('employeeNumber','lastName','firstName','officeCode','city','state','country','postalCode')

In [8]:
empStream = (empOfficeDF.writeStream.format('csv')
            .outputMode('append')
            .option('checkpointLocation','../datasets/stream/check4')
            .start('../datasets/stream/out5')
           )

In [9]:
empStream.awaitTermination(5)
empStream.stop()

In [12]:
%ls ../datasets/stream/out5

part-00000-43921bfe-16a0-4d82-a3af-9cfdf4428e02-c000.csv
part-00000-8cd84e91-09ee-42af-93f4-4bc59e05c6b4-c000.csv
part-00000-94d0c93d-9d35-4173-ae1c-72e6504883e4-c000.csv
part-00000-cb520fa6-8c47-4c10-ad28-75acdfcf823e-c000.csv
part-00000-f533a849-2910-4dff-9804-f61d50d070b4-c000.csv
[0m[38;5;27m_spark_metadata[0m/


In [13]:
%cat ../datasets/stream/out5/part-00000-43921bfe-16a0-4d82-a3af-9cfdf4428e02-c000.csv

1002,Murphy,Diane,1,San Francisco,CA,USA,94080
1056,Patterson,Mary,1,San Francisco,CA,USA,94080
1076,Firrelli,Jeff,1,San Francisco,CA,USA,94080
1088,Patterson,William,6,Sydney,"",Australia,NSW 2010
1102,Bondur,Gerard,4,Paris,NULL,France,75017
1143,Bow,Anthony,1,San Francisco,CA,USA,94080
1165,Jennings,Leslie,1,San Francisco,CA,USA,94080
1166,Thompson,Leslie,1,San Francisco,CA,USA,94080
1188,Firrelli,Julie,2,Boston,MA,USA,2107
1216,Patterson,Steve,2,Boston,MA,USA,2107
1286,Tseng,Foon Yue,3,NYC,NY,USA,10022
1323,Vanauf,George,3,NYC,NY,USA,10022
1337,Bondur,Loui,4,Paris,NULL,France,75017
1370,Hernandez,Gerard,4,Paris,NULL,France,75017
1401,Castillo,Pamela,4,Paris,NULL,France,75017
1501,Bott,Larry,7,London,"",UK,EC2N 1HN
1504,Jones,Barry,7,London,"",UK,EC2N 1HN
1611,Fixter,Andy,6,Sydney,"",Australia,NSW 2010
1612,Marsh,Peter,6,Sydney,"",Australia,NSW 2010
1619,King,Tom,6,Sydney,"",Australia,NSW 2010
1621,Nishi,Mami,5,Tokyo,Chiyoda-Ku,Japan,102-8578
1625,Kato,Yoshimi,5,Tokyo,Chiyoda-Ku,Japa