In [2]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local") \
    .appName('jupyter-pyspark') \
        .config("hive.metastore.uris", 
                "thrift://hive-metastore:9083") \
        .enableHiveSupport() \
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR") # Keeps the noise down!!!

In [5]:
data = [(1, "Fido", "Dog","SPCA",1),(2, "Felix", "Cat", "SPCA",2),(3, "Rover", "Dog","SPCA",1)]
cols = ["id","name","type","shelter","years_at_shelter"]

In [6]:
spark.createDataFrame(data = data, schema = cols).show()

                                                                                

+---+-----+----+-------+----------------+
| id| name|type|shelter|years_at_shelter|
+---+-----+----+-------+----------------+
|  1| Fido| Dog|   SPCA|               1|
|  2|Felix| Cat|   SPCA|               2|
|  3|Rover| Dog|   SPCA|               1|
+---+-----+----+-------+----------------+



In [7]:
pets = spark.createDataFrame(data = data, schema = cols)

pets.show()

+---+-----+----+-------+----------------+
| id| name|type|shelter|years_at_shelter|
+---+-----+----+-------+----------------+
|  1| Fido| Dog|   SPCA|               1|
|  2|Felix| Cat|   SPCA|               2|
|  3|Rover| Dog|   SPCA|               1|
+---+-----+----+-------+----------------+



In [10]:
plan = pets.where("years_at_shelter=1").sort("name").select("name","type")
plan.explain()

== Physical Plan ==
*(2) Sort [name#32 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(name#32 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [id=#80]
   +- *(1) Project [name#32, type#33]
      +- *(1) Filter (isnotnull(years_at_shelter#35L) AND (years_at_shelter#35L = 1))
         +- *(1) Scan ExistingRDD[id#31L,name#32,type#33,shelter#34,years_at_shelter#35L]




In [12]:
plan2 = pets.sort("name").select("name","type").where("years_at_shelter=1")
plan2.explain()

== Physical Plan ==
*(2) Sort [name#32 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(name#32 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [id=#118]
   +- *(1) Project [name#32, type#33]
      +- *(1) Filter (isnotnull(years_at_shelter#35L) AND (years_at_shelter#35L = 1))
         +- *(1) Scan ExistingRDD[id#31L,name#32,type#33,shelter#34,years_at_shelter#35L]




In [13]:
spark.sql("""

create external table cdemo.grades (
    year int,
    semester string,
    course string,
    credit int,
    grade string
)
row format delimited
fields terminated by '\t'
location 'hdfs:///user/root/grades/*.tsv'

""")

DataFrame[]

In [14]:
spark.sql("select * from cdemo.grades").show()

+----+--------+------+------+-----+
|year|semester|course|credit|grade|
+----+--------+------+------+-----+
|2015|    Fall|IST101|     1|    A|
|2015|    Fall|IST195|     3|    A|
|2015|    Fall|IST233|     3|   B+|
|2015|    Fall|SOC101|     3|   A-|
|2015|    Fall|MAT221|     3|    C|
|2016|    Fall|IST346|     3|    A|
|2016|    Fall|CHE111|     4|   A-|
|2016|    Fall|PSY120|     3|   B+|
|2016|    Fall|IST256|     3|    A|
|2016|    Fall|ENG121|     3|   B+|
|2016|  Spring|GEO110|     3|   B+|
|2016|  Spring|MAT222|     3|    A|
|2016|  Spring|SOC121|     3|   C+|
|2016|  Spring|BIO240|     3|   B-|
|2017|  Spring|IST462|     3|    A|
|2017|  Spring|MAT411|     3|    C|
|2017|  Spring|SOC422|     3|   B-|
|2017|  Spring|ENV201|     3|   A-|
+----+--------+------+------+-----+



In [17]:
spark.sql("""

select sum(credit) as total_credits, year,semester
from cdemo.grades
group by year, semester
order by year, semester
""").explain()

== Physical Plan ==
*(3) Sort [year#153 ASC NULLS FIRST, semester#154 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(year#153 ASC NULLS FIRST, semester#154 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [id=#187]
   +- *(2) HashAggregate(keys=[year#153, semester#154], functions=[sum(cast(credit#156 as bigint))])
      +- Exchange hashpartitioning(year#153, semester#154, 200), ENSURE_REQUIREMENTS, [id=#183]
         +- *(1) HashAggregate(keys=[year#153, semester#154], functions=[partial_sum(cast(credit#156 as bigint))])
            +- Scan hive cdemo.grades [year#153, semester#154, credit#156], HiveTableRelation [`cdemo`.`grades`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, Data Cols: [year#153, semester#154, course#155, credit#156, grade#157], Partition Cols: []]




In [2]:
import pyspark
from pyspark.sql import SparkSession

bucket = "d-object-spark"

spark = SparkSession.builder \
    .master("local") \
    .appName('jupyter-pyspark') \
        .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
        .config("spark.jars.packages","org.apache.hadoop:hadoop-aws:3.1.2,org.apache.spark:spark-avro_2.12:3.1.2")\
        .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
        .config("spark.hadoop.fs.s3a.access.key", "minio") \
        .config("spark.hadoop.fs.s3a.secret.key", "SU2orange!") \
        .config("spark.hadoop.fs.s3a.fast.upload", True) \
        .config("spark.hadoop.fs.s3a.path.style.access", True) \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .enableHiveSupport() \
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR") # Keeps the noise down!!!

In [5]:
df = spark.read.csv("file:///home/jovyan/datasets/grades/fall2015.tsv")
df.show()

+--------------------+
|                 _c0|
+--------------------+
|2015	Fall	IST101	1	A|
|2015	Fall	IST195	3	A|
|2015	Fall	IST233	...|
|2015	Fall	SOC101	...|
|2015	Fall	MAT221	3	C|
+--------------------+



In [6]:
df = spark.read.csv("file:///home/jovyan/datasets/grades/")
df.show()

+--------------------+
|                 _c0|
+--------------------+
|2016	Fall	IST346	3	A|
|2016	Fall	CHE111	...|
|2016	Fall	PSY120	...|
|2016	Fall	IST256	3	A|
|2016	Fall	ENG121	...|
|2015	Fall	IST101	1	A|
|2015	Fall	IST195	3	A|
|2015	Fall	IST233	...|
|2015	Fall	SOC101	...|
|2015	Fall	MAT221	3	C|
|2016	Spring	GEO11...|
|2016	Spring	MAT22...|
|2016	Spring	SOC12...|
|2016	Spring	BIO24...|
|2017	Spring	IST46...|
|2017	Spring	MAT41...|
|2017	Spring	SOC42...|
|2017	Spring	ENV20...|
+--------------------+



In [8]:
df = spark.read.csv("s3a://unitd/grades/fall2015.tsv")
df.show()

+--------------------+
|                 _c0|
+--------------------+
|2015	Fall	IST101	1	A|
|2015	Fall	IST195	3	A|
|2015	Fall	IST233	...|
|2015	Fall	SOC101	...|
|2015	Fall	MAT221	3	C|
+--------------------+



In [9]:
df = spark.read.csv("s3a://unitd/grades/")
df.show()

+--------------------+
|                 _c0|
+--------------------+
|2016	Fall	IST346	3	A|
|2016	Fall	CHE111	...|
|2016	Fall	PSY120	...|
|2016	Fall	IST256	3	A|
|2016	Fall	ENG121	...|
|2015	Fall	IST101	1	A|
|2015	Fall	IST195	3	A|
|2015	Fall	IST233	...|
|2015	Fall	SOC101	...|
|2015	Fall	MAT221	3	C|
|2016	Spring	GEO11...|
|2016	Spring	MAT22...|
|2016	Spring	SOC12...|
|2016	Spring	BIO24...|
|2017	Spring	IST46...|
|2017	Spring	MAT41...|
|2017	Spring	SOC42...|
|2017	Spring	ENV20...|
+--------------------+



In [10]:
from pyspark import SparkFiles

spark.sparkContext.addFile("file:///home/jovyan/datasets/grades/fall2015.tsv")

print("Temporary Location: ", SparkFiles.get("fall2015.tsv"))

Temporary Location:  /tmp/spark-a2eaa4b7-cf7f-4c73-94c4-fc303b8d87ee/userFiles-bf6edabb-da6e-45fa-b9ed-df88e82dd375/fall2015.tsv


In [11]:
df = spark.read.csv(SparkFiles.get("fall2015.tsv"))
df.show()

+--------------------+
|                 _c0|
+--------------------+
|2015	Fall	IST101	1	A|
|2015	Fall	IST195	3	A|
|2015	Fall	IST233	...|
|2015	Fall	SOC101	...|
|2015	Fall	MAT221	3	C|
+--------------------+



In [17]:
df = spark.read.option("header",True).option("inferSchema",True) \
    .csv("file:///home/jovyan/datasets/stocks/stocks.csv")
df.printSchema()

root
 |-- price: double (nullable = true)
 |-- symbol: string (nullable = true)



In [23]:
df = spark.read.option("sep","\t").option("inferSchema",True).csv("file:///home/jovyan/datasets/grades/")
df.printSchema()


root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- _c4: string (nullable = true)



In [25]:
df = spark.read.parquet("file:///home/jovyan/datasets/stocks/stocks.parquet")
df.show()

+-------+------+
|  price|symbol|
+-------+------+
| 126.82|  AAPL|
|3098.12|  AMZN|
| 251.11|    FB|
|1725.05|  GOOG|
| 128.39|   IBM|
| 212.55|  MSFT|
|   78.0|   NET|
|  497.0|  NFLX|
|  823.8|  TSLA|
|  45.11|  TWTR|
+-------+------+



In [28]:
df = spark.read.option("multiline",True).json("/home/jovyan/datasets/json-samples/stocks.json")

In [30]:
df.printSchema()

root
 |-- price: double (nullable = true)
 |-- symbol: string (nullable = true)



In [31]:
grades = spark.read.option("sep","\t").option("inferSchema",True).csv("file:///home/jovyan/datasets/grades/")
grades.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- _c4: string (nullable = true)



In [32]:
grades.show(5)

+----+----+------+---+---+
| _c0| _c1|   _c2|_c3|_c4|
+----+----+------+---+---+
|2016|Fall|IST346|  3|  A|
|2016|Fall|CHE111|  4| A-|
|2016|Fall|PSY120|  3| B+|
|2016|Fall|IST256|  3|  A|
|2016|Fall|ENG121|  3| B+|
+----+----+------+---+---+
only showing top 5 rows



In [33]:
grades2 = grades.withColumnRenamed("_c0","year")
grades2.show()

+----+------+------+---+---+
|year|   _c1|   _c2|_c3|_c4|
+----+------+------+---+---+
|2016|  Fall|IST346|  3|  A|
|2016|  Fall|CHE111|  4| A-|
|2016|  Fall|PSY120|  3| B+|
|2016|  Fall|IST256|  3|  A|
|2016|  Fall|ENG121|  3| B+|
|2015|  Fall|IST101|  1|  A|
|2015|  Fall|IST195|  3|  A|
|2015|  Fall|IST233|  3| B+|
|2015|  Fall|SOC101|  3| A-|
|2015|  Fall|MAT221|  3|  C|
|2016|Spring|GEO110|  3| B+|
|2016|Spring|MAT222|  3|  A|
|2016|Spring|SOC121|  3| C+|
|2016|Spring|BIO240|  3| B-|
|2017|Spring|IST462|  3|  A|
|2017|Spring|MAT411|  3|  C|
|2017|Spring|SOC422|  3| B-|
|2017|Spring|ENV201|  3| A-|
+----+------+------+---+---+



In [35]:
grades3 = grades.toDF("year", "semester", "course","credits", "grade")
grades3.show(5)

+----+--------+------+-------+-----+
|year|semester|course|credits|grade|
+----+--------+------+-------+-----+
|2016|    Fall|IST346|      3|    A|
|2016|    Fall|CHE111|      4|   A-|
|2016|    Fall|PSY120|      3|   B+|
|2016|    Fall|IST256|      3|    A|
|2016|    Fall|ENG121|      3|   B+|
+----+--------+------+-------+-----+
only showing top 5 rows



In [36]:
grades = spark.read.option("sep","\t").option("inferSchema",True) \
    .csv("file:///home/jovyan/datasets/grades/") \
    .toDF("year", "semester", "course","credits", "grade")
grades.printSchema()

root
 |-- year: integer (nullable = true)
 |-- semester: string (nullable = true)
 |-- course: string (nullable = true)
 |-- credits: integer (nullable = true)
 |-- grade: string (nullable = true)



In [38]:
grades.show(5)

+----+--------+------+-------+-----+
|year|semester|course|credits|grade|
+----+--------+------+-------+-----+
|2016|    Fall|IST346|      3|    A|
|2016|    Fall|CHE111|      4|   A-|
|2016|    Fall|PSY120|      3|   B+|
|2016|    Fall|IST256|      3|    A|
|2016|    Fall|ENG121|      3|   B+|
+----+--------+------+-------+-----+
only showing top 5 rows



In [44]:
from pyspark.sql.functions import col 
grades4 = grades.withColumn("next_year", col("year") + 1)
grades4.show(7)

+----+--------+------+-------+-----+---------+
|year|semester|course|credits|grade|next_year|
+----+--------+------+-------+-----+---------+
|2016|    Fall|IST346|      3|    A|     2017|
|2016|    Fall|CHE111|      4|   A-|     2017|
|2016|    Fall|PSY120|      3|   B+|     2017|
|2016|    Fall|IST256|      3|    A|     2017|
|2016|    Fall|ENG121|      3|   B+|     2017|
|2015|    Fall|IST101|      1|    A|     2016|
|2015|    Fall|IST195|      3|    A|     2016|
+----+--------+------+-------+-----+---------+
only showing top 7 rows



In [46]:
credletter = grades.select("credits","grade")
credletter.show()

+-------+-----+
|credits|grade|
+-------+-----+
|      3|    A|
|      4|   A-|
|      3|   B+|
|      3|    A|
|      3|   B+|
|      1|    A|
|      3|    A|
|      3|   B+|
|      3|   A-|
|      3|    C|
|      3|   B+|
|      3|    A|
|      3|   C+|
|      3|   B-|
|      3|    A|
|      3|    C|
|      3|   B-|
|      3|   A-|
+-------+-----+



In [47]:
grades.printSchema()

root
 |-- year: integer (nullable = true)
 |-- semester: string (nullable = true)
 |-- course: string (nullable = true)
 |-- credits: integer (nullable = true)
 |-- grade: string (nullable = true)



In [52]:
grades.select("year","semester").distinct().show()

+----+--------+
|year|semester|
+----+--------+
|2016|    Fall|
|2017|  Spring|
|2015|    Fall|
|2016|  Spring|
+----+--------+



In [53]:
grades.where("year = 2016").show()

+----+--------+------+-------+-----+
|year|semester|course|credits|grade|
+----+--------+------+-------+-----+
|2016|    Fall|IST346|      3|    A|
|2016|    Fall|CHE111|      4|   A-|
|2016|    Fall|PSY120|      3|   B+|
|2016|    Fall|IST256|      3|    A|
|2016|    Fall|ENG121|      3|   B+|
|2016|  Spring|GEO110|      3|   B+|
|2016|  Spring|MAT222|      3|    A|
|2016|  Spring|SOC121|      3|   C+|
|2016|  Spring|BIO240|      3|   B-|
+----+--------+------+-------+-----+



In [54]:
grades.filter("grade = 'A'").show()

+----+--------+------+-------+-----+
|year|semester|course|credits|grade|
+----+--------+------+-------+-----+
|2016|    Fall|IST346|      3|    A|
|2016|    Fall|IST256|      3|    A|
|2015|    Fall|IST101|      1|    A|
|2015|    Fall|IST195|      3|    A|
|2016|  Spring|MAT222|      3|    A|
|2017|  Spring|IST462|      3|    A|
+----+--------+------+-------+-----+



In [55]:
grades.where( grades.grade == 'A' ).show()

+----+--------+------+-------+-----+
|year|semester|course|credits|grade|
+----+--------+------+-------+-----+
|2016|    Fall|IST346|      3|    A|
|2016|    Fall|IST256|      3|    A|
|2015|    Fall|IST101|      1|    A|
|2015|    Fall|IST195|      3|    A|
|2016|  Spring|MAT222|      3|    A|
|2017|  Spring|IST462|      3|    A|
+----+--------+------+-------+-----+



In [56]:
grades.where( grades["grade"] == 'A' ).show()

+----+--------+------+-------+-----+
|year|semester|course|credits|grade|
+----+--------+------+-------+-----+
|2016|    Fall|IST346|      3|    A|
|2016|    Fall|IST256|      3|    A|
|2015|    Fall|IST101|      1|    A|
|2015|    Fall|IST195|      3|    A|
|2016|  Spring|MAT222|      3|    A|
|2017|  Spring|IST462|      3|    A|
+----+--------+------+-------+-----+



In [57]:
grades.where("grade = 'A' or grade = 'A-'").show()

+----+--------+------+-------+-----+
|year|semester|course|credits|grade|
+----+--------+------+-------+-----+
|2016|    Fall|IST346|      3|    A|
|2016|    Fall|CHE111|      4|   A-|
|2016|    Fall|IST256|      3|    A|
|2015|    Fall|IST101|      1|    A|
|2015|    Fall|IST195|      3|    A|
|2015|    Fall|SOC101|      3|   A-|
|2016|  Spring|MAT222|      3|    A|
|2017|  Spring|IST462|      3|    A|
|2017|  Spring|ENV201|      3|   A-|
+----+--------+------+-------+-----+



In [62]:
grades.where( (grades.grade == 'A') | (grades.grade == "-A") ).show()

+----+--------+------+-------+-----+
|year|semester|course|credits|grade|
+----+--------+------+-------+-----+
|2016|    Fall|IST346|      3|    A|
|2016|    Fall|IST256|      3|    A|
|2015|    Fall|IST101|      1|    A|
|2015|    Fall|IST195|      3|    A|
|2016|  Spring|MAT222|      3|    A|
|2017|  Spring|IST462|      3|    A|
+----+--------+------+-------+-----+



In [65]:
grades.sort("grade").show()

+----+--------+------+-------+-----+
|year|semester|course|credits|grade|
+----+--------+------+-------+-----+
|2016|  Spring|MAT222|      3|    A|
|2017|  Spring|IST462|      3|    A|
|2016|    Fall|IST346|      3|    A|
|2015|    Fall|IST101|      1|    A|
|2015|    Fall|IST195|      3|    A|
|2016|    Fall|IST256|      3|    A|
|2016|    Fall|CHE111|      4|   A-|
|2015|    Fall|SOC101|      3|   A-|
|2017|  Spring|ENV201|      3|   A-|
|2016|  Spring|GEO110|      3|   B+|
|2016|    Fall|PSY120|      3|   B+|
|2016|    Fall|ENG121|      3|   B+|
|2015|    Fall|IST233|      3|   B+|
|2017|  Spring|SOC422|      3|   B-|
|2016|  Spring|BIO240|      3|   B-|
|2015|    Fall|MAT221|      3|    C|
|2017|  Spring|MAT411|      3|    C|
|2016|  Spring|SOC121|      3|   C+|
+----+--------+------+-------+-----+



In [66]:
grades.sort(grades.grade.desc() ).show()

+----+--------+------+-------+-----+
|year|semester|course|credits|grade|
+----+--------+------+-------+-----+
|2016|  Spring|SOC121|      3|   C+|
|2015|    Fall|MAT221|      3|    C|
|2017|  Spring|MAT411|      3|    C|
|2016|  Spring|BIO240|      3|   B-|
|2017|  Spring|SOC422|      3|   B-|
|2016|    Fall|PSY120|      3|   B+|
|2015|    Fall|IST233|      3|   B+|
|2016|    Fall|ENG121|      3|   B+|
|2016|  Spring|GEO110|      3|   B+|
|2017|  Spring|ENV201|      3|   A-|
|2015|    Fall|SOC101|      3|   A-|
|2016|    Fall|CHE111|      4|   A-|
|2016|    Fall|IST346|      3|    A|
|2015|    Fall|IST195|      3|    A|
|2016|    Fall|IST256|      3|    A|
|2016|  Spring|MAT222|      3|    A|
|2015|    Fall|IST101|      1|    A|
|2017|  Spring|IST462|      3|    A|
+----+--------+------+-------+-----+



In [70]:
# semesters in which our grade was a "C"
x = grades.filter( (grades.grade == "C") | (grades.grade == "C+") | (grades.grade == "C-")) \
    .select("year","semester","course","grade") \
    .sort("course")

In [75]:
 y = x.toPandas()

In [76]:
y

Unnamed: 0,year,semester,course,grade
0,2015,Fall,MAT221,C
1,2017,Spring,MAT411,C
2,2016,Spring,SOC121,C+


In [84]:
from pyspark.sql.functions import sum,min,max,count,avg
grades.groupBy("year").agg( 
        count("year").alias("class_taken") 
).show()

+----+-----------+
|year|class_taken|
+----+-----------+
|2015|          5|
|2016|          9|
|2017|          4|
+----+-----------+



In [86]:
gs = grades.groupBy("year",grades.semester).agg(
    count("*").alias("classes_taken"),
    sum(grades.credits).alias("total_credits")
)


In [91]:
gs.show()

+----+--------+-------------+-------------+
|year|semester|classes_taken|total_credits|
+----+--------+-------------+-------------+
|2016|    Fall|            5|           16|
|2017|  Spring|            4|           12|
|2015|    Fall|            5|           13|
|2016|  Spring|            4|           12|
+----+--------+-------------+-------------+



In [95]:
gradepoints = spark.read.option("inferSchema",True) \
    .csv("file:///home/jovyan/datasets/courses/grade-points.csv") \
    .toDF("letter_grade","points")
gradepoints.show()

+------------+------+
|letter_grade|points|
+------------+------+
|           A|   4.0|
|          A-| 3.666|
|          B+| 3.333|
|           B|   3.0|
|          B-| 2.666|
|          C+| 2.333|
|           C|   2.0|
|          C-| 1.666|
|           D|   1.0|
|           F|   0.0|
+------------+------+



In [98]:
courses = spark.read.option("inferSchema",True) \
    .csv("file:///home/jovyan/datasets/courses/courses.csv") \
    .toDF("course_code","course_title")
courses.show()

+-----------+------------------+
|course_code|      course_title|
+-----------+------------------+
|     BIO240|        Biology II|
|     CHE111|        Chemstry I|
|     ENG121|    English Lit. I|
|     ENV201|       Env Science|
|     GEO110|         Geology I|
|     IST101|    Freshmen Forum|
|     IST195| Info Technologies|
|     IST233| Cloud Computing I|
|     IST256|     Programming I|
|     IST346|Cloud Computing II|
|     IST462|    Programming II|
|     IST344|    Info Reporting|
|     MAT221|      Statistics I|
|     MAT222|     Statistics II|
|     PSY120|      Psychology I|
|     SOC101|       Sociology I|
|     SOC121|      Sociology II|
+-----------+------------------+



In [104]:
c = grades.join(courses, courses.course_code == grades.course, "inner")

In [105]:
c.show()

+----+--------+------+-------+-----+-----------+------------------+
|year|semester|course|credits|grade|course_code|      course_title|
+----+--------+------+-------+-----+-----------+------------------+
|2016|    Fall|IST346|      3|    A|     IST346|Cloud Computing II|
|2016|    Fall|CHE111|      4|   A-|     CHE111|        Chemstry I|
|2016|    Fall|PSY120|      3|   B+|     PSY120|      Psychology I|
|2016|    Fall|IST256|      3|    A|     IST256|     Programming I|
|2016|    Fall|ENG121|      3|   B+|     ENG121|    English Lit. I|
|2015|    Fall|IST101|      1|    A|     IST101|    Freshmen Forum|
|2015|    Fall|IST195|      3|    A|     IST195| Info Technologies|
|2015|    Fall|IST233|      3|   B+|     IST233| Cloud Computing I|
|2015|    Fall|SOC101|      3|   A-|     SOC101|       Sociology I|
|2015|    Fall|MAT221|      3|    C|     MAT221|      Statistics I|
|2016|  Spring|GEO110|      3|   B+|     GEO110|         Geology I|
|2016|  Spring|MAT222|      3|    A|     MAT222|

In [108]:
c = grades.join(courses, courses.course_code == grades.course, "full")

In [109]:
c.show()

+----+--------+------+-------+-----+-----------+------------------+
|year|semester|course|credits|grade|course_code|      course_title|
+----+--------+------+-------+-----+-----------+------------------+
|2015|    Fall|SOC101|      3|   A-|     SOC101|       Sociology I|
|2016|  Spring|BIO240|      3|   B-|     BIO240|        Biology II|
|2017|  Spring|IST462|      3|    A|     IST462|    Programming II|
|2015|    Fall|IST101|      1|    A|     IST101|    Freshmen Forum|
|2016|  Spring|MAT222|      3|    A|     MAT222|     Statistics II|
|2017|  Spring|ENV201|      3|   A-|     ENV201|       Env Science|
|2015|    Fall|MAT221|      3|    C|     MAT221|      Statistics I|
|null|    null|  null|   null| null|     IST344|    Info Reporting|
|2016|  Spring|SOC121|      3|   C+|     SOC121|      Sociology II|
|2016|    Fall|CHE111|      4|   A-|     CHE111|        Chemstry I|
|2016|    Fall|IST346|      3|    A|     IST346|Cloud Computing II|
|2016|    Fall|PSY120|      3|   B+|     PSY120|

In [112]:
fallgrades = grades.select("year","semester","course").where("semester = 'Fall'")
springgrades = grades.select("year","semester","course").where("semester = 'Spring'")


In [113]:
cgrades = fallgrades.union(springgrades)
cgrades.show()

+----+--------+------+
|year|semester|course|
+----+--------+------+
|2016|    Fall|IST346|
|2016|    Fall|CHE111|
|2016|    Fall|PSY120|
|2016|    Fall|IST256|
|2016|    Fall|ENG121|
|2015|    Fall|IST101|
|2015|    Fall|IST195|
|2015|    Fall|IST233|
|2015|    Fall|SOC101|
|2015|    Fall|MAT221|
|2016|  Spring|GEO110|
|2016|  Spring|MAT222|
|2016|  Spring|SOC121|
|2016|  Spring|BIO240|
|2017|  Spring|IST462|
|2017|  Spring|MAT411|
|2017|  Spring|SOC422|
|2017|  Spring|ENV201|
+----+--------+------+



In [114]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *


In [119]:
@udf(returnType=BooleanType())
def inMajor(course):
    return course.startswith("IST")


In [121]:
w = grades.select("course", inMajor(grades.course).alias("course_in_major") )

In [122]:
w.printSchema()

root
 |-- course: string (nullable = true)
 |-- course_in_major: boolean (nullable = true)



In [123]:
w.show()

+------+---------------+
|course|course_in_major|
+------+---------------+
|IST346|           true|
|CHE111|          false|
|PSY120|          false|
|IST256|           true|
|ENG121|          false|
|IST101|           true|
|IST195|           true|
|IST233|           true|
|SOC101|          false|
|MAT221|          false|
|GEO110|          false|
|MAT222|          false|
|SOC121|          false|
|BIO240|          false|
|IST462|           true|
|MAT411|          false|
|SOC422|          false|
|ENV201|          false|
+------+---------------+



In [126]:
grades.select("course", inMajor(grades.course).alias("course_in_major") ) \
    .where(col("course_in_major") == True) \
    .show()


+------+---------------+
|course|course_in_major|
+------+---------------+
|IST346|           true|
|IST256|           true|
|IST101|           true|
|IST195|           true|
|IST233|           true|
|IST462|           true|
+------+---------------+



In [129]:
from pyspark.sql.functions import explode
places = spark.read.json("file:///home/jovyan/datasets/json-samples/google-places.json", multiLine=True)

In [132]:
places.printSchema()

root
 |-- business_status: string (nullable = true)
 |-- geometry: struct (nullable = true)
 |    |-- location: struct (nullable = true)
 |    |    |-- lat: double (nullable = true)
 |    |    |-- lng: double (nullable = true)
 |    |-- viewport: struct (nullable = true)
 |    |    |-- northeast: struct (nullable = true)
 |    |    |    |-- lat: double (nullable = true)
 |    |    |    |-- lng: double (nullable = true)
 |    |    |-- southwest: struct (nullable = true)
 |    |    |    |-- lat: double (nullable = true)
 |    |    |    |-- lng: double (nullable = true)
 |-- icon: string (nullable = true)
 |-- icon_background_color: string (nullable = true)
 |-- icon_mask_base_uri: string (nullable = true)
 |-- name: string (nullable = true)
 |-- opening_hours: struct (nullable = true)
 |    |-- open_now: boolean (nullable = true)
 |-- photos: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- height: long (nullable = true)
 |    |    |-- html_attributi

In [138]:
places.select("name", "geometry.location.lat", places.geometry.location.lng.alias("lng") ).show()

+--------------------+-----------------+------------------+
|                name|              lat|               lng|
+--------------------+-----------------+------------------+
|            Syracuse|       43.0481221|-76.14742439999999|
|Crowne Plaza Syra...|       43.0476078|       -76.1417642|
|  The Parkview Hotel|       43.0476157|        -76.140986|
|Jefferson Clinton...|       43.0472894|-76.15385049999999|
|Courtyard by Marr...|       43.0488846|       -76.1561175|
|Quality Inn & Sui...|43.05264399999999|-76.14681999999999|
| Syracuse University|       43.0391534|       -76.1351158|
|Collegian Hotel &...|       43.0464172|-76.13539879999999|
|  Dinosaur Bar-B-Que|       43.0526411|-76.15469379999999|
|Hotel Skyler Syra...|43.04396249999999|-76.13607999999999|
|Sheraton Syracuse...|43.04123120000001|       -76.1338203|
|Syracuse Crunch H...|       43.0446639|       -76.1481366|
| Mulroy Civic Center|       43.0457297|       -76.1483396|
|     Crouse Hospital|       43.0414634|

In [144]:
places.select("name", "geometry.location.lat", "geometry.location.lat", explode("types").alias("type") ).show()

+--------------------+---------------------+---------------------+-----------------+
|                name|geometry.location.lat|geometry.location.lat|             type|
+--------------------+---------------------+---------------------+-----------------+
|            Syracuse|           43.0481221|           43.0481221|         locality|
|            Syracuse|           43.0481221|           43.0481221|        political|
|Crowne Plaza Syra...|           43.0476078|           43.0476078|          lodging|
|Crowne Plaza Syra...|           43.0476078|           43.0476078|point_of_interest|
|Crowne Plaza Syra...|           43.0476078|           43.0476078|    establishment|
|  The Parkview Hotel|           43.0476157|           43.0476157|          lodging|
|  The Parkview Hotel|           43.0476157|           43.0476157|point_of_interest|
|  The Parkview Hotel|           43.0476157|           43.0476157|    establishment|
|Jefferson Clinton...|           43.0472894|           43.0472894

In [146]:
places.select("name", "geometry.location.lat", "geometry.location.lat", explode("types").alias("type") ) \
    .where(col("type") == 'establishment') \
    .show()

+--------------------+---------------------+---------------------+-------------+
|                name|geometry.location.lat|geometry.location.lat|         type|
+--------------------+---------------------+---------------------+-------------+
|Crowne Plaza Syra...|           43.0476078|           43.0476078|establishment|
|  The Parkview Hotel|           43.0476157|           43.0476157|establishment|
|Jefferson Clinton...|           43.0472894|           43.0472894|establishment|
|Courtyard by Marr...|           43.0488846|           43.0488846|establishment|
|Quality Inn & Sui...|    43.05264399999999|    43.05264399999999|establishment|
| Syracuse University|           43.0391534|           43.0391534|establishment|
|Collegian Hotel &...|           43.0464172|           43.0464172|establishment|
|  Dinosaur Bar-B-Que|           43.0526411|           43.0526411|establishment|
|Hotel Skyler Syra...|    43.04396249999999|    43.04396249999999|establishment|
|Sheraton Syracuse...|    43

In [147]:
grades = spark.read.option("header",False).option("inferSchema", True).option("sep", "\t")\
    .csv("file:///home/jovyan/datasets/grades/*.tsv")\
    .toDF("Year", "Semester", "Course", "Credits", "Grade")

termcredits = grades.groupBy("Year", "Semester").agg( \
    count("*").alias("CourseCount"), 
    sum("Credits").alias("TotalCredits") \
    ).sort("Year",col("Semester").desc())

final = termcredits.filter("Year=2016")

final.toPandas()

                                                                                

Unnamed: 0,Year,Semester,CourseCount,TotalCredits
0,2016,Spring,4,12
1,2016,Fall,5,16


In [148]:
final.explain()

== Physical Plan ==
*(3) Sort [Year#2098 ASC NULLS FIRST, Semester#2099 DESC NULLS LAST], true, 0
+- Exchange rangepartitioning(Year#2098 ASC NULLS FIRST, Semester#2099 DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [id=#1493]
   +- *(2) HashAggregate(keys=[Year#2098, Semester#2099], functions=[count(1), sum(cast(Credits#2101 as bigint))])
      +- Exchange hashpartitioning(Year#2098, Semester#2099, 200), ENSURE_REQUIREMENTS, [id=#1489]
         +- *(1) HashAggregate(keys=[Year#2098, Semester#2099], functions=[partial_count(1), partial_sum(cast(Credits#2101 as bigint))])
            +- *(1) Project [_c0#2088 AS Year#2098, _c1#2089 AS Semester#2099, _c3#2091 AS Credits#2101]
               +- *(1) Filter (isnotnull(_c0#2088) AND (_c0#2088 = 2016))
                  +- FileScan csv [_c0#2088,_c1#2089,_c3#2091] Batched: false, DataFilters: [isnotnull(_c0#2088), (_c0#2088 = 2016)], Format: CSV, Location: InMemoryFileIndex[file:/home/jovyan/datasets/grades/fall2015.tsv, file:/home/jovyan/datas

In [149]:
b = grades.sort("Course") \
    .filter(grades.Semester == "Fall")\
    .select("Course", grades.Credits, grades["Grade"])\
    .filter("year = 2016")

b.explain()

== Physical Plan ==
*(2) Sort [Course#2100 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(Course#2100 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [id=#1514]
   +- *(1) Project [_c2#2090 AS Course#2100, _c3#2091 AS Credits#2101, _c4#2092 AS Grade#2102]
      +- *(1) Filter (((isnotnull(_c1#2089) AND isnotnull(_c0#2088)) AND (_c1#2089 = Fall)) AND (_c0#2088 = 2016))
         +- FileScan csv [_c0#2088,_c1#2089,_c2#2090,_c3#2091,_c4#2092] Batched: false, DataFilters: [isnotnull(_c1#2089), isnotnull(_c0#2088), (_c1#2089 = Fall), (_c0#2088 = 2016)], Format: CSV, Location: InMemoryFileIndex[file:/home/jovyan/datasets/grades/fall2015.tsv, file:/home/jovyan/datasets/grad..., PartitionFilters: [], PushedFilters: [IsNotNull(_c1), IsNotNull(_c0), EqualTo(_c1,Fall), EqualTo(_c0,2016)], ReadSchema: struct<_c0:int,_c1:string,_c2:string,_c3:int,_c4:string>




In [150]:
a = grades.filter("year = 2016")\
    .filter(grades.Semester == "Fall")\
    .sort("Course") \
    .select("Course", grades.Credits, grades["Grade"])
b.explain()

== Physical Plan ==
*(2) Sort [Course#2100 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(Course#2100 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [id=#1514]
   +- *(1) Project [_c2#2090 AS Course#2100, _c3#2091 AS Credits#2101, _c4#2092 AS Grade#2102]
      +- *(1) Filter (((isnotnull(_c1#2089) AND isnotnull(_c0#2088)) AND (_c1#2089 = Fall)) AND (_c0#2088 = 2016))
         +- FileScan csv [_c0#2088,_c1#2089,_c2#2090,_c3#2091,_c4#2092] Batched: false, DataFilters: [isnotnull(_c1#2089), isnotnull(_c0#2088), (_c1#2089 = Fall), (_c0#2088 = 2016)], Format: CSV, Location: InMemoryFileIndex[file:/home/jovyan/datasets/grades/fall2015.tsv, file:/home/jovyan/datasets/grad..., PartitionFilters: [], PushedFilters: [IsNotNull(_c1), IsNotNull(_c0), EqualTo(_c1,Fall), EqualTo(_c0,2016)], ReadSchema: struct<_c0:int,_c1:string,_c2:string,_c3:int,_c4:string>




In [151]:
gp = spark.read.json("file:///home/jovyan/datasets/json-samples/google-places.json")
c = spark.read.csv("file:///home/jovyan/datasets/customers/customers.csv", inferSchema=True, header=True)
s = spark.read.csv("file:///home/jovyan/datasets/customers/surveys.csv",inferSchema=True, header=True)
g = spark.read.csv("file:///home/jovyan/datasets/grades/*.tsv",inferSchema=False, header=False, sep="\t")


In [153]:
c.createOrReplaceTempView("customers")
s.createOrReplaceTempView("surveys")

In [156]:
spark.sql("select * from customers").show(1)

+-----+------+------------------+------+---------------+--------+-----+------------+---------------+---------------+
|First|  Last|             Email|Gender|Last IP Address|    City|State|Total Orders|Total Purchased|Months Customer|
+-----+------+------------------+------+---------------+--------+-----+------------+---------------+---------------+
|   Al|Fresco|afresco@dayrep.com|     M|  74.111.18.161|Syracuse|   NY|           1|             45|              1|
+-----+------+------------------+------+---------------+--------+-----+------------+---------------+---------------+
only showing top 1 row



In [157]:
c.show(1)

+-----+------+------------------+------+---------------+--------+-----+------------+---------------+---------------+
|First|  Last|             Email|Gender|Last IP Address|    City|State|Total Orders|Total Purchased|Months Customer|
+-----+------+------------------+------+---------------+--------+-----+------------+---------------+---------------+
|   Al|Fresco|afresco@dayrep.com|     M|  74.111.18.161|Syracuse|   NY|           1|             45|              1|
+-----+------+------------------+------+---------------+--------+-----+------------+---------------+---------------+
only showing top 1 row



In [163]:
query = '''
select c.Email, c.Gender, c.State, c.`Months Customer`, s.`Own Home`, s.`Household Income`
from customers c left join surveys s on 
        c.Email = s.Email
    where c.State = 'NY'
    and c.`Months Customer` > 5
    and s.`Own Home` is not null
'''
nybigwigs = spark.sql(query)
nybigwigs.show()

+--------------------+------+-----+---------------+--------+--------------------+
|               Email|Gender|State|Months Customer|Own Home|    Household Income|
+--------------------+------+-----+---------------+--------+--------------------+
|etasomthin@superr...|     M|   NY|             28|      No|               39000|
|   jpoole@dayrep.com|     F|   NY|             12|     Yes|Prefer not to Answer|
| ojouglad@einrot.com|     M|   NY|             36|      No|               65000|
| rovlight@dayrep.com|     M|   NY|             42|      No|               28000|
| sladd@superrito.com|     M|   NY|             10|     Yes|               52000|
+--------------------+------+-----+---------------+--------+--------------------+



In [165]:
nybigwigs.createOrReplaceTempView("v_nybigwigs")

In [168]:
mailinglist = spark.sql("select Email, Gender, `Own Home` from v_nybigwigs where `Own Home` = 'Yes'")

In [169]:
mailinglist.explain()

== Physical Plan ==
*(2) Project [Email#2193, Gender#2194, Own Home#2231]
+- *(2) BroadcastHashJoin [Email#2193], [Email#2227], Inner, BuildLeft, false
   :- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true]),false), [id=#1920]
   :  +- *(1) Project [Email#2193, Gender#2194]
   :     +- *(1) Filter ((((isnotnull(State#2197) AND isnotnull(Months Customer#2200)) AND (State#2197 = NY)) AND (Months Customer#2200 > 5)) AND isnotnull(Email#2193))
   :        +- FileScan csv [Email#2193,Gender#2194,State#2197,Months Customer#2200] Batched: false, DataFilters: [isnotnull(State#2197), isnotnull(Months Customer#2200), (State#2197 = NY), (Months Customer#2200..., Format: CSV, Location: InMemoryFileIndex[file:/home/jovyan/datasets/customers/customers.csv], PartitionFilters: [], PushedFilters: [IsNotNull(State), IsNotNull(Months Customer), EqualTo(State,NY), GreaterThan(Months Customer,5),..., ReadSchema: struct<Email:string,Gender:string,State:string,Months Customer:int>
  

In [171]:
from pyspark.sql.functions import udf
from pyspark.sql.types import * 

@udf(returnType=StringType()) 
def upperCase(str):
    return str.upper()

# This is different!!!!!
spark.udf.register("upper", upperCase)

spark.sql("select Email, upper(Email), Gender from v_nybigwigs").show()

+--------------------+--------------------+------+
|               Email|        upper(Email)|Gender|
+--------------------+--------------------+------+
|etasomthin@superr...|ETASOMTHIN@SUPERR...|     M|
|   jpoole@dayrep.com|   JPOOLE@DAYREP.COM|     F|
| ojouglad@einrot.com| OJOUGLAD@EINROT.COM|     M|
| rovlight@dayrep.com| ROVLIGHT@DAYREP.COM|     M|
| sladd@superrito.com| SLADD@SUPERRITO.COM|     M|
+--------------------+--------------------+------+



In [2]:
import pyspark
from pyspark.sql import SparkSession
mongo_uri = "mongodb://admin:mongopw@mongo:27017/admin?authSource=admin"

spark = SparkSession \
    .builder \
    .master("local") \
    .appName('jupyter-pyspark') \
      .config("spark.mongodb.input.uri", mongo_uri) \
      .config("spark.mongodb.output.uri", mongo_uri) \
      .config("spark.jars.packages","org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")\
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [4]:
e = spark.read.format("mongo").option("database","demo").option("collection","europe").load()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- alpha2Code: string (nullable = true)
 |-- alpha3Code: string (nullable = true)
 |-- altSpellings: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- area: double (nullable = true)
 |-- borders: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- callingCodes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- capital: string (nullable = true)
 |-- currencies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- demonym: string (nullable = true)
 |-- gini: double (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- latlng: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- name: string (nullable = true)
 |-- nativeName: string (nullable = true)
 |-- numericCode: string (nullable = true)
 |-- population: long (nullable = t

In [9]:
from pyspark.sql.functions import explode, col
mongoq = e.select("alpha3Code", "name","subregion", "population", 
                  explode(col("borders")).alias("borderAlpha3Code")) \
    .filter("subregion = 'Northern Europe'")

In [11]:
mongoq.explain()

== Physical Plan ==
*(2) Project [alpha3Code#2, name#13, subregion#19, population#16L, borderAlpha3Code#170]
+- Generate explode(borders#5), [alpha3Code#2, name#13, population#16L, subregion#19], false, [borderAlpha3Code#170]
   +- *(1) Filter ((((size(borders#5, true) > 0) AND isnotnull(subregion#19)) AND (subregion#19 = Northern Europe)) AND isnotnull(borders#5))
      +- *(1) Scan MongoRelation(MongoRDD[0] at RDD at MongoRDD.scala:51,Some(StructType(StructField(_id,StructType(StructField(oid,StringType,true)),true), StructField(alpha2Code,StringType,true), StructField(alpha3Code,StringType,true), StructField(altSpellings,ArrayType(StringType,true),true), StructField(area,DoubleType,true), StructField(borders,ArrayType(StringType,true),true), StructField(callingCodes,ArrayType(StringType,true),true), StructField(capital,StringType,true), StructField(currencies,ArrayType(StringType,true),true), StructField(demonym,StringType,true), StructField(gini,DoubleType,true), StructField(langua

In [12]:
s.show()

NameError: name 's' is not defined

In [13]:
stocks = spark.read.option("multiline","true").json("file:///home/jovyan/datasets/json-samples/stocks.json")


In [14]:
stocks.show()

+-------+------+
|  price|symbol|
+-------+------+
| 126.82|  AAPL|
|3098.12|  AMZN|
| 251.11|    FB|
|1725.05|  GOOG|
| 128.39|   IBM|
| 212.55|  MSFT|
|   78.0|   NET|
|  497.0|  NFLX|
|  823.8|  TSLA|
|  45.11|  TWTR|
+-------+------+



In [15]:
stocks.write.format("mongo") \
    .mode("overwrite").option("database","fdoc")\
    .option("collection","stocks1").save()

In [17]:
spark.read.format("mongo") \
    .option("database","fdoc")\
    .option("collection","stocks1").load().show()

+--------------------+-------+------+
|                 _id|  price|symbol|
+--------------------+-------+------+
|{6245c34eb2942b6c...| 126.82|  AAPL|
|{6245c34eb2942b6c...|3098.12|  AMZN|
|{6245c34eb2942b6c...| 251.11|    FB|
|{6245c34eb2942b6c...|1725.05|  GOOG|
|{6245c34eb2942b6c...| 128.39|   IBM|
|{6245c34eb2942b6c...| 212.55|  MSFT|
|{6245c34eb2942b6c...|   78.0|   NET|
|{6245c34eb2942b6c...|  497.0|  NFLX|
|{6245c34eb2942b6c...|  823.8|  TSLA|
|{6245c34eb2942b6c...|  45.11|  TWTR|
+--------------------+-------+------+



In [19]:
stocks2 = stocks.withColumn("_id", stocks.symbol)
stocks2.show()

+-------+------+----+
|  price|symbol| _id|
+-------+------+----+
| 126.82|  AAPL|AAPL|
|3098.12|  AMZN|AMZN|
| 251.11|    FB|  FB|
|1725.05|  GOOG|GOOG|
| 128.39|   IBM| IBM|
| 212.55|  MSFT|MSFT|
|   78.0|   NET| NET|
|  497.0|  NFLX|NFLX|
|  823.8|  TSLA|TSLA|
|  45.11|  TWTR|TWTR|
+-------+------+----+



In [20]:
stocks2.write.format("mongo") \
    .mode("overwrite").option("database","fdoc")\
    .option("collection","stocks2").save()

In [21]:
spark.read.format("mongo") \
    .option("database","fdoc")\
    .option("collection","stocks2").load().show()

+----+-------+------+
| _id|  price|symbol|
+----+-------+------+
|AAPL| 126.82|  AAPL|
|AMZN|3098.12|  AMZN|
|  FB| 251.11|    FB|
|GOOG|1725.05|  GOOG|
| IBM| 128.39|   IBM|
|MSFT| 212.55|  MSFT|
| NET|   78.0|   NET|
|NFLX|  497.0|  NFLX|
|TSLA|  823.8|  TSLA|
|TWTR|  45.11|  TWTR|
+----+-------+------+



In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# CASSANDRA CONFIGURATION
cassandra_host = "cassandra"
spark = SparkSession \
    .builder \
    .master("local") \
    .appName('jupyter-pyspark') \
      .config("spark.cassandra.connection.host", cassandra_host) \
      .config("spark.jars.packages","com.datastax.spark:spark-cassandra-connector-assembly_2.12:3.1.0")\
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [3]:
# WE NEED A TABLE BEFORE WE CAN WRITE, Using Plain old Python
!pip install -q cassandra-driver
from cassandra.cluster import Cluster
with Cluster([cassandra_host]) as cluster:
    session = cluster.connect()
    session.execute("CREATE KEYSPACE IF NOT EXISTS gdemo WITH replication={ 'class': 'SimpleStrategy', 'replication_factor' : 1 };")
    table = '''
    CREATE TABLE IF NOT EXISTS gdemo.fudgemart_order_details (
        customer_id int,
        customer_email text,
        customer_name text,
        customer_address text,
        customer_city text,
        customer_state text,
        customer_zip text,
        order_id int,
        order_date date,
        creditcard_number text,
        creditcard_exp_date text, 
        order_total decimal ,
        ship_via text,
        shipped_date date,
        product_id int,
        order_item_id int,
        order_qty int,
        product_name text,
        product_retail_price decimal,
    primary key ((customer_id, order_id), order_item_id) 
    );
    '''
    session.execute(table)

# NOTE: CSV File format does not understand dates, but Cassandra does, so we must cast the string columns to date before loading into the table
od = spark.read.option("inferSchema",True).option("header",True).csv("file:///home/jovyan/datasets/fudgemart/fudgemart-order-details.csv")\
    .withColumn("order_date", col("order_date").cast("date")).withColumn("shipped_date", col("shipped_date").cast("date")) 
    
od.write.format("org.apache.spark.sql.cassandra")\
  .mode("Append")\
  .option("table", "fudgemart_order_details")\
  .option("keyspace","gdemo")\
  .save()

                                                                                

In [4]:
fo =spark.read.format("org.apache.spark.sql.cassandra")\
    .options(table="fudgemart_order_details", keyspace="gdemo") \
    .load()

In [8]:
fo.select("customer_id","order_id","order_item_id").where("customer_id=13 and order_id=1843").explain()

== Physical Plan ==
*(1) Project [customer_id#151, order_id#152, order_item_id#153]
+- BatchScan[customer_id#151, order_id#152, order_item_id#153] Cassandra Scan: gdemo.fudgemart_order_details
 - Cassandra Filters: [["customer_id" = ?, 13],["order_id" = ?, 1843]]
 - Requested Columns: [customer_id,order_id,order_item_id]




In [10]:
fo.select("customer_id","order_id","order_item_id", "ship_via").where("ship_via='Postal Service'").explain()

== Physical Plan ==
*(1) Filter (ship_via#168 = Postal Service)
+- BatchScan[customer_id#151, order_id#152, order_item_id#153, ship_via#168] Cassandra Scan: gdemo.fudgemart_order_details
 - Cassandra Filters: []
 - Requested Columns: [customer_id,order_id,order_item_id,ship_via]




In [2]:
import pyspark
from pyspark.sql import SparkSession
# REDIS CONFIGURATION
redis_host = "redis"
redis_port = "6379"
spark = SparkSession \
    .builder \
    .master("local") \
    .appName('jupyter-pyspark') \
      .config("spark.redis.host", redis_host)\
      .config("spark.redis.port", redis_port)\
      .config("spark.jars.packages","com.redislabs:spark-redis_2.12:3.0.0")\
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [3]:
df = spark.read.option("multiline","true").json("/home/jovyan/datasets/json-samples/stocks.json")
df.toPandas()

Unnamed: 0,price,symbol
0,126.82,AAPL
1,3098.12,AMZN
2,251.11,FB
3,1725.05,GOOG
4,128.39,IBM
5,212.55,MSFT
6,78.0,NET
7,497.0,NFLX
8,823.8,TSLA
9,45.11,TWTR


In [5]:
df.write.format("org.apache.spark.sql.redis")\
    .mode("overwrite") \
    .option("table", "stocks") \
    .option("key.column","symbol").save()
    



In [7]:
posts = spark.read.format("org.apache.spark.sql.redis") \
    .option("keys.pattern","post:*") \
    .option("key.column","post_id") \
    .option("infer.schema", True) \
    .load()


In [11]:
users = spark.read.format("org.apache.spark.sql.redis") \
    .option("keys.pattern","user:*") \
    .option("key.column","userid") \
    .option("infer.schema", True) \
    .load()

In [13]:
users.join(posts, users.userid == posts.user_id, "inner").show()

                                                                                

+--------------------+--------+--------+------+--------------------+-------+----------+-------+
|                auth|password|username|userid|                body|user_id|      time|post_id|
+--------------------+--------+--------+------+--------------------+-------+----------+-------+
|5651e84b11d8fdabf...| testing|    mike|     1|        I am hungry!|      1|1648825104|      1|
|5651e84b11d8fdabf...| testing|    mike|     1|Working on some i...|      1|1648825124|      2|
|3c04b13163cbb4230...| testing|   alice|     2|   I am also hungry!|      2|1648825475|      3|
+--------------------+--------+--------+------+--------------------+-------+----------+-------+



In [14]:
users.join(posts, users.userid == posts.user_id, "inner").explain()

== Physical Plan ==
*(5) SortMergeJoin [userid#49], [user_id#14], Inner
:- *(2) Sort [userid#49 ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(userid#49, 200), ENSURE_REQUIREMENTS, [id=#95]
:     +- *(1) Filter isnotnull(userid#49)
:        +- *(1) Scan org.apache.spark.sql.redis.RedisSourceRelation@1e26ca0b [auth#46,password#47,username#48,userid#49] PushedFilters: [IsNotNull(userid)], ReadSchema: struct<auth:string,password:string,username:string,userid:string>
+- *(4) Sort [user_id#14 ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(user_id#14, 200), ENSURE_REQUIREMENTS, [id=#101]
      +- *(3) Filter isnotnull(user_id#14)
         +- *(3) Scan org.apache.spark.sql.redis.RedisSourceRelation@30482432 [body#13,user_id#14,time#15,post_id#16] PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<body:string,user_id:string,time:string,post_id:string>




In [1]:
! sudo cp /home/jovyan/work/jars/neo4j-connector-apache-spark_2.12-4.1.0_for_spark_3.jar /usr/local/spark/jars/neo4j-connector-apache-spark_2.12-4.1.0_for_spark_3.jar

In [4]:
import pyspark
from pyspark.sql import SparkSession
# NEO4J  CONFIGURATION
bolt_url = "bolt://neo4j:7687"
# Spark init
spark = SparkSession.builder \
    .master("local") \
    .appName('jupyter-pyspark') \
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [5]:
cols = ["facname","factitle","course","level","taught", "is_por"]
data = [('Mike','PoP', "IST256", "UGrad","Fall2021",False),('Mike','PoP', "IST659", "Grad","Spring2021",True),('Mike','PoP', "IST769", "Grad","Fall2021",False), ('Jill','Adjunct', "IST659", "Grad","Fall2021",False)]
profs = spark.createDataFrame(data = data, schema = cols)
profs.toPandas()

                                                                                

Unnamed: 0,facname,factitle,course,level,taught,is_por
0,Mike,PoP,IST256,UGrad,Fall2021,False
1,Mike,PoP,IST659,Grad,Spring2021,True
2,Mike,PoP,IST769,Grad,Fall2021,False
3,Jill,Adjunct,IST659,Grad,Fall2021,False


In [15]:
cipher_ql = '''
MATCH (c:Courses {code: event.course}), (f:Faculty {name: event.facname})
MERGE (f)-[:PROFESSOR_OF_RECORD]->(c)
'''
profs.where("is_por").write.format("org.neo4j.spark.DataSource").mode("Overwrite") \
  .option("url", bolt_url) \
  .option("query",cipher_ql) \
  .save()

In [14]:
cypher_ql = '''
MATCH (c:Courses {code: event.course}), (f:Faculty {name: event.facname})
MERGE (f)-[:TEACHES {semester: event.taught}]->(c)
'''
profs.write.format("org.neo4j.spark.DataSource").mode("Overwrite") \
  .option("url", bolt_url) \
  .option("query",cypher_ql) \
  .save()

In [10]:
cypher_ql = "MERGE (c:Courses {code: event.course, level: event.level })"
profs.select("course","level").distinct().write.format("org.neo4j.spark.DataSource").mode("Overwrite") \
  .option("url", bolt_url) \
  .option("query",cypher_ql) \
  .save()

                                                                                

In [9]:
# Faculty
f = profs.select("facname","factitle").distinct()
cypher_ql = "MERGE (f:Faculty {name : event.facname, title: event.factitle})"

f.write.format("org.neo4j.spark.DataSource").mode("Overwrite") \
  .option("url", bolt_url) \
  .option("query",cypher_ql) \
  .save()

                                                                                