In [74]:
import findspark


In [75]:
import findspark
findspark.init('c:/Spark')

In [154]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [77]:
columns = ["language","users_count"]
data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")]
print(data)

[('Java', '20000'), ('Python', '100000'), ('Scala', '3000')]


In [78]:
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
rdd = spark.sparkContext.parallelize(data)

In [79]:
dfFromRDD1 = rdd.toDF()
dfFromRDD1.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)



In [80]:
dfFromRDD1 = rdd.toDF(columns)
dfFromRDD1.printSchema()

root
 |-- language: string (nullable = true)
 |-- users_count: string (nullable = true)



In [81]:
dfFromRDD2 = spark.createDataFrame(rdd).toDF(*columns)
dfFromRDD2.show()

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [82]:
# Create a dataframe from a list of tuples.
dfFromData2 = spark.createDataFrame(data).toDF(*columns)
dfFromData2.show()

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [83]:
# Using Row Type to create a dataframe
rowData = map(lambda x: Row(*x), data)
print(rowData)
dfFromData3 = spark.createDataFrame(rowData, columns)
dfFromData3.show()

<map object at 0x000001DE32BB0448>
+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [84]:
# create a dataframe from csv, json, or text file.
df2 = spark.read.csv('C:/SparkCourse/1800.csv')
df2 = spark.read.json('C:/SparkCourse/Marvel_BFS_RDD.json')
df2 = spark.read.text('C:/SparkCourse/Marvel-Names.txt')

In [85]:
arrayStructureData = [
        (("James","","Smith"),["Java","Scala","C++"],"OH","M"),
        (("Anna","Rose",""),["Spark","Java","C++"],"NY","F"),
        (("Julia","","Williams"),["CSharp","VB"],"OH","F"),
        (("Maria","Anne","Jones"),["CSharp","VB"],"NY","M"),
        (("Jen","Mary","Brown"),["CSharp","VB"],"NY","M"),
        (("Mike","Mary","Williams"),["Python","VB"],"OH","M")
        ]    


In [86]:
arrayStructureSchema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('languages', ArrayType(StringType()), True),
         StructField('state', StringType(), True),
         StructField('gender', StringType(), True)
         ])

In [87]:
df = spark.createDataFrame(arrayStructureData, arrayStructureSchema)
df.show()
df.printSchema()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    [James, , Smith]|[Java, Scala, C++]|   OH|     M|
|      [Anna, Rose, ]|[Spark, Java, C++]|   NY|     F|
| [Julia, , Williams]|      [CSharp, VB]|   OH|     F|
|[Maria, Anne, Jones]|      [CSharp, VB]|   NY|     M|
|  [Jen, Mary, Brown]|      [CSharp, VB]|   NY|     M|
|[Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)



In [88]:
# Filter dataframe with a column condition.
df.filter(df.state =='OH') \
.show()


+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    [James, , Smith]|[Java, Scala, C++]|   OH|     M|
| [Julia, , Williams]|      [CSharp, VB]|   OH|     F|
|[Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [89]:
# Another way of doing it
df.filter(col('state')== 'OH').show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    [James, , Smith]|[Java, Scala, C++]|   OH|     M|
| [Julia, , Williams]|      [CSharp, VB]|   OH|     F|
|[Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [90]:
# Similar to sql, can just pass in what you'd include in the where clause"
df.filter("gender == 'F'").show()

+-------------------+------------------+-----+------+
|               name|         languages|state|gender|
+-------------------+------------------+-----+------+
|     [Anna, Rose, ]|[Spark, Java, C++]|   NY|     F|
|[Julia, , Williams]|      [CSharp, VB]|   OH|     F|
+-------------------+------------------+-----+------+



In [91]:
# Filter with multiple conditons. Males from Ohio
df.filter("gender == 'M' and state = 'OH'").show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    [James, , Smith]|[Java, Scala, C++]|   OH|     M|
|[Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [92]:
# Filtering on an array column.  Only those will JAVA lanugage
df.filter(array_contains(df.languages, 'Java')).show()

+----------------+------------------+-----+------+
|            name|         languages|state|gender|
+----------------+------------------+-----+------+
|[James, , Smith]|[Java, Scala, C++]|   OH|     M|
|  [Anna, Rose, ]|[Spark, Java, C++]|   NY|     F|
+----------------+------------------+-----+------+



In [93]:
# Filtering on a Nested Struct columns. Last name equal to Williams.
df.filter(df.name.lastname == 'Williams').show()

+--------------------+------------+-----+------+
|                name|   languages|state|gender|
+--------------------+------------+-----+------+
| [Julia, , Williams]|[CSharp, VB]|   OH|     F|
|[Mike, Mary, Will...|[Python, VB]|   OH|     M|
+--------------------+------------+-----+------+



In [97]:
# Read CSV files into a dataframe - no headers, all columns passed in as a string type.
df = spark.read.csv('C:/SparkCourse/fakefriends.csv')
df.printSchema()
df.show()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)

+---+--------+---+---+
|_c0|     _c1|_c2|_c3|
+---+--------+---+---+
|  0|    Will| 33|385|
|  1|Jean-Luc| 26|  2|
|  2|    Hugh| 55|221|
|  3|  Deanna| 40|465|
|  4|   Quark| 68| 21|
|  5|  Weyoun| 59|318|
|  6|  Gowron| 37|220|
|  7|    Will| 54|307|
|  8|  Jadzia| 38|380|
|  9|    Hugh| 27|181|
| 10|     Odo| 53|191|
| 11|     Ben| 57|372|
| 12|   Keiko| 54|253|
| 13|Jean-Luc| 56|444|
| 14|    Hugh| 43| 49|
| 15|     Rom| 36| 49|
| 16|  Weyoun| 22|323|
| 17|     Odo| 35| 13|
| 18|Jean-Luc| 45|455|
| 19|  Geordi| 60|246|
+---+--------+---+---+
only showing top 20 rows



In [99]:
# Read CSV files into a dataframe - inferSchema = True will read correct data type, 
# delimter option is bonus here and is not needed. header = 'True' option is for
# when the first row has column names.
df = spark.read.options(inferSchema = 'True', delimter= ',') \
.csv('C:/SparkCourse/fakefriends.csv')
df.printSchema()
df.show()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: integer (nullable = true)

+---+--------+---+---+
|_c0|     _c1|_c2|_c3|
+---+--------+---+---+
|  0|    Will| 33|385|
|  1|Jean-Luc| 26|  2|
|  2|    Hugh| 55|221|
|  3|  Deanna| 40|465|
|  4|   Quark| 68| 21|
|  5|  Weyoun| 59|318|
|  6|  Gowron| 37|220|
|  7|    Will| 54|307|
|  8|  Jadzia| 38|380|
|  9|    Hugh| 27|181|
| 10|     Odo| 53|191|
| 11|     Ben| 57|372|
| 12|   Keiko| 54|253|
| 13|Jean-Luc| 56|444|
| 14|    Hugh| 43| 49|
| 15|     Rom| 36| 49|
| 16|  Weyoun| 22|323|
| 17|     Odo| 35| 13|
| 18|Jean-Luc| 45|455|
| 19|  Geordi| 60|246|
+---+--------+---+---+
only showing top 20 rows



In [102]:
# Reading CSV files into a user-specificed custom schema
schema = StructType() \
    .add("CustomerID", IntegerType(), True) \
    .add('CustomerName', StringType(), True) \
    .add('Age',IntegerType(), True) \
    .add('Friends', IntegerType(), True)

df_with_schema = spark.read.format('csv') \
    .option('header', False) \
    .option('inferSchema', True) \
    .option('delimeter', ',') \
    .schema(schema) \
    .load('C:/SparkCourse/fakefriends.csv')


In [104]:
df_with_schema.show(100)

+----------+------------+---+-------+
|CustomerID|CustomerName|Age|Friends|
+----------+------------+---+-------+
|         0|        Will| 33|    385|
|         1|    Jean-Luc| 26|      2|
|         2|        Hugh| 55|    221|
|         3|      Deanna| 40|    465|
|         4|       Quark| 68|     21|
|         5|      Weyoun| 59|    318|
|         6|      Gowron| 37|    220|
|         7|        Will| 54|    307|
|         8|      Jadzia| 38|    380|
|         9|        Hugh| 27|    181|
|        10|         Odo| 53|    191|
|        11|         Ben| 57|    372|
|        12|       Keiko| 54|    253|
|        13|    Jean-Luc| 56|    444|
|        14|        Hugh| 43|     49|
|        15|         Rom| 36|     49|
|        16|      Weyoun| 22|    323|
|        17|         Odo| 35|     13|
|        18|    Jean-Luc| 45|    455|
|        19|      Geordi| 60|    246|
|        20|         Odo| 67|    220|
|        21|       Miles| 19|    268|
|        22|       Quark| 30|     72|
|        23|

In [112]:
# Write to a CSV file with pyspark
df_with_schema.write.option('header',True) \
    .option('delimter',' ') \
    .mode('overwrite') \
    .csv('C:/Users/14803/SparkTutorials/fakefriends')
    

In [120]:
simpleData = [("James","Sales","NY",90000,34,10000),
    ("Michael","Sales","NY",86000,56,20000),
    ("Robert","Sales","CA",81000,30,23000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Raman","Finance","CA",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jen","Finance","NY",79000,53,15000),
    ("Jeff","Marketing","CA",80000,25,18000),
    ("Kumar","Marketing","NY",91000,50,21000)
  ]
schema = ['name','department','state','salary','age','bonus']
df = spark.createDataFrame(data= simpleData, schema=schema)
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------+----------+-----+------+---+-----+
|   name|department|state|salary|age|bonus|
+-------+----------+-----+------+---+-----+
|  James|     Sales|   NY| 90000| 34|10000|
|Michael|     Sales|   NY| 86000| 56|20000|
| Robert|     Sales|   CA| 81000| 30|23000|
|  Maria|   Finance|   CA| 90000| 24|23000|
|  Raman|   Finance|   CA| 99000| 40|24000|
|  Scott|   Finance|   NY| 83000| 36|19000|
|    Jen|   Finance|   NY| 79000| 53|15000|
|   Jeff| Marketing|   CA| 80000| 25|18000|
|  Kumar| Marketing|   NY| 91000| 50|21000|
+-------+----------+-----+------+---+-----+



In [125]:
# Group by department and salary
df.groupBy('department').sum('salary').show()

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|     Sales|     257000|
|   Finance|     351000|
| Marketing|     171000|
+----------+-----------+



In [126]:
# Calculate number of employees in each department
df.groupBy('department').count().show()

+----------+-----+
|department|count|
+----------+-----+
|     Sales|    3|
|   Finance|    4|
| Marketing|    2|
+----------+-----+



In [128]:
# Calculate min salary of each department and the max salary by state
df.groupby('department').min('salary').show()
df.groupby('state').max('salary').show()

+----------+-----------+
|department|min(salary)|
+----------+-----------+
|     Sales|      81000|
|   Finance|      79000|
| Marketing|      80000|
+----------+-----------+

+-----+-----------+
|state|max(salary)|
+-----+-----------+
|   CA|      99000|
|   NY|      91000|
+-----+-----------+



In [146]:
# Grouping by on multiple columns.  Group by department and state for total compensation.
df.groupby('department','state').sum('salary','bonus').show()

+----------+-----+-----------+----------+
|department|state|sum(salary)|sum(bonus)|
+----------+-----+-----------+----------+
|   Finance|   NY|     162000|     34000|
| Marketing|   NY|      91000|     21000|
|     Sales|   CA|      81000|     23000|
| Marketing|   CA|      80000|     18000|
|   Finance|   CA|     189000|     47000|
|     Sales|   NY|     176000|     30000|
+----------+-----+-----------+----------+



In [148]:
# Running more that one aggreate at a time.
df.groupby('department') \
    .agg(sum("salary").alias('Sum_Salary'), \
         avg("salary").alias('avg_salary'), \
         min('bonus').alias('min_bonus'), \
         max('bonus').alias('max_bonus'), \
         max('salary').alias('max_salary')).show()

+----------+----------+-----------------+---------+---------+----------+
|department|Sum_Salary|       avg_salary|min_bonus|max_bonus|max_salary|
+----------+----------+-----------------+---------+---------+----------+
|     Sales|    257000|85666.66666666667|    10000|    23000|     90000|
|   Finance|    351000|          87750.0|    15000|    24000|     99000|
| Marketing|    171000|          85500.0|    18000|    21000|     91000|
+----------+----------+-----------------+---------+---------+----------+



In [153]:
# Filtering on aggregate data
df.groupby('department') \
    .agg(sum("salary").alias('Sum_Salary'), \
         avg("salary").alias('avg_salary'), \
         min('bonus').alias('min_bonus'), \
         max('bonus').alias('max_bonus'), \
         max('salary').alias('max_salary')) \
        .where(col('min_bonus') >=15000).show()

+----------+----------+----------+---------+---------+----------+
|department|Sum_Salary|avg_salary|min_bonus|max_bonus|max_salary|
+----------+----------+----------+---------+---------+----------+
|   Finance|    351000|   87750.0|    15000|    24000|     99000|
| Marketing|    171000|   85500.0|    18000|    21000|     91000|
+----------+----------+----------+---------+---------+----------+



In [160]:
simpleData = [("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100)
  ]
schema = ['name','department','salary']
df = spark.createDataFrame(simpleData, schema)

In [164]:
#approx_count_distinct returns the count of distinct in a group
x = df.select(approx_count_distinct('department'))
x.show()
df.show()
y = df.select(approx_count_distinct('department')).collect()[0][0]
print(str(y))

+---------------------------------+
|approx_count_distinct(department)|
+---------------------------------+
|                                3|
+---------------------------------+

+-------+----------+------+
|   name|department|salary|
+-------+----------+------+
|  James|     Sales|  3000|
|Michael|     Sales|  4600|
| Robert|     Sales|  4100|
|  Maria|   Finance|  3000|
|  James|     Sales|  3000|
|  Scott|   Finance|  3300|
|    Jen|   Finance|  3900|
|   Jeff| Marketing|  3000|
|  Kumar| Marketing|  2000|
|   Saif|     Sales|  4100|
+-------+----------+------+

3


In [168]:
# Collect LIST - returns all values from an input column with duplicates
df.select(collect_list('name')).show()
x = df.select(collect_list('name')).collect()
print(x[0][0])

+--------------------+
|  collect_list(name)|
+--------------------+
|[James, Michael, ...|
+--------------------+

['James', 'Michael', 'Robert', 'Maria', 'James', 'Scott', 'Jen', 'Jeff', 'Kumar', 'Saif']


In [169]:
# Collect_set - returns all values from an input column with duplicate values removed.
df.select(collect_list('name')).show()
x = df.select(collect_set('name')).collect()
print(x[0][0])

+--------------------+
|  collect_list(name)|
+--------------------+
|[James, Michael, ...|
+--------------------+

['Robert', 'Kumar', 'Jeff', 'Maria', 'Scott', 'Michael', 'Saif', 'Jen', 'James']
