In [1]:
import pyspark


In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('Basics').getOrCreate()

In [4]:
people_json = 'people.json'

In [5]:
df = spark.read.json(people_json)

In [6]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [7]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [8]:
df.columns

['age', 'name']

In [9]:
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [10]:
import pandas as pd
from pyspark.sql.types import Row
from datetime import datetime
from pyspark import SparkContext as sc

In [11]:
sc

pyspark.context.SparkContext

In [12]:
import random
num_samples = 100000000

def inside(p):     
    x, y = random.random(), random.random()
    return x*x + y*y < 1

count = sc.parallelize(range(0, num_samples)).filter(inside).count()

pi = 4 * count / num_samples
print(pi)

sc.stop()
# parallelize() missing 1 required positional argument: 'c'

TypeError: parallelize() missing 1 required positional argument: 'c'

In [13]:
simple_ex = sc.parallelize([1, "alice", 50])
simple_ex

TypeError: parallelize() missing 1 required positional argument: 'c'

### df Schema

In [14]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [15]:
data_schema = [StructField('age', IntegerType(), True),
              StructField('name', StringType(), True)]

In [16]:
final_struct = StructType(fields=data_schema)

In [17]:
df = spark.read.json(people_json, schema=final_struct)

In [18]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [19]:
df.select(['age', 'name']).show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



### Manipulating Columns

In [20]:
df.withColumn('double', df['age']*2)

DataFrame[age: int, name: string, double: int]

In [21]:
df.withColumnRenamed('age', 'my_new_age').show()

+----------+-------+
|my_new_age|   name|
+----------+-------+
|      null|Michael|
|        30|   Andy|
|        19| Justin|
+----------+-------+



### Using SQL in Spark

In [22]:
df.createOrReplaceTempView('people')

In [23]:
results = spark.sql('SELECT * FROM people')

In [24]:
results.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [25]:
new_results = spark.sql("SELECT * FROM people WHERE age = 30")

In [26]:
new_results.show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [27]:
spark.stop()

### Data Manipulation

In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName('ops').getOrCreate()

In [6]:
msft_stock = 'msft.csv' 

In [31]:
df = spark.read.csv(msft_stock, header = True, inferSchema = True)

In [32]:
df.show(10)

+----------+----------+----------+----------+----------+----------+--------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|
+----------+----------+----------+----------+----------+----------+--------+
|2020-02-03|170.429993|     174.5|170.399994|174.380005|172.552048|30149100|
|2020-02-04|177.139999|180.639999|176.309998|180.119995|178.231873|36433300|
|2020-02-05|184.029999|184.199997|178.410004|179.899994|178.014191|39186300|
|2020-02-06|180.970001|183.820007|180.059998|183.630005|181.705093|27751400|
|2020-02-07|182.850006|185.630005|182.479996|183.889999|181.962357|33529100|
|2020-02-10|183.580002|188.839996|    183.25|188.699997|186.721939|35844300|
|2020-02-11|190.649994|190.699997|     183.5|184.440002|182.506607|53159900|
|2020-02-12|185.580002|185.850006|181.850006|184.710007|182.773773|47062900|
|2020-02-13|183.080002|186.229996|182.869995|183.710007|181.784271|35295800|
|2020-02-14|    183.25|185.410004|182.649994|185.350006|183.407059|23149500|

In [33]:
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



### Filter 

In [34]:
df.filter('Close < 175').show(10)

+----------+----------+----------+----------+----------+----------+--------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|
+----------+----------+----------+----------+----------+----------+--------+
|2020-02-03|170.429993|     174.5|170.399994|174.380005|172.552048|30149100|
|2020-02-24|167.770004|174.550003|163.229996|170.889999|169.560516|68311100|
|2020-02-25|174.199997|174.839996|167.649994|168.070007|166.762451|68073300|
|2020-02-26|169.710007|173.259995|168.210007|170.169998|  168.8461|56206100|
|2020-02-27|163.320007|167.029999|157.979996|158.179993|156.949387|93174900|
|2020-02-28|152.410004|163.710007|     152.0|162.009995|160.749588|97073600|
|2020-03-02|165.309998|172.919998|162.309998|172.789993|171.445724|71030800|
|2020-03-03|173.800003|     175.0|162.259995|164.509995|163.230148|71677000|
|2020-03-04|168.490005|170.699997|165.619995|170.550003|169.223145|49814400|
|2020-03-05|166.050003|170.869995|165.690002|166.270004| 164.97644|47817300|

In [35]:
df.filter('Close < 175').select('Open').show(10)

+----------+
|      Open|
+----------+
|170.429993|
|167.770004|
|174.199997|
|169.710007|
|163.320007|
|152.410004|
|165.309998|
|173.800003|
|168.490005|
|166.050003|
+----------+
only showing top 10 rows



In [36]:
df.filter(df['Close'] > 175).show(10)

+----------+----------+----------+----------+----------+----------+--------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|
+----------+----------+----------+----------+----------+----------+--------+
|2020-02-04|177.139999|180.639999|176.309998|180.119995|178.231873|36433300|
|2020-02-05|184.029999|184.199997|178.410004|179.899994|178.014191|39186300|
|2020-02-06|180.970001|183.820007|180.059998|183.630005|181.705093|27751400|
|2020-02-07|182.850006|185.630005|182.479996|183.889999|181.962357|33529100|
|2020-02-10|183.580002|188.839996|    183.25|188.699997|186.721939|35844300|
|2020-02-11|190.649994|190.699997|     183.5|184.440002|182.506607|53159900|
|2020-02-12|185.580002|185.850006|181.850006|184.710007|182.773773|47062900|
|2020-02-13|183.080002|186.229996|182.869995|183.710007|181.784271|35295800|
|2020-02-14|    183.25|185.410004|182.649994|185.350006|183.407059|23149500|
|2020-02-18|185.610001|187.699997|     185.5|187.229996|185.267365|27792200|

In [37]:
df.filter(df['Close'] > 175).select('Volume').show(10)

+--------+
|  Volume|
+--------+
|36433300|
|39186300|
|27751400|
|33529100|
|35844300|
|53159900|
|47062900|
|35295800|
|23149500|
|27792200|
+--------+
only showing top 10 rows



In [38]:
df.filter((df['Close'] > 175) & (df['Open'] < 175)).show(10)

+----------+----------+----------+----------+----------+----------+--------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|
+----------+----------+----------+----------+----------+----------+--------+
|2020-04-16|174.300003|177.279999|172.899994|177.039993|175.662643|50479600|
|2020-04-29|173.220001|177.679993|171.880005|177.429993|176.049622|51286600|
|2020-05-04|174.490005|     179.0|173.800003|178.839996|177.448639|30372900|
+----------+----------+----------+----------+----------+----------+--------+



In [39]:
df.filter(df['Close'] > 175).select('Volume').show(10)

+--------+
|  Volume|
+--------+
|36433300|
|39186300|
|27751400|
|33529100|
|35844300|
|53159900|
|47062900|
|35295800|
|23149500|
|27792200|
+--------+
only showing top 10 rows



In [40]:
df.filter((df['Close'] < 175) & (df['Open'] > 175)).show()

+----------+----------+----------+----------+----------+----------+--------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|
+----------+----------+----------+----------+----------+----------+--------+
|2020-04-27|176.589996|176.899994|173.300003|174.050003|172.695923|33194400|
|2020-04-28|175.589996|175.669998|169.389999|169.809998|168.488922|34392700|
|2020-05-01|175.800003|178.639999|174.009995|174.570007|173.211899|39370500|
+----------+----------+----------+----------+----------+----------+--------+



In [41]:
df.filter((df['Close'] < 200) & ~(df['Open'] > 200)).show(10)

+----------+----------+----------+----------+----------+----------+--------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|
+----------+----------+----------+----------+----------+----------+--------+
|2020-02-03|170.429993|     174.5|170.399994|174.380005|172.552048|30149100|
|2020-02-04|177.139999|180.639999|176.309998|180.119995|178.231873|36433300|
|2020-02-05|184.029999|184.199997|178.410004|179.899994|178.014191|39186300|
|2020-02-06|180.970001|183.820007|180.059998|183.630005|181.705093|27751400|
|2020-02-07|182.850006|185.630005|182.479996|183.889999|181.962357|33529100|
|2020-02-10|183.580002|188.839996|    183.25|188.699997|186.721939|35844300|
|2020-02-11|190.649994|190.699997|     183.5|184.440002|182.506607|53159900|
|2020-02-12|185.580002|185.850006|181.850006|184.710007|182.773773|47062900|
|2020-02-13|183.080002|186.229996|182.869995|183.710007|181.784271|35295800|
|2020-02-14|    183.25|185.410004|182.649994|185.350006|183.407059|23149500|

In [42]:
df.filter(df['Low'] == 181.850006).show()

+----------+----------+----------+----------+----------+----------+--------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|
+----------+----------+----------+----------+----------+----------+--------+
|2020-02-12|185.580002|185.850006|181.850006|184.710007|182.773773|47062900|
+----------+----------+----------+----------+----------+----------+--------+



In [43]:
spark.stop()

### Groupby and Aggregates

In [44]:
from pyspark.sql import SparkSession

In [45]:
sales_info = 'sales_info.csv'

In [46]:
spark = SparkSession.builder.appName('agga').getOrCreate()

In [47]:
df = spark.read.csv(sales_info, header=True, inferSchema=True)

In [48]:
df.count()

12

In [49]:
df.show(10)

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
+-------+-------+-----+
only showing top 10 rows



In [50]:
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)



### Groupby apply

In [51]:
df.groupBy('company').max().show()

+-------+----------+
|company|max(Sales)|
+-------+----------+
|   APPL|     750.0|
|   GOOG|     340.0|
|     FB|     870.0|
|   MSFT|     600.0|
+-------+----------+



In [52]:
df.groupBy('Company').mean().show()

+-------+-----------------+
|Company|       avg(Sales)|
+-------+-----------------+
|   APPL|            370.0|
|   GOOG|            220.0|
|     FB|            610.0|
|   MSFT|322.3333333333333|
+-------+-----------------+



In [53]:
df.groupBy('Company').count().show()

+-------+-----+
|Company|count|
+-------+-----+
|   APPL|    4|
|   GOOG|    3|
|     FB|    2|
|   MSFT|    3|
+-------+-----+



In [54]:
df.agg({'Sales':'sum'}).show()

+----------+
|sum(Sales)|
+----------+
|    4327.0|
+----------+



In [55]:
df.agg({'Sales':'max'}).show()

+----------+
|max(Sales)|
+----------+
|     870.0|
+----------+



In [56]:
group_data = df.groupBy('Company')

In [57]:
group_data.agg({'Sales':'max', 'Person':'min'}).show()

+-------+----------+-----------+
|Company|max(Sales)|min(Person)|
+-------+----------+-----------+
|   APPL|     750.0|      Chris|
|   GOOG|     340.0|    Charlie|
|     FB|     870.0|       Carl|
|   MSFT|     600.0|        Amy|
+-------+----------+-----------+



### Other Functions

In [58]:
from pyspark.sql.functions import countDistinct, avg, stddev

In [59]:
df.select(countDistinct('Sales')).show()

+---------------------+
|count(DISTINCT Sales)|
+---------------------+
|                   11|
+---------------------+



In [60]:
df.select(avg('Sales')).show()

+-----------------+
|       avg(Sales)|
+-----------------+
|360.5833333333333|
+-----------------+



### Format column and renaming 

In [61]:
df.select(avg('Sales').alias('Average Sales')).show()

+-----------------+
|    Average Sales|
+-----------------+
|360.5833333333333|
+-----------------+



In [62]:
from pyspark.sql.functions import format_number

In [63]:
sales_std = df.select(stddev('Sales'))

In [64]:
sales_std.select(format_number('stddev_samp(Sales)', 2).alias('std of sales')).show()

+------------+
|std of sales|
+------------+
|      250.09|
+------------+



In [65]:
df.orderBy('Sales',).show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
|   APPL|  Linda|130.0|
|   GOOG|    Sam|200.0|
|   MSFT|Vanessa|243.0|
|   APPL|   John|250.0|
|   GOOG|  Frank|340.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   MSFT|   Tina|600.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
+-------+-------+-----+



In [66]:
df.orderBy(df['Sales'].desc()).show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|     FB|   Carl|870.0|
|   APPL|   Mike|750.0|
|   MSFT|   Tina|600.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   GOOG|  Frank|340.0|
|   APPL|   John|250.0|
|   MSFT|Vanessa|243.0|
|   GOOG|    Sam|200.0|
|   APPL|  Linda|130.0|
|   MSFT|    Amy|124.0|
|   GOOG|Charlie|120.0|
+-------+-------+-----+



In [67]:
spark.stop()

### Missing Data

In [82]:
from pyspark.sql import SparkSession

In [83]:
spark = SparkSession.builder.appName('Miss').getOrCreate()

In [84]:
contains_null = 'ContainsNull.csv'

In [85]:
df = spark.read.csv(contains_null, header=True, inferSchema=True)

In [86]:
df.show(10)

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



### Dropping Null Values

In [87]:
df.na.drop().show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [88]:
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [89]:
df.na.drop(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [90]:
df.na.drop(how='all').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [91]:
df.na.drop(subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



### Fill Missing Values

In [92]:
df.na.fill('FILL VALUE').show()

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|FILL VALUE| null|
|emp3|FILL VALUE|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [93]:
df.na.fill(999).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|999.0|
|emp2| null|999.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [94]:
df.na.fill('FILL VALUE', subset=['Name']).show()

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|FILL VALUE| null|
|emp3|FILL VALUE|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [95]:
from pyspark.sql.functions import mean

In [96]:
mean_val = df.select(mean(df['Sales'])).collect()

In [97]:
mean_sales = mean_val[0][0]

In [98]:
df.na.fill(mean_sales, subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [99]:
spark.stop()

### Dates and Timestamps

In [9]:
from pyspark.sql import SparkSession

In [10]:
spark = SparkSession.builder.appName('dates').getOrCreate()

In [11]:
df = spark.read.csv(msft_stock, header=True, inferSchema=True)

In [12]:
df.select(['Date', 'Open']).show(5)

+----------+----------+
|      Date|      Open|
+----------+----------+
|2020-02-03|170.429993|
|2020-02-04|177.139999|
|2020-02-05|184.029999|
|2020-02-06|180.970001|
|2020-02-07|182.850006|
+----------+----------+
only showing top 5 rows



### Extract Date Information

In [14]:
from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year, weekofyear, format_number, date_format)

In [15]:
df.select(dayofmonth(df['Date'])).show(5)

+----------------+
|dayofmonth(Date)|
+----------------+
|               3|
|               4|
|               5|
|               6|
|               7|
+----------------+
only showing top 5 rows



In [17]:
df.select(month(df['Date'])).show(5)

+-----------+
|month(Date)|
+-----------+
|          2|
|          2|
|          2|
|          2|
|          2|
+-----------+
only showing top 5 rows



In [18]:
df.select(hour(df['Date'])).show(5)

+----------+
|hour(Date)|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
+----------+
only showing top 5 rows



In [19]:
df.select(year(df['Date'])).show(5)

+----------+
|year(Date)|
+----------+
|      2020|
|      2020|
|      2020|
|      2020|
|      2020|
+----------+
only showing top 5 rows



In [20]:
new_df = df.withColumn("Year", year(df['Date']))

In [21]:
new_df.show(5)

+----------+----------+----------+----------+----------+----------+--------+----+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|Year|
+----------+----------+----------+----------+----------+----------+--------+----+
|2020-02-03|170.429993|     174.5|170.399994|174.380005|172.552048|30149100|2020|
|2020-02-04|177.139999|180.639999|176.309998|180.119995|178.231873|36433300|2020|
|2020-02-05|184.029999|184.199997|178.410004|179.899994|178.014191|39186300|2020|
|2020-02-06|180.970001|183.820007|180.059998|183.630005|181.705093|27751400|2020|
|2020-02-07|182.850006|185.630005|182.479996|183.889999|181.962357|33529100|2020|
+----------+----------+----------+----------+----------+----------+--------+----+
only showing top 5 rows



In [24]:
result = new_df.groupby("Year").mean().select(['Year', 'avg(Close)'])

In [25]:
result_new = result.withColumnRenamed('avg(Close)', 'Average Closing Price')

In [26]:
result_new.show()

+----+---------------------+
|Year|Average Closing Price|
+----+---------------------+
|2020|   195.65357768103442|
|2021|         222.85899805|
+----+---------------------+



In [27]:
result_new.select(['Year', format_number('Average Closing Price', 2).alias('Avg Closing Price')]).show()

+----+-----------------+
|Year|Avg Closing Price|
+----+-----------------+
|2020|           195.65|
|2021|           222.86|
+----+-----------------+



In [28]:
spark.stop()