<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-the-libraries" data-toc-modified-id="Load-the-libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load the libraries</a></span></li><li><span><a href="#Creating-DataFrames" data-toc-modified-id="Creating-DataFrames-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Creating DataFrames</a></span><ul class="toc-item"><li><span><a href="#From-RDD" data-toc-modified-id="From-RDD-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>From RDD</a></span></li><li><span><a href="#From-Spark-Data-Sources" data-toc-modified-id="From-Spark-Data-Sources-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>From Spark Data Sources</a></span></li></ul></li><li><span><a href="#Inspect-Data" data-toc-modified-id="Inspect-Data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Inspect Data</a></span><ul class="toc-item"><li><span><a href="#Queries" data-toc-modified-id="Queries-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Queries</a></span></li></ul></li></ul></div>

# Load the libraries

In [1]:
# pyspark
import pyspark

# Obtain the SparkSession for this notebook under the application name 'app'
# (getOrCreate hands back an already-running session when there is one).
spark = (
    pyspark.sql.SparkSession.builder
    .appName('app')
    .getOrCreate()
)

# sql
from pyspark.sql.functions import col as _col
from pyspark.sql.functions import udf

# @udf("integer") def myfunc(x,y): return x - y
# stddev format_number date_format, dayofyear, when
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import (mean as _mean, min as _min,
                                   max as _max, avg as _avg,
                                   when as _when
                                  )

from pyspark.sql.types import (StructField,StringType,
                               IntegerType, FloatType,
                               DoubleType,StructType)

from pyspark import SparkConf, SparkContext, SQLContext

# Handles derived from the session: its SparkContext, plus a SQLContext
# built on that context. `sqc` is just a shorter alias for `sqlContext`.
# NOTE(review): SQLContext is deprecated as of Spark 3.0 in favour of
# SparkSession - confirm whether anything later in the notebook needs it.
sc = spark.sparkContext
sqc = sqlContext = SQLContext(sc)
# spark_df = sqlContext.createDataFrame(pandas_df)

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option('max_columns',100)

import time,os,json
time_start_notebook = time.time()
home = os.path.expanduser('~')
SEED=100

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

[(x.__name__,x.__version__) for x in [np,pd,sns]]

[('numpy', '1.17.5'), ('pandas', '1.0.5'), ('seaborn', '0.10.1')]

# Creating DataFrames

## From RDD

In [3]:
%%bash
# Print the raw input file so its "name, age" line layout is visible
# before it is parsed in the cells that follow.
cat people.txt

Michael, 29
Andy, 30
Justin, 19


In [13]:
# infer schema

# Each line of people.txt looks like "Michael, 29": split on the comma and
# strip the blank that follows it before building the Row.
lines = sc.textFile('people.txt')
parts = lines.map(lambda l: l.split(','))
people = parts.map(lambda p: Row(name=p[0].strip(),
                                 age=int(p[1].strip())))

# Build the DataFrame; column names and types are inferred from the Rows.
# `Row` has to be imported from pyspark.sql for the lambda above to run.
# NOTE(review): the Py4JJavaError previously recorded here is consistent with
# `Row` being undefined when the job executed - confirm on a fresh kernel.
peopledf = spark.createDataFrame(people)
peopledf.show()

## From Spark Data Sources

In [15]:
# The text reader loads every line as one row in a single column named
# `value`; the comma-separated fields are not split at this stage.
sdf = spark.read.text("people.txt")
sdf.show()

+-----------+
|      value|
+-----------+
|Michael, 29|
|   Andy, 30|
| Justin, 19|
+-----------+



In [17]:
# Two spellings for reading the same JSON file: the dedicated json() reader
# and the generic load() with an explicit format. The second assignment
# overwrites the first, so only the load() result is shown.
sdf = spark.read.json('people.json')
sdf = spark.read.load('people.json',format='json')
sdf.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [39]:
%%writefile customer.json
{"address": ["New York,10021,N"],"age":25,"firstName":"John","lastName":"Smith","phoneNumber": [["212 555-1234 hover"],["213 555-1234 hover"]]}
{"address":["New York,10021,N"],"age":21,"firstName":"Jane","lastName":"Doe","phoneNumber": [["322 888-1234, hover"],["323 888-1234, hover"]]}

Overwriting customer.json


In [41]:
# Load the customer.json file written by the previous cell.
# truncate=False prints the nested phoneNumber arrays in full instead of
# cutting long cell values short.
sdf = spark.read.json("customer.json")
sdf.show(truncate=False)

+------------------+---+---------+--------+----------------------------------------------+
|address           |age|firstName|lastName|phoneNumber                                   |
+------------------+---+---------+--------+----------------------------------------------+
|[New York,10021,N]|25 |John     |Smith   |[[212 555-1234 hover], [213 555-1234 hover]]  |
|[New York,10021,N]|21 |Jane     |Doe     |[[322 888-1234, hover], [323 888-1234, hover]]|
+------------------+---+---------+--------+----------------------------------------------+



In [42]:
# Read customer.json again and convert the Spark DataFrame to a pandas
# DataFrame for display; the nested arrays are kept as list-valued cells.
sdf = spark.read.json("customer.json")
sdf.toPandas()

Unnamed: 0,address,age,firstName,lastName,phoneNumber
0,"[New York,10021,N]",25,John,Smith,"[[212 555-1234 hover], [213 555-1234 hover]]"
1,"[New York,10021,N]",21,Jane,Doe,"[[322 888-1234, hover], [323 888-1234, hover]]"


In [38]:
# Same file read directly with pandas; lines=True treats every line as a
# separate JSON record.
# NOTE(review): the saved output of this cell shows a single phone entry per
# person although customer.json holds two, and its execution count precedes
# the %%writefile cell - the output looks stale; re-run to refresh it.
pdf = pd.read_json('customer.json',lines=True)
pdf

Unnamed: 0,address,age,firstName,lastName,phoneNumber
0,"[New York,10021,N]",25,John,Smith,[[212 555-1234 hover]]
1,"[New York,10021,N]",21,Jane,Doe,"[[322 888-1234, hover]]"


# Inspect Data

In [43]:
# Column names paired with their Spark SQL type names, as a list of tuples.
sdf.dtypes

[('address', 'array<string>'),
 ('age', 'bigint'),
 ('firstName', 'string'),
 ('lastName', 'string'),
 ('phoneNumber', 'array<array<string>>')]

In [44]:
# Called without an argument, head() returns the first Row itself rather
# than a list of rows.
sdf.head()

Row(address=['New York,10021,N'], age=25, firstName='John', lastName='Smith', phoneNumber=[['212 555-1234 hover'], ['213 555-1234 hover']])

In [45]:
# first() also returns the first Row of the DataFrame.
sdf.first()

Row(address=['New York,10021,N'], age=25, firstName='John', lastName='Smith', phoneNumber=[['212 555-1234 hover'], ['213 555-1234 hover']])

In [46]:
# take(2) returns the first two rows as a Python list of Row objects.
sdf.take(2)

[Row(address=['New York,10021,N'], age=25, firstName='John', lastName='Smith', phoneNumber=[['212 555-1234 hover'], ['213 555-1234 hover']]),
 Row(address=['New York,10021,N'], age=21, firstName='Jane', lastName='Doe', phoneNumber=[['322 888-1234, hover'], ['323 888-1234, hover']])]

In [47]:
# The schema as a StructType object (field name, data type and nullability
# for every column, including the nested array types).
sdf.schema

StructType(List(StructField(address,ArrayType(StringType,true),true),StructField(age,LongType,true),StructField(firstName,StringType,true),StructField(lastName,StringType,true),StructField(phoneNumber,ArrayType(ArrayType(StringType,true),true),true)))

In [48]:
# Print the same schema as an indented tree, which is easier to read for
# nested columns such as phoneNumber.
sdf.printSchema()

root
 |-- address: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- age: long (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- phoneNumber: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)



In [49]:
## Duplicate Values
# Remove duplicate rows and rebind `sdf` to the result, so every later cell
# works on the de-duplicated DataFrame.
sdf = sdf.dropDuplicates()

## Queries

In [51]:
## Select

In [None]:
# select() only builds a new DataFrame. Without an action nothing is
# computed, and an expression that is not the last line of a cell is
# discarded silently. Call show() so both projections are actually displayed.
sdf.select("firstName").show()
sdf.select("firstName","lastName").show()

In [61]:
# Explode the phoneNumber array so that every phone entry gets its own row
# (exposed as `contactInfo`), then keep only the contact entry and the
# person's first name for display.
(
    sdf
    .select("firstName", "age", F.explode("phoneNumber").alias("contactInfo"))
    .select("contactInfo", "firstName")
    .show(truncate=False)
)

+---------------------+---------+
|contactInfo          |firstName|
+---------------------+---------+
|[212 555-1234 hover] |John     |
|[213 555-1234 hover] |John     |
|[322 888-1234, hover]|Jane     |
|[323 888-1234, hover]|Jane     |
+---------------------+---------+

