In [1]:
import findspark
findspark.init()

In [2]:
# khai báo thư viện
import pyspark
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession

## create Dataframe from RDD

In [3]:
# create data
columns = ["language","users_count"]
data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")]

spark = SparkSession.builder.appName('Foundation').getOrCreate()
sc = SparkContext.getOrCreate()
rdd = sc.parallelize(data)
rdd.collect()

[('Java', '20000'), ('Python', '100000'), ('Scala', '3000')]

In [35]:
# 1.using toDF() function
dfFromRDD1 = rdd.toDF(columns)
dfFromRDD1.printSchema()
dfFromRDD1.show()

root
 |-- language: string (nullable = true)
 |-- users_count: string (nullable = true)

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [34]:
# 2. using createDataFrame() from SparkDataFrame
dfFromRDD2 = spark.createDataFrame(rdd).toDF(*columns)
dfFromRDD2.show()

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [33]:
from pyspark.sql import Row
rowData = map(lambda x: Row(*x), data) 
dfFromData3 = spark.createDataFrame(rowData,columns)
dfFromData3.show()

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



In [36]:
# create DataFrame with schema
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
data2 = [("James","","Smith","36636","M",3000),
    ("Michael","Rose","","40288","M",4000),
    ("Robert","","Williams","42114","M",4000),
    ("Maria","Anne","Jones","39192","F",4000),
    ("Jen","Mary","Brown","","F",-1)
  ]
schema = StructType([ \
    StructField("firstname",StringType(),True), \
    StructField("middlename",StringType(),True), \
    StructField("lastname",StringType(),True), \
    StructField("id", StringType(), True), \
    StructField("gender", StringType(), True), \
    StructField("salary", IntegerType(), True) \
  ])
df = spark.createDataFrame(data=data2,schema=schema)
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



## Create data from data sources

In [43]:
# creating dataframe from CSV
df2 = spark.read.csv("C:/Users/ADMIN/Downloads/bank-data.csv")
df2.show()

+-------+---+------+----------+-------+-------+--------+---+--------+-----------+--------+----+
|    _c0|_c1|   _c2|       _c3|    _c4|    _c5|     _c6|_c7|     _c8|        _c9|    _c10|_c11|
+-------+---+------+----------+-------+-------+--------+---+--------+-----------+--------+----+
|     id|age|   sex|    region| income|married|children|car|save_act|current_act|mortgage| pep|
|ID12101| 48|FEMALE|INNER_CITY|17546.0|     NO|       1| NO|      NO|         NO|      NO| YES|
|ID12102| 40|  MALE|      TOWN|30085.1|    YES|       3|YES|      NO|        YES|     YES|  NO|
|ID12103| 51|FEMALE|INNER_CITY|16575.4|    YES|       0|YES|     YES|        YES|      NO|  NO|
|ID12104| 23|FEMALE|      TOWN|20375.4|    YES|       3| NO|      NO|        YES|      NO|  NO|
|ID12105| 57|FEMALE|     RURAL|50576.3|    YES|       0| NO|     YES|         NO|      NO|  NO|
|ID12106| 57|FEMALE|      TOWN|37869.6|    YES|       2| NO|     YES|        YES|      NO| YES|
|ID12107| 22|  MALE|     RURAL|8877.07| 

In [44]:
# creating dataframe from textfile
df2 = spark.read.csv("C:/Users/ADMIN/Downloads/code.txt")
df2.show()

+--------------------+
|                 _c0|
+--------------------+
|     <!DOCTYPE html>|
|<html xmlns="http...|
|<head runat="serv...|
+--------------------+



In [47]:
# creating from json file
df2 = spark.read.csv("C:/Users/ADMIN/Documents/MachineLearningBasic/Chuong_2/Data_Excercise/json_Data_example.json")
df2.head(10)

[Row(_c0='['),
 Row(_c0='  {'),
 Row(_c0='    "WHO": "Joe"'),
 Row(_c0='    "WEEK": ['),
 Row(_c0='      {'),
 Row(_c0='        "NUMBER": 3'),
 Row(_c0='        "EXPENSE": ['),
 Row(_c0='          {'),
 Row(_c0='            "WHAT": "Beer"'),
 Row(_c0='            "AMOUNT": 18.00')]

In [48]:
# convert to pandas
pandasDF = df2.toPandas()
print(pandasDF)

                  _c0
0                   [
1                   {
2        "WHO": "Joe"
3           "WEEK": [
4                   {
..                ...
153                 ]
154                 }
155                 ]
156                 }
157                 ]

[158 rows x 1 columns]


## create a Row object

In [50]:
from pyspark.sql import Row
row = Row("James", 40)
print(row[0] + "," + str(row[1]))

James,40


In [52]:
row=Row(name="Alice", age=11)
print(row.name, row.age) 

Alice 11


## Create custom class from Row

In [53]:
Person = Row("name", "age")
p1=Person("James", 40)
p2=Person("Alice", 35)
print(p1.name +","+p2.name)

James,Alice


In [56]:
from pyspark.sql import SparkSession, Row
spark = SparkSession.builder.appName('abc').getOrCreate()

data = [Row(name="James,,Smith",lang=["Java","Scala","C++"],state="CA"), 
    Row(name="Michael,Rose,",lang=["Spark","Java","C++"],state="NJ"),
    Row(name="Robert,,Williams",lang=["CSharp","VB"],state="NV")]

rdd = spark.sparkContext.parallelize(data)
rdd.collect()

[Row(name='James,,Smith', lang=['Java', 'Scala', 'C++'], state='CA'),
 Row(name='Michael,Rose,', lang=['Spark', 'Java', 'C++'], state='NJ'),
 Row(name='Robert,,Williams', lang=['CSharp', 'VB'], state='NV')]

In [60]:
collData = rdd.collect()
for row in collData:
    print(row.name + "," + str(row.lang) + "," + row.state)

James,,Smith,['Java', 'Scala', 'C++'],CA
Michael,Rose,,['Spark', 'Java', 'C++'],NJ
Robert,,Williams,['CSharp', 'VB'],NV


## Create column class object

In [67]:
from pyspark.sql.functions import lit
colOj = lit("abc")

data = [("Huy",21),("Hung",20)]
df = spark.createDataFrame(data).toDF("name","age")
df.printSchema()

# using DataFrame Object
df.select(df.name, df.age).show()
# df.select(df['name']).show()

from pyspark.sql.functions import col
# using SQl col() function
df.select(col("name")).show()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)

+----+---+
|name|age|
+----+---+
| Huy| 21|
|Hung| 20|
+----+---+

+----+
|name|
+----+
| Huy|
|Hung|
+----+



## Select()


In [68]:
data = [("James","Smith","USA","CA"),
    ("Michael","Rose","USA","NY"),
    ("Robert","Williams","USA","CA"),
    ("Maria","Jones","USA","FL")
  ]
columns = ["firstname","lastname","country","state"]
df = spark.createDataFrame(data = data, schema = columns)
df.show(truncate=False)

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|James    |Smith   |USA    |CA   |
|Michael  |Rose    |USA    |NY   |
|Robert   |Williams|USA    |CA   |
|Maria    |Jones   |USA    |FL   |
+---------+--------+-------+-----+



### 1. Select Single & Multiple Columns From PySpark

In [69]:
df.select("firstname","lastname").show()
# df.select(df.firstname,df.lastname).show()
# df.select(df["firstname"],df["lastname"]).show()

#By using col() function
from pyspark.sql.functions import col
df.select(col("firstname"),col("lastname")).show()

#Select columns by regular expression
df.select(df.colRegex("`^.*name*`")).show()

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
|    Maria|   Jones|
+---------+--------+



### Select All Columns From List

In [71]:
# Select All columns from List
df.select(*columns).show()

# Select All columns
# df.select([col for col in df.columns]).show()
# df.select("*").show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



### Select column by Index

In [73]:
df.select(df.columns[:3]).show(3)
df.select(df.columns[2:4]).show(3)

+---------+--------+-------+
|firstname|lastname|country|
+---------+--------+-------+
|    James|   Smith|    USA|
|  Michael|    Rose|    USA|
|   Robert|Williams|    USA|
+---------+--------+-------+
only showing top 3 rows

+-------+-----+
|country|state|
+-------+-----+
|    USA|   CA|
|    USA|   NY|
|    USA|   CA|
+-------+-----+
only showing top 3 rows



## DataFrame filter() with Column Condition

In [4]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType, ArrayType

data = [
    (("James","","Smith"),["Java","Scala","C++"],"OH","M"),
    (("Anna","Rose",""),["Spark","Java","C++"],"NY","F"),
    (("Julia","","Williams"),["CSharp","VB"],"OH","F"),
    (("Maria","Anne","Jones"),["CSharp","VB"],"NY","M"),
    (("Jen","Mary","Brown"),["CSharp","VB"],"NY","M"),
    (("Mike","Mary","Williams"),["Python","VB"],"OH","M")
 ]

schema = StructType([
    StructField('name',StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True)
    ])),
    StructField('languages', ArrayType(StringType()), True),
     StructField('state', StringType(), True),
     StructField('gender', StringType(), True)
])

df = spark.createDataFrame(data = data, schema = schema)
df.printSchema()
df.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Anna, Rose, }        |[Spark, Java, C++]|NY   |F     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Maria, Anne, Jones}  |[CSharp, VB]      |NY   |M     |
|{Jen, Mary, Brown}    |[CSharp, VB]      |NY   |M     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



In [5]:
df.filter(df.state == "OH").show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [6]:
#Using SQL col() function
from pyspark.sql.functions import col
df.filter(col("state") == "OH") \
    .show(truncate=False) 

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



In [7]:
#Using SQL Expression
df.filter("gender == 'M'").show()
#For not equal
df.filter("gender != 'M'").show()
df.filter("gender <> 'M'").show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+

+-------------------+------------------+-----+------+
|               name|         languages|state|gender|
+-------------------+------------------+-----+------+
|     {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Julia, , Williams}|      [CSharp, VB]|   OH|     F|
+-------------------+------------------+-----+------+

+-------------------+------------------+-----+------+
|               name|         languages|state|gender|
+-------------------+------------------+-----+------+
|     {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Julia, , William

## Filter with Multiple Conditions

In [8]:
df.filter((df.state == "OH") & (df.gender == "F")).show()

+-------------------+------------+-----+------+
|               name|   languages|state|gender|
+-------------------+------------+-----+------+
|{Julia, , Williams}|[CSharp, VB]|   OH|     F|
+-------------------+------------+-----+------+



In [9]:
#Filter IS IN List values
li=["OH","CA","DE"]
df.filter(df.state.isin(li)).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [10]:
# Filter NOT IS IN List values
#These show all records with NY (NY is not part of the list)
df.filter(~df.state.isin(li)).show()
# df.filter(df.state.isin(li)==False).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+



In [11]:
# Using startswith
df.filter(df.state.startswith("N")).show()

#using endswith
df.filter(df.state.endswith("H")).show()

#contains
df.filter(df.state.contains("H")).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia

In [12]:
# filter like and rlike

data2 = [(2,"Michael Rose"),(3,"Robert Williams"),
     (4,"Rames Rose"),(5,"Rames rose")
  ]
df2 = spark.createDataFrame(data = data2, schema = ["id","name"])

# like - SQL LIKE pattern
df2.filter(df2.name.like("%rose%")).show()

# rlike - SQL RLIKE pattern (LIKE with Regex)
#This check case insensitive
df2.filter(df2.name.rlike("(?i)^*rose$")).show()

+---+----------+
| id|      name|
+---+----------+
|  5|Rames rose|
+---+----------+

+---+------------+
| id|        name|
+---+------------+
|  2|Michael Rose|
|  4|  Rames Rose|
|  5|  Rames rose|
+---+------------+



In [13]:
#  Filter on an Array column
from pyspark.sql.functions import array_contains
df.filter(array_contains(df.languages,"Java")) \
    .show(truncate=False) 

+----------------+------------------+-----+------+
|name            |languages         |state|gender|
+----------------+------------------+-----+------+
|{James, , Smith}|[Java, Scala, C++]|OH   |M     |
|{Anna, Rose, }  |[Spark, Java, C++]|NY   |F     |
+----------------+------------------+-----+------+



In [14]:
# Filtering on Nested Struct columns
df.filter(df.name.lastname == 'Williams').show()

+--------------------+------------+-----+------+
|                name|   languages|state|gender|
+--------------------+------------+-----+------+
| {Julia, , Williams}|[CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|[Python, VB]|   OH|     M|
+--------------------+------------+-----+------+

