Based on [WafaStudies](https://www.youtube.com/@WafaStudies) PySpark [tutorial](https://www.youtube.com/playlist?list=PLMWaZteqtEaJFiJ2FyIKK0YEuXwQ9YIS_).

## Imports

In [1]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz
!pip -q install findspark

In [2]:
import os
# Export the Java and Spark install locations in a single call.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64",
    SPARK_HOME="/content/spark-3.5.0-bin-hadoop3",
)

In [3]:
import findspark
# Called without arguments; presumably this locates Spark through the SPARK_HOME
# environment variable set earlier in the notebook — NOTE(review): confirm against
# the findspark documentation.
findspark.init()

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Build (or reuse) a session named 'Spark' with the "local[*]" master.
spark = (
    SparkSession.builder
    .appName('Spark')
    .master("local[*]")
    .getOrCreate()
)

## Generate data

In [5]:
!pip install faker

Collecting faker
  Downloading Faker-19.10.0-py3-none-any.whl (1.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.7 MB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.7 MB[0m [31m9.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m18.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-19.10.0


In [6]:
!mkdir data
!mkdir ml_data

In [7]:
import json
import random
from faker import Faker

# Seed both random sources so that "Restart & Run All" regenerates the same
# employees every time (Faker keeps its own generator, separate from `random`).
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

faker = Faker('pt_BR')


def make_employees(ids):
    """Return one fake employee record per id in ``ids``.

    Each record is a dict with the keys ``id`` (taken from ``ids``), ``name``
    (a name produced by the pt_BR Faker instance) and ``salary`` (a random whole
    amount between 1000 and 10000 inclusive, stored as a float).
    """
    return [
        {
            'id': employee_id,  # avoids shadowing the builtin `id`
            'name': faker.name(),
            'salary': float(random.randint(1000, 10000)),
        }
        for employee_id in ids
    ]


def write_json_lines(path, records):
    """Write ``records`` to ``path`` as JSON Lines: one JSON object per line."""
    with open(path, 'w') as jsonfile:
        for record in records:
            json.dump(record, jsonfile)
            jsonfile.write('\n')


# employees1.json: JSON Lines, ids 1-5.
write_json_lines('data/employees1.json', make_employees(range(1, 6)))

# employees2.json: a single pretty-printed JSON array spanning several lines.
employees2_data = make_employees(range(1, 6))
with open('ml_data/employees2.json', 'w') as jsonfile:
    json.dump(employees2_data, jsonfile, indent=2)

# employees3.json: JSON Lines, ids 6-10.
write_json_lines('data/employees3.json', make_employees(range(6, 11)))

## Reading json files

In [8]:
# Print the signature and docstring of spark.read.json to see the available options.
help(spark.read.json)

Help on method json in module pyspark.sql.readwriter:

json(path: Union[str, List[str], pyspark.rdd.RDD[str]], schema: Union[pyspark.sql.types.StructType, str, NoneType] = None, primitivesAsString: Union[bool, str, NoneType] = None, prefersDecimal: Union[bool, str, NoneType] = None, allowComments: Union[bool, str, NoneType] = None, allowUnquotedFieldNames: Union[bool, str, NoneType] = None, allowSingleQuotes: Union[bool, str, NoneType] = None, allowNumericLeadingZero: Union[bool, str, NoneType] = None, allowBackslashEscapingAnyCharacter: Union[bool, str, NoneType] = None, mode: Optional[str] = None, columnNameOfCorruptRecord: Optional[str] = None, dateFormat: Optional[str] = None, timestampFormat: Optional[str] = None, multiLine: Union[bool, str, NoneType] = None, allowUnquotedControlChars: Union[bool, str, NoneType] = None, lineSep: Optional[str] = None, samplingRatio: Union[str, float, NoneType] = None, dropFieldIfAllNull: Union[bool, str, NoneType] = None, encoding: Optional[str] = 

In [9]:
# No schema is supplied here, so the column types shown by printSchema() are
# the ones Spark inferred from the file.
df = spark.read.json(path="data/employees1.json")
df.printSchema()
df.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)

+---+--------------------+------+
| id|                name|salary|
+---+--------------------+------+
|  1|       Camila Aragão|5609.0|
|  2|    Fernando Cardoso|4326.0|
|  3|    Dra. Maitê Pinto|3183.0|
|  4|Srta. Bianca Almeida|8563.0|
|  5|   Catarina Nogueira|9706.0|
+---+--------------------+------+



We can also give Spark the schema explicitly instead of letting it infer one:

In [10]:
# DDL-style schema string: one "<column> <type>" entry per field, comma-separated.
schema = ", ".join(["id long", "name string", "salary double"])

In [11]:
# Attach the user-defined schema on the reader, then load the file with it.
df = spark.read.schema(schema).json("data/employees1.json")
df.printSchema()
df.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)

+---+--------------------+------+
| id|                name|salary|
+---+--------------------+------+
|  1|       Camila Aragão|5609.0|
|  2|    Fernando Cardoso|4326.0|
|  3|    Dra. Maitê Pinto|3183.0|
|  4|Srta. Bianca Almeida|8563.0|
|  5|   Catarina Nogueira|9706.0|
+---+--------------------+------+



What if the JSON file is multiline, i.e. a single JSON document spread across several lines?

In [12]:
# Load the file, then re-serialise it with a 4-space indent for display.
with open('ml_data/employees2.json', 'r', encoding='utf-8') as file:
    parsed_records = json.load(file)

# ensure_ascii=False prints accented pt_BR names as-is; the default would
# print them as \uXXXX escape sequences.
pretty_json = json.dumps(parsed_records, indent=4, ensure_ascii=False)
print(pretty_json)

[
    {
        "id": 1,
        "name": "Manuela das Neves",
        "salary": 7116.0
    },
    {
        "id": 2,
        "name": "Maysa Lopes",
        "salary": 7583.0
    },
    {
        "id": 3,
        "name": "Ryan Nogueira",
        "salary": 6965.0
    },
    {
        "id": 4,
        "name": "Mariana Santos",
        "salary": 9330.0
    },
    {
        "id": 5,
        "name": "Kevin Barbosa",
        "salary": 4167.0
    }
]


Let's try to read it:

In [13]:
from pyspark.errors import AnalysisException

df = spark.read.json("ml_data/employees2.json")
df.printSchema()

# Read without multiLine, the only column in the schema is _corrupt_record and
# show() raises AnalysisException. The failure is the point of this cell, so
# catch and print it; an uncaught exception would stop "Run All" here.
try:
    df.show()
except AnalysisException as error:
    print(f"AnalysisException: {error}")

root
 |-- _corrupt_record: string (nullable = true)



AnalysisException: ignored

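We got an error: by default Spark expects JSON Lines, i.e. one complete JSON object per line. A pretty-printed file cannot be parsed line by line, so the only column Spark finds is `_corrupt_record`, and calling `show()` on it raises an `AnalysisException`.

To fix it, we need to tell Spark that the JSON file is multiline: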

In [14]:
# Set the multiLine option on the reader before loading the file.
df = spark.read.option("multiLine", True).json("ml_data/employees2.json")
df.printSchema()
df.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)

+---+-----------------+------+
| id|             name|salary|
+---+-----------------+------+
|  1|Manuela das Neves|7116.0|
|  2|      Maysa Lopes|7583.0|
|  3|    Ryan Nogueira|6965.0|
|  4|   Mariana Santos|9330.0|
|  5|    Kevin Barbosa|4167.0|
+---+-----------------+------+



What if we want to load multiple json files?

Just use a list :)

In [15]:
# Several files can be loaded into one DataFrame by passing a list of paths.
json_paths = ["data/employees1.json", "data/employees3.json"]
df = spark.read.json(json_paths)
df.printSchema()
df.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)

+---+--------------------+------+
| id|                name|salary|
+---+--------------------+------+
|  1|       Camila Aragão|5609.0|
|  2|    Fernando Cardoso|4326.0|
|  3|    Dra. Maitê Pinto|3183.0|
|  4|Srta. Bianca Almeida|8563.0|
|  5|   Catarina Nogueira|9706.0|
|  6|    Lucca Nascimento|6946.0|
|  7|       Helena Novaes|6837.0|
|  8|      Clarice Castro|9509.0|
|  9|Carlos Eduardo Re...|3886.0|
| 10|        Luna Pereira|7069.0|
+---+--------------------+------+



If all the files are in the same folder, you can pass a glob pattern such as `data/*.json`:

In [16]:
# The glob pattern selects every .json file in the data/ folder.
df = spark.read.json(path="data/*.json")
df.printSchema()
df.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)

+---+--------------------+------+
| id|                name|salary|
+---+--------------------+------+
|  1|       Camila Aragão|5609.0|
|  2|    Fernando Cardoso|4326.0|
|  3|    Dra. Maitê Pinto|3183.0|
|  4|Srta. Bianca Almeida|8563.0|
|  5|   Catarina Nogueira|9706.0|
|  6|    Lucca Nascimento|6946.0|
|  7|       Helena Novaes|6837.0|
|  8|      Clarice Castro|9509.0|
|  9|Carlos Eduardo Re...|3886.0|
| 10|        Luna Pereira|7069.0|
+---+--------------------+------+

