In [42]:
from pyspark.sql import SparkSession


# Obtain the notebook's Spark session through the builder (getOrCreate reuses
# an already-running session when there is one) and display it as cell output.
spark = (
    SparkSession.builder
    .appName('importação')
    .getOrCreate()
)
spark

In [43]:
# Importing a CSV

# Column names and types are supplied as a DDL string because the file is read
# with header=False, i.e. no line of the file is used for column names.
schema = 'id INT, nome STRING, status STRING, cidade STRING, vendas INT, data DATE'

df_despachantes = spark.read.load(
    'data/despachantes.csv',
    format='csv',
    schema=schema,
    header=False,
    delimiter=',',
)

df_despachantes.show(5)

+---+-------------------+------+-------------+------+----------+
| id|               nome|status|       cidade|vendas|      data|
+---+-------------------+------+-------------+------+----------+
|  1|   Carminda Pestana| Ativo|  Santa Maria|    23|2020-08-11|
|  2|    Deolinda Vilela| Ativo|Novo Hamburgo|    34|2020-03-05|
|  3|   Emídio Dornelles| Ativo| Porto Alegre|    34|2020-02-05|
|  4|Felisbela Dornelles| Ativo| Porto Alegre|    36|2020-02-05|
|  5|     Graça Ornellas| Ativo| Porto Alegre|    12|2020-02-05|
+---+-------------------+------+-------------+------+----------+
only showing top 5 rows



In [44]:
# Print the column names, types and nullability of the DataFrame currently
# bound to df_despachantes.
df_despachantes.printSchema()

root
 |-- id: integer (nullable = true)
 |-- nome: string (nullable = true)
 |-- status: string (nullable = true)
 |-- cidade: string (nullable = true)
 |-- vendas: integer (nullable = true)
 |-- data: date (nullable = true)



In [45]:
# Importing Parquet

# No explicit schema is passed to the reader here; the column names and types
# printed by this cell are whatever the reader obtained from the file.
df_despachantes = spark.read.load(
    'data/despachantes.parquet',
    format='parquet',
)

df_despachantes.printSchema()
df_despachantes.show(5)


root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: integer (nullable = true)
 |-- _c5: string (nullable = true)

+---+-------------------+-----+-------------+---+----------+
|_c0|                _c1|  _c2|          _c3|_c4|       _c5|
+---+-------------------+-----+-------------+---+----------+
|  1|   Carminda Pestana|Ativo|  Santa Maria| 23|2020-08-11|
|  2|    Deolinda Vilela|Ativo|Novo Hamburgo| 34|2020-03-05|
|  3|   Emídio Dornelles|Ativo| Porto Alegre| 34|2020-02-05|
|  4|Felisbela Dornelles|Ativo| Porto Alegre| 36|2020-02-05|
|  5|     Graça Ornellas|Ativo| Porto Alegre| 12|2020-02-05|
+---+-------------------+-----+-------------+---+----------+
only showing top 5 rows



In [46]:
# Importing JSON

# Relies on the `schema` DDL string defined in the CSV cell, so that cell must
# have been executed before this one.
df_despachantes = spark.read.load(
    'data/despachantes.json',
    format='json',
    schema=schema,
)

df_despachantes.printSchema()
df_despachantes.show(5)

root
 |-- id: integer (nullable = true)
 |-- nome: string (nullable = true)
 |-- status: string (nullable = true)
 |-- cidade: string (nullable = true)
 |-- vendas: integer (nullable = true)
 |-- data: date (nullable = true)

+---+-------------------+------+-------------+------+----------+
| id|               nome|status|       cidade|vendas|      data|
+---+-------------------+------+-------------+------+----------+
|  1|   Carminda Pestana| Ativo|  Santa Maria|    23|2020-08-11|
|  2|    Deolinda Vilela| Ativo|Novo Hamburgo|    34|2020-03-05|
|  3|   Emídio Dornelles| Ativo| Porto Alegre|    34|2020-02-05|
|  4|Felisbela Dornelles| Ativo| Porto Alegre|    36|2020-02-05|
|  5|     Graça Ornellas| Ativo| Porto Alegre|    12|2020-02-05|
+---+-------------------+------+-------------+------+----------+
only showing top 5 rows



In [50]:
# Importing ORC

# No explicit schema is passed to the reader here; the column names and types
# printed by this cell are whatever the reader obtained from the file.
df_despachantes = spark.read.load(
    'data/despachantes.orc',
    format='orc',
)

df_despachantes.printSchema()
df_despachantes.show(5)

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: integer (nullable = true)
 |-- _c5: string (nullable = true)

+---+-------------------+-----+-------------+---+----------+
|_c0|                _c1|  _c2|          _c3|_c4|       _c5|
+---+-------------------+-----+-------------+---+----------+
|  1|   Carminda Pestana|Ativo|  Santa Maria| 23|2020-08-11|
|  2|    Deolinda Vilela|Ativo|Novo Hamburgo| 34|2020-03-05|
|  3|   Emídio Dornelles|Ativo| Porto Alegre| 34|2020-02-05|
|  4|Felisbela Dornelles|Ativo| Porto Alegre| 36|2020-02-05|
|  5|     Graça Ornellas|Ativo| Porto Alegre| 12|2020-02-05|
+---+-------------------+-----+-------------+---+----------+
only showing top 5 rows

