### ![Spark Logo Tiny](https://files.training.databricks.com/images/105/logo_spark_tiny.png) Leer y definir schema de un archivo CSV

#### Leer un archivo CSV

##### Forma 1

In [0]:
df = spark.read.format("csv").\
    option("inferSchema",True).\
    option("header",True).\
    option("sep",",").\
    load("/FileStore/tables/AB_NYC_2019.csv")
df.show(n=3, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------
 id                             | 2539                                
 name                           | Clean & quiet apt home by the park  
 host_id                        | 2787                                
 host_name                      | John                                
 neighbourhood_group            | Brooklyn                            
 neighbourhood                  | Kensington                          
 latitude                       | 40.64749                            
 longitude                      | -73.97237                           
 room_type                      | Private room                        
 price                          | 149                                 
 minimum_nights                 | 1                                   
 number_of_reviews              | 9                                   
 last_review                    | 2018-10-19                          
 revie

##### Forma 2

In [0]:
df = spark.read.option("inferSchema","true") \
               .option("header","true") \
               .csv("/FileStore/tables/AB_NYC_2019.csv")
df.show(n=3, truncate=False, vertical=False)

+----+-----------------------------------+-------+---------+-------------------+-------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+
|id  |name                               |host_id|host_name|neighbourhood_group|neighbourhood|latitude|longitude|room_type      |price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|
+----+-----------------------------------+-------+---------+-------------------+-------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+
|2539|Clean & quiet apt home by the park |2787   |John     |Brooklyn           |Kensington   |40.64749|-73.97237|Private room   |149  |1             |9                |2018-10-19 |0.21             |6                             |365             |
|2595|Skylit

#### Leer múltiples archivos CSV

##### Forma 1

In [0]:
df = spark.read.format("csv").\
     option("inferSchema",True).\
     option("header",True).\
     option("sep",",").\
     load(['/FileStore/tables/by-day/2010_12_01.csv','/FileStore/tables/by-day/2010_12_02.csv'])

df.show(n=3, truncate=False, vertical=False)

+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                       |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER|6       |2010-12-01 08:26:00|2.55     |17850.0   |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN               |6       |2010-12-01 08:26:00|3.39     |17850.0   |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER    |8       |2010-12-01 08:26:00|2.75     |17850.0   |United Kingdom|
+---------+---------+----------------------------------+--------+-------------------+---------+----------+--------------+
only showing top 3 rows



##### Forma 2

In [0]:
df = spark.read.format("csv").\
     option("inferSchema",True).\
     option("header",True).\
     option("sep",",").\
     load(['/FileStore/tables/by-day/2010_12*.csv'])

# 20_12* tomará todos los archivos que contengan dicho prefijo
# 20_12_01, 20_12_02, 20_12_03, etc...

df.show(n=3, truncate=False, vertical=False)

+---------+---------+------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                   |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+------------------------------+--------+-------------------+---------+----------+--------------+
|537226   |22811    |SET OF 6 T-LIGHTS CACTI       |6       |2010-12-06 08:34:00|2.95     |15987.0   |United Kingdom|
|537226   |21713    |CITRONELLA CANDLE FLOWERPOT   |8       |2010-12-06 08:34:00|2.1      |15987.0   |United Kingdom|
|537226   |22927    |GREEN GIANT GARDEN THERMOMETER|2       |2010-12-06 08:34:00|5.95     |15987.0   |United Kingdom|
+---------+---------+------------------------------+--------+-------------------+---------+----------+--------------+
only showing top 3 rows



#### Leer archivos CSV desde una carpeta

In [0]:
df = spark.read.format("csv").\
     option("inferSchema",True).\
     option("header",True).\
     option("sep",",").\
     load('/FileStore/tables/by-day/')

df.show(n=3, truncate=False, vertical=False)

+---------+---------+------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                   |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+------------------------------+--------+-------------------+---------+----------+--------------+
|537226   |22811    |SET OF 6 T-LIGHTS CACTI       |6       |2010-12-06 08:34:00|2.95     |15987.0   |United Kingdom|
|537226   |21713    |CITRONELLA CANDLE FLOWERPOT   |8       |2010-12-06 08:34:00|2.1      |15987.0   |United Kingdom|
|537226   |22927    |GREEN GIANT GARDEN THERMOMETER|2       |2010-12-06 08:34:00|5.95     |15987.0   |United Kingdom|
+---------+---------+------------------------------+--------+-------------------+---------+----------+--------------+
only showing top 3 rows



#### Definir esquema de dataframe para un archivo CSV

##### Forma 1

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

SchemaModif = StructType([
  StructField("id", StringType(), True),
  StructField("name", StringType(), True),
  StructField("host_id", StringType(), True),
  StructField("host_name", StringType(), True),
  StructField("neighbourhood_group", StringType(), True),
  StructField("neighbourhood", StringType(), True),
  StructField("latitude", StringType(), True),
  StructField("longitude", StringType(), True),
  StructField("room_type", StringType(), True),
  StructField("price", IntegerType(), True),
  StructField("minimum_nights", IntegerType(), True),
  StructField("number_of_reviews", IntegerType(), True),
  StructField("last_review", StringType(), True),
  StructField("reviews_per_month", IntegerType(), True),
  StructField("calculated_host_listings_count", IntegerType(), True),
  StructField("availability_365", IntegerType(), True)
])

df = spark.read.format("csv").\
    schema(SchemaModif).\
    option("header",True).\
    option("sep",",").\
    load('/FileStore/tables/AB_NYC_2019.csv')

df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: integer (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- availability_365: integer (nullable = true)



##### Forma 2

In [0]:
SchemaModif_Alternativo = 'id STRING, name STRING, host_id STRING, host_name STRING, neighbourhood_group STRING, neighbourhood STRING, latitude STRING,    longitude STRING, room_type STRING, price INTEGER, minimum_nights INTEGER, number_of_reviews INTEGER, last_review STRING, reviews_per_month INTEGER, calculated_host_listings_count INTEGER, availability_365 INTEGER'

df = spark.read.format("csv").\
    schema(SchemaModif_Alternativo).\
    option("header",True).\
    option("sep",",").\
    load('/FileStore/tables/AB_NYC_2019.csv')

df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- minimum_nights: integer (nullable = true)
 |-- number_of_reviews: integer (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: integer (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- availability_365: integer (nullable = true)

