### ![Spark Logo Tiny](https://files.training.databricks.com/images/105/logo_spark_tiny.png) Definir schema de dataframes

#### Usando schema STRING

In [0]:
import datetime
# Sample user records. Each tuple lists its fields in the same order as the
# schema columns: id, nombre, apellido, email, es_cliente, importe_abonado,
# cliente_desde, ultima_actualizacion.
# Records 4 and 5 leave importe_abonado and cliente_desde as None.
usuario = [
    (
        1, 'Corrie', 'Van den Oord', 'cvandenoord@etsy.com',
        True, 1000.55,
        datetime.date(2021, 1, 15),
        datetime.datetime(2021, 2, 10, 1, 15),
    ),
    (
        2, 'Nikolas', 'Brewitt', 'nkbrewitt@gmail.com',
        True, 900.0,
        datetime.date(2021, 2, 14),
        datetime.datetime(2021, 2, 18, 3, 33),
    ),
    (
        3, 'Oriel', 'Penney', 'openney@gmail.com',
        True, 850.55,
        datetime.date(2021, 1, 21),
        datetime.datetime(2021, 3, 15, 15, 16, 55),
    ),
    (
        4, 'Ashby', 'Maddocks', 'amaddocks@gmail.com',
        False, None,
        None,
        datetime.datetime(2021, 4, 10, 17, 45, 30),
    ),
    (
        5, 'Kurt', 'Rome', 'krome@etsy.com',
        False, None,
        None,
        datetime.datetime(2021, 4, 2, 0, 55, 18),
    ),
]

In [0]:
# Schema expressed as a DDL-style string: comma-separated "column TYPE" pairs.
# The value is assembled from adjacent string literals; it starts and ends
# with a newline and indents every column definition by four spaces.
usuario_schema = (
    '\n'
    '    id INT,\n'
    '    nombre STRING,\n'
    '    apellido STRING,\n'
    '    email STRING,\n'
    '    es_cliente BOOLEAN,\n'
    '    importe_abonado FLOAT,\n'
    '    cliente_desde DATE,\n'
    '    ultima_actualizacion TIMESTAMP\n'
)

In [0]:
# Build the DataFrame from the in-memory records using the DDL string schema.
df = spark.createDataFrame(data=usuario, schema=usuario_schema)

# Render the DataFrame as a table in the notebook.
display(df)

id,nombre,apellido,email,es_cliente,importe_abonado,cliente_desde,ultima_actualizacion
1,Corrie,Van den Oord,cvandenoord@etsy.com,True,1000.55,2021-01-15,2021-02-10T01:15:00.000+0000
2,Nikolas,Brewitt,nkbrewitt@gmail.com,True,900.0,2021-02-14,2021-02-18T03:33:00.000+0000
3,Oriel,Penney,openney@gmail.com,True,850.55,2021-01-21,2021-03-15T15:16:55.000+0000
4,Ashby,Maddocks,amaddocks@gmail.com,False,,,2021-04-10T17:45:30.000+0000
5,Kurt,Rome,krome@etsy.com,False,,,2021-04-02T00:55:18.000+0000


#### Usando schema StructType

##### Ejemplo 1

In [0]:
# Import only the type classes this schema uses rather than `import *`, so the
# origin of every name is explicit and the notebook namespace is not polluted.
from pyspark.sql.types import (
    BooleanType,
    DateType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

# Schema built programmatically: one StructField (column name, data type) per
# column. `nullable` is not passed, so each field keeps the library default.
usuario_schema = StructType([
    StructField('id', IntegerType()),
    StructField('nombre', StringType()),
    StructField('apellido', StringType()),
    StructField('email', StringType()),
    StructField('es_cliente', BooleanType()),
    StructField('importe_abonado', FloatType()),
    StructField('cliente_desde', DateType()),
    StructField('ultima_actualizacion', TimestampType())
])

In [0]:
# Show the class of the schema object defined in the previous cell.
type(usuario_schema)

Out[13]: pyspark.sql.types.StructType

In [0]:
# Build the DataFrame from the same records, this time with the StructType schema.
df = spark.createDataFrame(data=usuario, schema=usuario_schema)

# Render the DataFrame as a table in the notebook.
display(df)

id,nombre,apellido,email,es_cliente,importe_abonado,cliente_desde,ultima_actualizacion
1,Corrie,Van den Oord,cvandenoord@etsy.com,True,1000.55,2021-01-15,2021-02-10T01:15:00.000+0000
2,Nikolas,Brewitt,nkbrewitt@gmail.com,True,900.0,2021-02-14,2021-02-18T03:33:00.000+0000
3,Oriel,Penney,openney@gmail.com,True,850.55,2021-01-21,2021-03-15T15:16:55.000+0000
4,Ashby,Maddocks,amaddocks@gmail.com,False,,,2021-04-10T17:45:30.000+0000
5,Kurt,Rome,krome@etsy.com,False,,,2021-04-02T00:55:18.000+0000


##### Ejemplo 2

In [0]:
from pyspark.sql.types import StringType, ArrayType, StructType, StructField

# Sample rows. Each tuple holds: a name, two lists of language names, and two
# short string codes, in the same order as the sample_schema fields.
sample_data = [
    (
        'Alfonso Pérez',
        ['C', 'C++', 'Python'],
        ['Spark', 'C'],
        'RJ',
        'DL',
    ),
    (
        'Juan Rojas',
        ['Spark', 'C', 'C++'],
        ['Go', 'C'],
        'MH',
        'UK',
    ),
    (
        'Pablo Herrera',
        ['Scala', 'Go'],
        ['Spark', 'Matlab'],
        'AP',
        'JH',
    ),
]

# Schema for sample_data: three plain string columns and two columns holding
# arrays of strings. Every field is explicitly declared nullable.
sample_schema = StructType(
    [
        StructField('nombre', StringType(), nullable=True),
        StructField('lenguajes_aprendidos', ArrayType(StringType()), nullable=True),
        StructField('lenguajes_por_aprender', ArrayType(StringType()), nullable=True),
        StructField('Estado', StringType(), nullable=True),
        StructField('Estado_anterior', StringType(), nullable=True),
    ]
)



In [0]:
# Build the DataFrame from sample_data before inspecting it. Without this, a
# top-to-bottom run would print the schema of the user DataFrame that `df`
# still holds from the earlier cell, not the schema of the sample data.
df = spark.createDataFrame(sample_data, schema=sample_schema)
df.printSchema()

root
 |-- nombre: string (nullable = true)
 |-- lenguajes_aprendidos: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- lenguajes_por_aprender: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Estado: string (nullable = true)
 |-- Estado_anterior: string (nullable = true)



In [0]:
# Build the DataFrame from the sample rows using the array-aware schema.
df = spark.createDataFrame(data=sample_data, schema=sample_schema)

# Render the DataFrame as a table in the notebook.
display(df)

nombre,lenguajes_aprendidos,lenguajes_por_aprender,Estado,Estado_anterior
Alfonso Pérez,"List(C, C++, Python)","List(Spark, C)",RJ,DL
Juan Rojas,"List(Spark, C, C++)","List(Go, C)",MH,UK
Pablo Herrera,"List(Scala, Go)","List(Spark, Matlab)",AP,JH
