### ![Spark Logo Tiny](https://files.training.databricks.com/images/105/logo_spark_tiny.png) Creación de DataFrames

#### Ejemplo 1

In [0]:
# Sample employee records. Each tuple follows the field order listed in
# `employee_schema` below. The gender field holds mixed-case values and one None.
employee_data = [
    (10, "Raj", "Kumar", "1999", "100", "M", 2000),
    (20, "Rahul", "Rajan", "2002", "200", "f", 2000),
    (30, "Raghav", "Manish", "2010", "100", None, 2000),
    (40, "Raja", "Singh", "2004", "100", "F", 2000),
    (50, "Rama", "Krish", "2008", "400", "M", 2000),
    (60, "Rasul", "Kutty", "2014", "500", "M", 2000),
    (70, "Kumar", "Chand", "2004", "600", "M", 2000),
]

# Column names only (no types), in the same positional order as the tuples.
employee_schema = [
    "employee_id",
    "first_name",
    "last_name",
    "doj",
    "employee_dept_id",
    "gender",
    "salary",
]

# Build the DataFrame from the records and the list of column names,
# then print the resulting schema.
df = spark.createDataFrame(employee_data, employee_schema)
df.printSchema()

root
 |-- employee_id: long (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- doj: string (nullable = true)
 |-- employee_dept_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [0]:
# Render the employee DataFrame with `display`, which is provided by the
# notebook environment (it is not defined or imported in this notebook).
display(df)

employee_id,first_name,last_name,doj,employee_dept_id,gender,salary
10,Raj,Kumar,1999,100,M,2000
20,Rahul,Rajan,2002,200,f,2000
30,Raghav,Manish,2010,100,,2000
40,Raja,Singh,2004,100,F,2000
50,Rama,Krish,2008,400,M,2000
60,Rasul,Kutty,2014,500,M,2000
70,Kumar,Chand,2004,600,M,2000


#### Ejemplo 2

In [0]:
import datetime

# Four call records that are identical except for the `segundos_usg` value,
# so the rows are generated from the list of durations.
tabla = [
    (datetime.date(2022, 6, 3), '11', '996462523', '996462523', 'MOV220', segundos, 1)
    for segundos in (172, 112, 12, 194)
]

# Schema given as a DDL-style string: one "name TYPE" pair per column,
# in the same order as the fields of each tuple in `tabla`.
tabla_schema = '''
               fecha DATE,
               hora STRING,
               origen_usg STRING,
               destino_usg STRING,
               service_filter STRING,
               segundos_usg LONG,
               q_llamadas LONG
               '''

In [0]:
# Create the DataFrame from the rows in `tabla`, with column names and types
# taken from the `tabla_schema` string, then print the schema.
df = spark.createDataFrame(tabla, tabla_schema)
df.printSchema()

root
 |-- fecha: date (nullable = true)
 |-- hora: string (nullable = true)
 |-- origen_usg: string (nullable = true)
 |-- destino_usg: string (nullable = true)
 |-- service_filter: string (nullable = true)
 |-- segundos_usg: long (nullable = true)
 |-- q_llamadas: long (nullable = true)



In [0]:
# Print the rows of the DataFrame as a plain-text table.
df.show()

+----------+----+----------+-----------+--------------+------------+----------+
|     fecha|hora|origen_usg|destino_usg|service_filter|segundos_usg|q_llamadas|
+----------+----+----------+-----------+--------------+------------+----------+
|2022-06-03|  11| 996462523|  996462523|        MOV220|         172|         1|
|2022-06-03|  11| 996462523|  996462523|        MOV220|         112|         1|
|2022-06-03|  11| 996462523|  996462523|        MOV220|          12|         1|
|2022-06-03|  11| 996462523|  996462523|        MOV220|         194|         1|
+----------+----+----------+-----------+--------------+------------+----------+



#### Ejemplo 3

In [0]:
# Single-column DataFrame: every row is a one-element tuple holding a colour name.
df = spark.createDataFrame(
    [(color,) for color in ('rojo', 'azul', 'verde')],
    ['color'],
)
df.printSchema()

root
 |-- color: string (nullable = true)



In [0]:
# Print the rows of the single-column DataFrame as a plain-text table.
df.show()

+-----+
|color|
+-----+
| rojo|
| azul|
|verde|
+-----+



#### Ejemplo 4

In [0]:
# The date-times are supplied as plain text in day/month/year order,
# one value per row, in a single column named 'date'.
df = spark.createDataFrame(
    data=[
        ('23/01/2022 11:28:12',),
        ('24/01/2022 10:58:32',),
    ],
    schema=['date'],
)
df.printSchema()

root
 |-- date: string (nullable = true)



In [0]:
# Print the 'date' column values as a plain-text table.
df.show()

+-------------------+
|               date|
+-------------------+
|23/01/2022 11:28:12|
|24/01/2022 10:58:32|
+-------------------+



In [0]:
from pyspark.sql.functions import to_timestamp

# Replace the 'date' column with the result of parsing it via to_timestamp,
# using the explicit day/month/year pattern. The result is stored under a new
# name, so `df` is left untouched.
df_modif = df.withColumn(
    'date',
    to_timestamp('date', format='dd/MM/yyyy HH:mm:ss'),
)
df_modif.printSchema()

root
 |-- date: timestamp (nullable = true)



In [0]:
# Print the converted 'date' column as a plain-text table.
df_modif.show()

+-------------------+
|               date|
+-------------------+
|2022-01-23 11:28:12|
|2022-01-24 10:58:32|
+-------------------+



#### Ejemplo 5

In [0]:
# Each row pairs an item id with a Python list of attribute strings.
# The lists have different lengths.
df = spark.createDataFrame(
    data=[
        (1, ['blue', 'winter', 'cozy']),
        (2, ['red', 'summer', 'fresh', 'cooling']),
        (3, ['green', 'summer', 'travel']),
    ],
    schema=['item_id', 'atributos'],
)
df.printSchema()

root
 |-- item_id: long (nullable = true)
 |-- atributos: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [0]:
# Print the rows as a plain-text table; truncate=False is passed so that the
# array values are shown in full (see the output below).
df.show(truncate=False)

+-------+-----------------------------+
|item_id|atributos                    |
+-------+-----------------------------+
|1      |[blue, winter, cozy]         |
|2      |[red, summer, fresh, cooling]|
|3      |[green, summer, travel]      |
+-------+-----------------------------+

