### ![Spark Logo Tiny](https://files.training.databricks.com/images/105/logo_spark_tiny.png) Extrayendo strings con split

#### split

##### Ejemplo 1

In [0]:
from pyspark.sql.functions import *
# from pyspark.sql import functions as f

empleados = [(1, 'Scott', 'Tiger', 1000.0, 'United States', '+1 123 456 7890,+1 763 555 7890', '123 45 6789'),
             (2, 'Henry', 'Ford', 1250.0, 'India', '+91 234 567 8901', '456 78 9123'),
             (3, 'Nick', 'Menza', 750.0, 'United Kingdom', '+44 111 111 1111,+44 963 264 8677', '222 33 4444'),
             (4, 'Bill', 'Gates', 1500.0, 'Asutralia', '+61 987 654 3210,+61 287 445 7567', '789 12 6118')
            ]

df = spark.createDataFrame(empleados).toDF('id','nombre', 'apellido', 'salario', 'nacionalidad', 'telefono', 'ssn')
df.show(truncate=False)

+---+------+--------+-------+--------------+---------------------------------+-----------+
|id |nombre|apellido|salario|nacionalidad  |telefono                         |ssn        |
+---+------+--------+-------+--------------+---------------------------------+-----------+
|1  |Scott |Tiger   |1000.0 |United States |+1 123 456 7890,+1 763 555 7890  |123 45 6789|
|2  |Henry |Ford    |1250.0 |India         |+91 234 567 8901                 |456 78 9123|
|3  |Nick  |Menza   |750.0  |United Kingdom|+44 111 111 1111,+44 963 264 8677|222 33 4444|
|4  |Bill  |Gates   |1500.0 |Asutralia     |+61 987 654 3210,+61 287 445 7567|789 12 6118|
+---+------+--------+-------+--------------+---------------------------------+-----------+



In [0]:
df_split = df.select('id','nombre','telefono','ssn'). \
              withColumn('tel_split', explode(split('telefono', ',')))

df_split.show(truncate=False)

+---+------+---------------------------------+-----------+----------------+
|id |nombre|telefono                         |ssn        |tel_split       |
+---+------+---------------------------------+-----------+----------------+
|1  |Scott |+1 123 456 7890,+1 763 555 7890  |123 45 6789|+1 123 456 7890 |
|1  |Scott |+1 123 456 7890,+1 763 555 7890  |123 45 6789|+1 763 555 7890 |
|2  |Henry |+91 234 567 8901                 |456 78 9123|+91 234 567 8901|
|3  |Nick  |+44 111 111 1111,+44 963 264 8677|222 33 4444|+44 111 111 1111|
|3  |Nick  |+44 111 111 1111,+44 963 264 8677|222 33 4444|+44 963 264 8677|
|4  |Bill  |+61 987 654 3210,+61 287 445 7567|789 12 6118|+61 987 654 3210|
|4  |Bill  |+61 987 654 3210,+61 287 445 7567|789 12 6118|+61 287 445 7567|
+---+------+---------------------------------+-----------+----------------+



In [0]:
df_split.select('id','nombre','tel_split').\
         withColumn('area_code', split('tel_split',' ')[1].cast('int')).\
         withColumn('ultimos_4_digitos', split('tel_split',' ')[3].cast('int')).\
         show()

+---+------+----------------+---------+-----------------+
| id|nombre|       tel_split|area_code|ultimos_4_digitos|
+---+------+----------------+---------+-----------------+
|  1| Scott| +1 123 456 7890|      123|             7890|
|  1| Scott| +1 763 555 7890|      763|             7890|
|  2| Henry|+91 234 567 8901|      234|             8901|
|  3|  Nick|+44 111 111 1111|      111|             1111|
|  3|  Nick|+44 963 264 8677|      963|             8677|
|  4|  Bill|+61 987 654 3210|      987|             3210|
|  4|  Bill|+61 287 445 7567|      287|             7567|
+---+------+----------------+---------+-----------------+



##### Ejemplo 2

In [0]:
from pyspark.sql.functions import *

employee_data = [(10,"Michael Robinson","1999-06-01","100","2000"),
                 (20,"James Wood","2003-03-01","200","8000"),
                 (30,"Chris Andrews","2005-04-01","100","6000"),
                 (40,"Mark Bond","2008-10-01","100","7000"),
                 (50,"Steve Watson","1996-02-01","400","1000"),
                 (60,"Mathews Simon","1998-11-01","500","5000"),
                 (70,"Peter Paul","2011-04-01","1600","5000")]

employee_schema = ["employee_id","Name","doj","employee_dept_id","salary"]

empDF = spark.createDataFrame(data=employee_data, schema=employee_schema)

empDF.printSchema()
display(empDF)

root
 |-- employee_id: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- doj: string (nullable = true)
 |-- employee_dept_id: string (nullable = true)
 |-- salary: string (nullable = true)



employee_id,Name,doj,employee_dept_id,salary
10,Michael Robinson,1999-06-01,100,2000
20,James Wood,2003-03-01,200,8000
30,Chris Andrews,2005-04-01,100,6000
40,Mark Bond,2008-10-01,100,7000
50,Steve Watson,1996-02-01,400,1000
60,Mathews Simon,1998-11-01,500,5000
70,Peter Paul,2011-04-01,1600,5000


In [0]:
df = empDF.withColumn('First_Name', split(empDF['Name'],' ').getItem(0))\
           .withColumn('Last_Name', split(empDF['Name'], ' ').getItem(1))
display(df)

employee_id,Name,doj,employee_dept_id,salary,First_Name,Last_Name
10,Michael Robinson,1999-06-01,100,2000,Michael,Robinson
20,James Wood,2003-03-01,200,8000,James,Wood
30,Chris Andrews,2005-04-01,100,6000,Chris,Andrews
40,Mark Bond,2008-10-01,100,7000,Mark,Bond
50,Steve Watson,1996-02-01,400,1000,Steve,Watson
60,Mathews Simon,1998-11-01,500,5000,Mathews,Simon
70,Peter Paul,2011-04-01,1600,5000,Peter,Paul


In [0]:
df = empDF.withColumn('First Name', split(empDF['Name'], ' ').getItem(0))\
           .withColumn('Last_Name', split(empDF['Name'],' ').getItem(1))\
           .withColumn('Joining_Year', split(empDF['doj'],'-').getItem(0))\
           .withColumn('Joining_Month', split(empDF['doj'],'-').getItem(1))\
           .withColumn('Joining_Day', split(empDF['doj'],'-').getItem(2))\
           .drop(empDF['Name'])\
           .drop(empDF['doj'])
display(df)

employee_id,employee_dept_id,salary,First Name,Last_Name,Joining_Year,Joining_Month,Joining_Day
10,100,2000,Michael,Robinson,1999,6,1
20,200,8000,James,Wood,2003,3,1
30,100,6000,Chris,Andrews,2005,4,1
40,100,7000,Mark,Bond,2008,10,1
50,400,1000,Steve,Watson,1996,2,1
60,500,5000,Mathews,Simon,1998,11,1
70,1600,5000,Peter,Paul,2011,4,1
