In [15]:
import pandas as pd
import numpy as np

# 1 Example with pandas
## 1.1 Split title and name

In [2]:
# Sample records: each full name is "<title> <first name> <last name>".
data = [
    {"name": full_name}
    for full_name in ("Mr Charlie dupont", "Ms Alice Doe", "Mme Sara Doe")
]

In [3]:
# Build a single-column ("name") pandas DataFrame from the sample records.
df = pd.DataFrame(data=data)

In [4]:
df.head()

Unnamed: 0,name
0,Mr Charlie dupont
1,Ms Alice Doe
2,Mme Sara Doe


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    3 non-null      object
dtypes: object(1)
memory usage: 152.0+ bytes


In [12]:
# Tokenise every name once on single spaces, then pick tokens by position.
# The .str[i] accessor yields NaN when a position is missing (or the name
# itself is missing) instead of raising like x[i] inside a lambda would.
name_tokens = df["name"].str.split(" ")
df["title"] = name_tokens.str[0]        # first token, e.g. "Mr"
df["first_name"] = name_tokens.str[1]   # second token
df["last_name"] = name_tokens.str[-1]   # last token, whatever the name length

In [13]:
df.head()

Unnamed: 0,name,title,first_name,last_name
0,Mr Charlie dupont,Mr,Charlie,dupont
1,Ms Alice Doe,Ms,Alice,Doe
2,Mme Sara Doe,Mme,Sara,Doe


In [18]:
# Derive sex from the title: "Mr" maps to "M", every other title to "F".
df["sex"] = np.where(df["title"].eq("Mr"), "M", "F")

In [19]:
df.head()

Unnamed: 0,name,title,first_name,last_name,sex
0,Mr Charlie dupont,Mr,Charlie,dupont,M
1,Ms Alice Doe,Ms,Alice,Doe,F
2,Mme Sara Doe,Mme,Sara,Doe,F


## 1.2 complex Split

Sometimes the split is more complicated. Consider the following example, where the movie name also contains the release year of the movie.

In [20]:
# Movie titles that carry their release year in trailing parentheses.
data1 = [
    {"name": movie_title}
    for movie_title in (
        "Toy Story (1995)",
        "Grumpier Old Men (1996)",
        "Waiting to Exhale (1997)",
        "Father of the Bride Part II (1998)",
    )
]

In [21]:
# Build a single-column ("name") pandas DataFrame from the movie records.
pdf1 = pd.DataFrame(data=data1)

In [22]:
pdf1.head()

Unnamed: 0,name
0,Toy Story (1995)
1,Grumpier Old Men (1996)
2,Waiting to Exhale (1997)
3,Father of the Bride Part II (1998)


In [23]:
pdf1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    4 non-null      object
dtypes: object(1)
memory usage: 160.0+ bytes


In [24]:
# Extract the release year with two splits: first on "(", then on ")".
# The "(" split is anchored on the right (rsplit) so that titles which
# themselves contain parentheses still yield the trailing "(YYYY)" group.
# .str[i] returns NaN for names without the expected parenthesis instead of
# raising a KeyError the way expand=True followed by [1] can.
pdf1["Year"] = (
    pdf1["name"]
    .str.rsplit("(", n=1).str[1]   # text after the last "(", e.g. "1995)"
    .str.split(")", n=1).str[0]    # text before the closing ")", e.g. "1995"
)

Notice that we use two splits to parse the movie name.
For example, for the string "Toy Story (1995)", the first split on "(" returns ["Toy Story ", "1995)"] (the separator itself is dropped). We keep the second item (index 1).
Then the second split on ")" returns ["1995", ""]. We keep the first item (index 0).

In [25]:
pdf1.head()

Unnamed: 0,name,Year
0,Toy Story (1995),1995
1,Grumpier Old Men (1996),1996
2,Waiting to Exhale (1997),1997
3,Father of the Bride Part II (1998),1998


# 2 Example with PySpark

## 2.1 Split title and name

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, element_at, lit, split, when

In [3]:
# Toggle between a local Spark session and one scheduled on Kubernetes.
# NOTE(review): the saved output of this cell is a Py4JError (SparkSession
# constructor not found), which presumably indicates mismatched pyspark and
# Spark JVM versions -- confirm the environment before re-running.
local = True
if local:
    # Local mode with four worker threads.
    spark = (
        SparkSession.builder
        .master("local[4]")
        .appName("Feature_grouping")
        .getOrCreate()
    )
else:
    # Cluster mode: the service account and namespace are read from the
    # environment, so a missing variable raises KeyError here.
    spark = (
        SparkSession.builder
        .master("k8s://https://kubernetes.default.svc:443")
        .appName("Feature_grouping")
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:master")
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])
        .config("spark.executor.instances", "4")
        .config("spark.executor.memory", "2g")
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
        .getOrCreate()
    )

# Turn on REPL eager evaluation so DataFrames display nicely in the notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

Py4JError: An error occurred while calling None.org.apache.spark.sql.SparkSession. Trace:
py4j.Py4JException: Constructor org.apache.spark.sql.SparkSession([class org.apache.spark.SparkContext, class java.util.HashMap]) does not exist
	at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:179)
	at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:196)
	at py4j.Gateway.invoke(Gateway.java:237)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)



In [41]:
# Same sample names as in the pandas section, loaded into a Spark DataFrame.
data = [
    {"name": full_name}
    for full_name in ("Mr Charlie dupont", "Ms Alice Doe", "Mme Sara Doe")
]

df = spark.createDataFrame(data)

In [42]:
df.show()

+-----------------+
|             name|
+-----------------+
|Mr Charlie dupont|
|     Ms Alice Doe|
|     Mme Sara Doe|
+-----------------+



In [43]:
# Tokenise the full name on single spaces; the result is an array column.
df = df.withColumn("split_array", split(df["name"], " "))

In [44]:
df.show()

+-----------------+--------------------+
|             name|         split_array|
+-----------------+--------------------+
|Mr Charlie dupont|[Mr, Charlie, dup...|
|     Ms Alice Doe|    [Ms, Alice, Doe]|
|     Mme Sara Doe|    [Mme, Sara, Doe]|
+-----------------+--------------------+



In [50]:
# Pull the individual name parts out of the token array produced by split().
# title and first_name are the first two tokens. last_name uses
# element_at(..., -1) so it is always the final token, matching the pandas
# version (x[-1]) even when a name has more than three words; getItem(2)
# would return a middle token in that case.
df = (
    df.withColumn("title", col("split_array").getItem(0))
    .withColumn("first_name", col("split_array").getItem(1))
    .withColumn("last_name", element_at(col("split_array"), -1))
)

In [51]:
df.show()

+-----------------+--------------------+-----+----------+---------+
|             name|         split_array|title|first_name|last_name|
+-----------------+--------------------+-----+----------+---------+
|Mr Charlie dupont|[Mr, Charlie, dup...|   Mr|   Charlie|   dupont|
|     Ms Alice Doe|    [Ms, Alice, Doe]|   Ms|     Alice|      Doe|
|     Mme Sara Doe|    [Mme, Sara, Doe]|  Mme|      Sara|      Doe|
+-----------------+--------------------+-----+----------+---------+



In [55]:
# Derive sex from the title: "Mr" maps to "M", every other title to "F".
df = df.withColumn("sex", when(col("title") == "Mr", "M").otherwise("F"))

In [56]:
df.show()

+-----------------+--------------------+-----+----------+---------+---+
|             name|         split_array|title|first_name|last_name|sex|
+-----------------+--------------------+-----+----------+---------+---+
|Mr Charlie dupont|[Mr, Charlie, dup...|   Mr|   Charlie|   dupont|  M|
|     Ms Alice Doe|    [Ms, Alice, Doe]|   Ms|     Alice|      Doe|  F|
|     Mme Sara Doe|    [Mme, Sara, Doe]|  Mme|      Sara|      Doe|  F|
+-----------------+--------------------+-----+----------+---------+---+



## 2.2 complex Split

In [58]:
# Movie titles that carry their release year in trailing parentheses.
data1 = [
    {"name": movie_title}
    for movie_title in (
        "Toy Story (1995)",
        "Grumpier Old Men (1996)",
        "Waiting to Exhale (1997)",
        "Father of the Bride Part II (1998)",
    )
]

In [59]:
# Load the movie records into a single-column ("name") Spark DataFrame.
df1 = spark.createDataFrame(data=data1)

In [61]:
df1.show(truncate=False)

+----------------------------------+
|name                              |
+----------------------------------+
|Toy Story (1995)                  |
|Grumpier Old Men (1996)           |
|Waiting to Exhale (1997)          |
|Father of the Bride Part II (1998)|
+----------------------------------+



In [72]:
# split() interprets its pattern as a Java regex, so the literal "(" must be
# escaped. A raw string keeps the backslash intact and avoids Python's
# invalid-escape-sequence warning for '\(' in a normal string literal.
# Item 1 is the text that follows the first "(", e.g. "1995)".
df1 = df1.withColumn("split1", split(col("name"), r"\(").getItem(1))

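Note: in Spark, the `split` function takes three arguments:
1. the target column (a column or a column name)
2. the separator pattern, which is a Java regular expression (so a literal "(" must be escaped as `\(`)
3. an optional limit on the number of splits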



In [73]:
df1.show()

+--------------------+------+
|                name|split1|
+--------------------+------+
|    Toy Story (1995)| 1995)|
|Grumpier Old Men ...| 1996)|
|Waiting to Exhale...| 1997)|
|Father of the Bri...| 1998)|
+--------------------+------+



In [76]:
# Second split: keep the text before ")" as the year, then drop the helper
# column. The pattern is a Java regex, so ")" is escaped; the raw string
# avoids Python's invalid-escape-sequence warning for '\)'.
df1 = (
    df1.withColumn("year", split(col("split1"), r"\)").getItem(0))
    .drop("split1")
)

In [78]:
df1.show(truncate=False)

+----------------------------------+----+
|name                              |year|
+----------------------------------+----+
|Toy Story (1995)                  |1995|
|Grumpier Old Men (1996)           |1996|
|Waiting to Exhale (1997)          |1997|
|Father of the Bride Part II (1998)|1998|
+----------------------------------+----+

