In [2]:
import pandas as pd
from typing import List

In [3]:
emp = pd.read_csv('data/employee.csv')

In [None]:
#Sorta mimics this: https://tidyr.tidyverse.org/reference/separate.html

In [4]:
def assign_split_col(df: pd.DataFrame, col: str, name_list: List[str], pat: str=None):
    df = df.copy()
    split_col = df[col].str.split(pat, expand=True)

    return df.assign(
        **dict(
            zip(name_list, [split_col.iloc[:, x] for x in range(split_col.shape[1])])
        )
    )

In [5]:
emp.pipe(
    assign_split_col,
    col="dept",
    name_list=["Dept Name", "Dept Acronym"],
    pat=r"[-(]",
).head()

Unnamed: 0,title,dept,salary,race,gender,hire_date,Dept Name,Dept Acronym
0,POLICE OFFICER,Houston Police Department-HPD,45279.0,White,Male,2015-02-03,Houston Police Department,HPD
1,ENGINEER/OPERATOR,Houston Fire Department (HFD),63166.0,White,Male,1982-02-08,Houston Fire Department,HFD)
2,SENIOR POLICE OFFICER,Houston Police Department-HPD,66614.0,Black,Male,1984-11-26,Houston Police Department,HPD
3,ENGINEER,Public Works & Engineering-PWE,71680.0,Asian,Male,2012-03-26,Public Works & Engineering,PWE
4,CARPENTER,Houston Airport System (HAS),42390.0,White,Male,2013-11-04,Houston Airport System,HAS)


In [6]:
def assign_regex_col(df: pd.DataFrame, col: str, name_list: List[str], pat: str=None):
    df = df.copy()
    split_col = df[col].str.extract(pat, expand=True)

    return df.assign(
        **dict(
            zip(name_list, [split_col.iloc[:, x] for x in range(split_col.shape[1])])
        )
    )

In [7]:
emp.pipe(
    assign_regex_col,
    col="dept",
    name_list=["Dept Name", "Dept Acronym"],
    pat=r"([\w\s]*)[-(]([A-Z]*)",
).head()

Unnamed: 0,title,dept,salary,race,gender,hire_date,Dept Name,Dept Acronym
0,POLICE OFFICER,Houston Police Department-HPD,45279.0,White,Male,2015-02-03,Houston Police Department,HPD
1,ENGINEER/OPERATOR,Houston Fire Department (HFD),63166.0,White,Male,1982-02-08,Houston Fire Department,HFD
2,SENIOR POLICE OFFICER,Houston Police Department-HPD,66614.0,Black,Male,1984-11-26,Houston Police Department,HPD
3,ENGINEER,Public Works & Engineering-PWE,71680.0,Asian,Male,2012-03-26,Engineering,PWE
4,CARPENTER,Houston Airport System (HAS),42390.0,White,Male,2013-11-04,Houston Airport System,HAS


In [8]:
from faker import Faker
import random

In [10]:
fake = Faker()

In [11]:
res_df = pd.DataFrame(
    {
        "Address": [fake.address() for _ in range(10)],
        "Residents": [
            ",".join(fake.name() for _ in range(random.randint(1, 3)))
            for _ in range(10)
        ],
        "N_Items_Ordered": [random.randint(1, 15) for _ in range(10)]
    }
)

In [12]:
def split_list_like(df: pd.DataFrame, col: str, new_col_prefix: str, pat: str = None):
    df = df.copy()
    split_col = df[col].str.split(pat, expand=True)

    return df.assign(
        **{
            f"{new_col_prefix}_{x}": split_col.iloc[:, x]
            for x in range(split_col.shape[1])
        }
    )

In [13]:
split = res_df.pipe(
    split_list_like,
    col="Residents",
    new_col_prefix="Resident",
    pat=",",
)

In [15]:
split.head()

Unnamed: 0,Address,Residents,N_Items_Ordered,Resident_0,Resident_1,Resident_2
0,"850 Allen Summit Apt. 467\nJeffreyshire, NJ 74637","Mrs. Brenda Thomas,Kim Ballard",14,Mrs. Brenda Thomas,Kim Ballard,
1,"37948 Michael Lakes\nBrowningfort, PA 75100",Ashley May,12,Ashley May,,
2,"99650 Shelby Run Suite 404\nPort Kyle, OH 80006","Brittany Kelly,Joshua Jordan",5,Brittany Kelly,Joshua Jordan,
3,"384 Ritter Brook\nNancyland, TN 91534","Richard Obrien,Catherine Nguyen,Jeffrey Webster",6,Richard Obrien,Catherine Nguyen,Jeffrey Webster
4,"378 Bush Lodge Suite 305\nNorth Cheryl, OH 48064","Jean Taylor,Donald Anderson",9,Jean Taylor,Donald Anderson,


In [14]:
long = pd.wide_to_long(split, stubnames='Resident', i="Address", j='nth_res',
                     sep='_')

In [16]:
long.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Residents,N_Items_Ordered,Resident
Address,nth_res,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"850 Allen Summit Apt. 467\nJeffreyshire, NJ 74637",0,"Mrs. Brenda Thomas,Kim Ballard",14,Mrs. Brenda Thomas
"37948 Michael Lakes\nBrowningfort, PA 75100",0,Ashley May,12,Ashley May
"99650 Shelby Run Suite 404\nPort Kyle, OH 80006",0,"Brittany Kelly,Joshua Jordan",5,Brittany Kelly
"384 Ritter Brook\nNancyland, TN 91534",0,"Richard Obrien,Catherine Nguyen,Jeffrey Webster",6,Richard Obrien
"378 Bush Lodge Suite 305\nNorth Cheryl, OH 48064",0,"Jean Taylor,Donald Anderson",9,Jean Taylor
