In [21]:
# Notebook-wide display configuration plus the libraries the cells below rely on.
from IPython.core.interactiveshell import InteractiveShell
# Render every top-level expression of a cell (not only the last one), which is
# why later cells show several outputs from a single run.
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np

In [22]:
# ------------------------------------------------------------------
# Setup
# A small frame of deliberately messy text columns: stray whitespace,
# mixed casing, inconsistent separators and missing values (None/NaN).
# ------------------------------------------------------------------
df = pd.DataFrame(
    dict(
        customer=[" Scott ", "ella", "RAJ", None],
        email=["scott@ex.com", "ella.s@ex.com", "raj@ex.co", "invalid_email"],
        dept=["Data Science", "data-science", "Analytics ", "Data  Science"],
        tags=["python,sql,r", "tableau,sql", None, "python,ml"],
        note=["Urgent!!! call back", "ok", "follow-up needed", np.nan],
    )
)
df

Unnamed: 0,customer,email,dept,tags,note
0,Scott,scott@ex.com,Data Science,"python,sql,r",Urgent!!! call back
1,ella,ella.s@ex.com,data-science,"tableau,sql",ok
2,RAJ,raj@ex.co,Analytics,,follow-up needed
3,,invalid_email,Data Science,"python,ml",


In [23]:
# ------------------------------------------------------------------
# Case 1) Clean & standardize text
# Cast to the nullable string dtype, trim surrounding whitespace,
# then title-case each name. Missing customers remain <NA>.
# ------------------------------------------------------------------
customer_text = df["customer"].astype("string")
case1 = customer_text.str.strip().str.title()
case1

0    Scott
1     Ella
2      Raj
3     <NA>
Name: customer, dtype: string

In [24]:
# ------------------------------------------------------------------
# Case 2) Search patterns (contains)
# Case-insensitive regex match on the notes; na=False makes a missing
# note count as "no match" so the mask is usable for row selection.
# ------------------------------------------------------------------
note_text = df["note"].astype("string")
case2_mask = note_text.str.contains(
    r"urgent|follow-up",
    case=False,
    na=False,
)
case2_mask

# Keep only the flagged rows, showing who the note belongs to.
case2 = df.loc[case2_mask, ["customer", "note"]]
case2

0     True
1    False
2     True
3    False
Name: note, dtype: boolean

Unnamed: 0,customer,note
0,Scott,Urgent!!! call back
2,RAJ,follow-up needed


In [25]:
# ------------------------------------------------------------------
# Case 3) Extract structured parts (extract)
# Named groups become the output columns ("user", "domain"). A value
# that does not match the whole pattern yields missing entries in both.
# ------------------------------------------------------------------
email_text = df["email"].astype("string")
case3 = email_text.str.extract(r"^(?P<user>[^@]+)@(?P<domain>[^@]+)$")
case3

Unnamed: 0,user,domain
0,scott,ex.com
1,ella.s,ex.com
2,raj,ex.co
3,,


In [26]:
# ------------------------------------------------------------------
# Case 4) Split values into parts (split/get/expand)
# ------------------------------------------------------------------
# Comma-separated tag strings, as the nullable string dtype.
skills = df["tags"].astype("string")
skills

# 4a: one column per comma-separated piece.
case4a = skills.str.split(pat=",", expand=True)
case4a

# 4b: split into lists, then take only the first piece of each row.
tag_lists = skills.str.split(pat=",")
case4b = tag_lists.str.get(0)
case4b

0    python,sql,r
1     tableau,sql
2            <NA>
3       python,ml
Name: tags, dtype: string

Unnamed: 0,0,1,2
0,python,sql,r
1,tableau,sql,
2,,,
3,python,ml,


0     python
1    tableau
2       <NA>
3     python
Name: tags, dtype: object

In [27]:
# ------------------------------------------------------------------
# Case 5) Standardize values (replace with regex)
# Trim and lower-case the department, then collapse every run of
# hyphens and/or whitespace into a single underscore.
# ------------------------------------------------------------------
dept_lowered = df["dept"].astype("string").str.strip().str.lower()
case5 = dept_lowered.str.replace(r"[-\s]+", "_", regex=True)
case5

0    data_science
1    data_science
2       analytics
3    data_science
Name: dept, dtype: string

In [28]:
# ------------------------------------------------------------------
# Case 6) Build readable labels
# Trim both columns and substitute "unknown" for missing values so the
# concatenation below never produces a missing label.
# ------------------------------------------------------------------
name_clean = (
    df["customer"]
    .astype("string")
    .str.strip()
    .fillna("unknown")
)
dept_clean = (
    df["dept"]
    .astype("string")
    .str.strip()
    .fillna("unknown")
)

# Join the two cleaned columns element-wise as "<name> | <dept>".
case6 = name_clean.str.cat(dept_clean, sep=" | ")

name_clean
dept_clean
case6

0      Scott
1       ella
2        RAJ
3    unknown
Name: customer, dtype: string

0     Data Science
1     data-science
2        Analytics
3    Data  Science
Name: dept, dtype: string

0       Scott | Data Science
1        ella | data-science
2            RAJ | Analytics
3    unknown | Data  Science
Name: customer, dtype: string

In [29]:
# ------------------------------------------------------------------
# Case 7) Turn text into features
# One 0/1 indicator column per distinct comma-separated tag; a row
# with missing tags gets 0 in every column.
# ------------------------------------------------------------------
tag_text = df["tags"].astype("string")
case7 = tag_text.str.get_dummies(sep=",")
case7

Unnamed: 0,ml,python,r,sql,tableau
0,0,1,1,1,0
1,0,0,0,1,1
2,0,0,0,0,0
3,1,1,0,0,0


In [30]:
# ------------------------------------------------------------------
# Cleaning column names using Index.str
# ------------------------------------------------------------------
# Work on a copy and give it untidy headers to clean up.
df2 = df.copy()
df2.columns = [" Customer Name ", "E-Mail ", " Dept", "Tags ", "Note "]

# The same .str accessor works on an Index: trim, lower-case, then turn
# each run of hyphens/whitespace into a single underscore.
headers_trimmed_lower = pd.Index(df2.columns).str.strip().str.lower()
df2.columns = headers_trimmed_lower.str.replace(r"[-\s]+", "_", regex=True)

df2.columns.tolist()

['customer_name', 'e_mail', 'dept', 'tags', 'note']