In [None]:
# Conclusion
# 1. Series.str, not for DataFrame
# 2. Series.str.xxx(), get str attribute then execute method
# 3. str.xxx() is not python built-in function, it's pandas method, but quite similar
# 4. Series.str.xxx() is vectorized, so it's faster than for loop
# 5. Series.str.xxx() is not in-place, so need to assign to a new column
# 6. Series.str.xxx() is not for DataFrame, so need to use DataFrame.applymap() or DataFrame.apply() to apply to DataFrame
# 7. Series.str is only for string type column, if not, need to convert to string type first, here is how to convert to string type
#    sales_df['product'] = sales_df['product'].astype(str)

import pandas as pd
sales_df = pd.read_csv("../dataset/sales_data.csv")
sales_df.head()

Unnamed: 0,Date,Region,Product,Sales,Profit,Cost,Customer_Segment
0,2024-01-01,North,Widget A,500,100,400,Enterprise
1,2024-01-02,South,Widget B,300,60,240,SMB
2,2024-01-03,West,Widget A,700,150,550,Enterprise
3,2024-01-04,East,Widget C,200,30,170,SMB
4,2024-01-05,North,Widget B,450,90,360,Enterprise


In [9]:
# 1. chagne to Chinese Date
def convert_to_chinese_date(row):
    year, month, day = row["Date"].split("-")
    return f"{year}年{month}月{day}日"
sales_df["Chinese_Date"] = sales_df.apply(convert_to_chinese_date, axis=1)
sales_df.head()

Unnamed: 0,Date,Region,Product,Sales,Profit,Cost,Customer_Segment,Chinese_Date
0,2024-01-01,North,Widget A,500,100,400,Enterprise,2024年01月01日
1,2024-01-02,South,Widget B,300,60,240,SMB,2024年01月02日
2,2024-01-03,West,Widget A,700,150,550,Enterprise,2024年01月03日
3,2024-01-04,East,Widget C,200,30,170,SMB,2024年01月04日
4,2024-01-05,North,Widget B,450,90,360,Enterprise,2024年01月05日


In [10]:
# 2. remove chinese character in Chinese_Date with under_score
# chain operation
sales_df["Chinese_Date_2"] = sales_df["Chinese_Date"].str.replace("年", "_").str.replace("月", "_").str.replace("日", "")
sales_df.head()

Unnamed: 0,Date,Region,Product,Sales,Profit,Cost,Customer_Segment,Chinese_Date,Chinese_Date_2
0,2024-01-01,North,Widget A,500,100,400,Enterprise,2024年01月01日,2024_01_01
1,2024-01-02,South,Widget B,300,60,240,SMB,2024年01月02日,2024_01_02
2,2024-01-03,West,Widget A,700,150,550,Enterprise,2024年01月03日,2024_01_03
3,2024-01-04,East,Widget C,200,30,170,SMB,2024年01月04日,2024_01_04
4,2024-01-05,North,Widget B,450,90,360,Enterprise,2024年01月05日,2024_01_05


In [12]:
# 3. regular expression
sales_df["Chinese_Date"].str.replace("[年月日]", "_").head()

0    2024年01月01日
1    2024年01月02日
2    2024年01月03日
3    2024年01月04日
4    2024年01月05日
Name: Chinese_Date, dtype: object