<a href="https://colab.research.google.com/github/noaihere/encyclepedia_python/blob/main/strings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
import re
import pandas as pd
import numpy as np

In [75]:
# Toy frame: three "<Mon> <YYYY>" labels followed by a "Total" summary row.
df = pd.DataFrame(
    data={
        "date": ["May 2020", "Jan 2019", "Feb 2015", "Total"],
    }
)
df  # last expression of the cell, rendered as the cell output

Unnamed: 0,date
0,May 2020
1,Jan 2019
2,Feb 2015
3,Total


In [74]:
# str.split: break each "<month> <year>" label into two new columns.
# n=1 caps the split at the first space, so at most two columns come back
# even when a label contains extra spaces (otherwise the two-column
# assignment would fail). Rows without a space (e.g. "Total") get a
# missing value in the second column.

df[["month", "year"]] = df["date"].str.split(" ", n=1, expand=True)
df

Unnamed: 0,date,month,year
0,May 2020,May,2020
1,Jan 2019,Jan,2019
2,Feb 2015,Feb,2015


In [76]:
# Series.isin builds a boolean membership mask; the unary ~ operator inverts
# it, so only rows whose label is NOT one of the summary names are kept.

df.loc[
    ~df["date"].isin(["Total", "Grand totals", "Grand Total"])
]

Unnamed: 0,date
0,May 2020
1,Jan 2019
2,Feb 2015


In [59]:
# Sample frame for the regex examples. The last name is the literal string
# "nan" (not a real missing value).
df = pd.DataFrame(
    {
        "name": ["ten1 ho", "may sixt", "nan"],
        "email": ["tenho@gmail.com", "may-sixt@gmail.com", "jlin.z@gmail.com"],
        "product": ["a10 (cm)", "b10 (m)", "c10 (kg)"],
    }
)
df

Unnamed: 0,name,email,product
0,ten1 ho,tenho@gmail.com,a10 (cm)
1,may sixt,may-sixt@gmail.com,b10 (m)
2,,jlin.z@gmail.com,c10 (kg)


In [12]:
# str.match works like re.match: the pattern must match starting at the
# beginning of each string. The pattern is written as a raw string so the
# backslash in \w reaches the regex engine untouched instead of being parsed
# as an (invalid) Python string escape, which triggers a warning.

df['name'].str.match(r'\w+[1]')

0     True
1    False
2    False
Name: name, dtype: bool

In [42]:
# str.extract works like re.search with capture groups: every group in the
# pattern becomes one column of the result.

print(df['email'].str.extract('([A-Za-z-.]+)@'))
print(df['email'].str.extract('@([A-Za-z-.]+)'))
df[['first email', 'provider']] = df['email'].str.extract('([A-Za-z-.]+)@([A-Za-z-.]+)')

# Inside a character class a leading ^ negates it: [^(] is "any character except (".
# Group 1 is the text before the parenthesis, group 2 the optional "(unit)" suffix.
df[["product_name", "unit"]] = df["product"].str.extract(
            r"([^(]+)(\([^\)]+\))?$")

# str.strip: group 1 also swallows the space that precedes "(", so trim the
# surrounding whitespace from the product name; for the unit, strip the
# enclosing parentheses.
df["product_name"] = df["product_name"].str.strip()
df["unit"] = df["unit"].str.strip("()")
df

          0
0     tenho
1  may-sixt
2    jlin.z
           0
0  gmail.com
1  gmail.com
2  gmail.com


Unnamed: 0,name,email,product,first email,provider,product_name,unit
0,ten1 ho,tenho@gmail.com,a10 (cm),tenho,gmail.com,a10,cm
1,may sixt,may-sixt@gmail.com,b10 (m),may-sixt,gmail.com,b10,m
2,jess lingard,jlin.z@gmail.com,c10 (kg),jlin.z,gmail.com,c10,kg


In [43]:
# str.replace works like re.sub, but only when regex=True is passed.
# Since pandas 2.0 the default is regex=False, which treats the pattern as a
# literal string and would leave every value unchanged here.

df["product"].str.replace(r"\(\w+\)", "(t)", regex=True)

0    a10 (t)
1    b10 (t)
2    c10 (t)
Name: product, dtype: object

In [60]:
# Series.replace also accepts regular expressions when regex=True.
# The result is assigned back to the column rather than using inplace=True on
# df["name"]: an in-place call on a column selection is chained assignment,
# which warns on recent pandas and does not update df under Copy-on-Write.
# Here the literal string "nan" and empty/whitespace-only names become real NaN.
df["name"] = df["name"].replace(r"^(nan|\s*)$", np.nan, regex=True)

# A dictionary maps several patterns to their replacements in one call.
# Patterns are raw strings so the backslashes reach the regex engine intact.
df["name"] = df["name"].replace({"sixt": "six",
                                 r"\w+\d{1}": "ten"}, regex=True)
df

Unnamed: 0,name,email,product
0,ten ho,tenho@gmail.com,a10 (cm)
1,may six,may-sixt@gmail.com,b10 (m)
2,,jlin.z@gmail.com,c10 (kg)


In [70]:
# Series.str.contains behaves like re.search: it is True wherever the
# pattern occurs anywhere inside the string ("|" means "either alternative").

df = pd.DataFrame(
    {
        "Country": ["Japan tet", "America tet", "et Asia"],
        "Month": ["Jul", "Dec", "Feb"],
    }
)
print(df["Country"].str.contains("Japan|Asia"))

# Index label of the first row whose first column mentions "America".
america_index = df.index[
    df.iloc[:, 0].str.contains("America")
].tolist()[0]
print(america_index)

0     True
1    False
2     True
Name: Country, dtype: bool
1


In [79]:
# Use Series.apply with a helper built on re.search capture groups to turn
# quarter labels such as "Q1 2020" into the first calendar day of that quarter.

df = pd.DataFrame({'date':['Q1 2020','Q2 2019','Q3 2015']})

# Group 1: the quarter digit (1-4); group 2: the four-digit year.
quarter_re = re.compile(r"^Q([1234]) (\d{4})$")

def quarter_to_date(quarter):
    """Convert a quarter label like 'Q3 2017' to an ISO date string.

    Args:
        quarter: Text of the form 'Q<1-4> <YYYY>'.

    Returns:
        The string 'YYYY-MM-01', where MM is the first month of the quarter
        (Q1 -> 01, Q2 -> 04, Q3 -> 07, Q4 -> 10).

    Raises:
        ValueError: If ``quarter`` does not match the expected format.
    """
    match = quarter_re.search(quarter)
    if match is None:
        # Fail with a descriptive message instead of an AttributeError
        # raised from calling .groups() on None.
        raise ValueError("not a quarter label like 'Q3 2017': {!r}".format(quarter))
    quarter_number, year = match.groups()
    first_month = int(quarter_number) * 3 - 2
    return "{}-{:02d}-01".format(year, first_month)

df["date"] = df["date"].apply(quarter_to_date)
# Parse the ISO strings and keep only the date part (drops the time component).
df["date"] = pd.to_datetime(df["date"]).dt.date
df

Unnamed: 0,date
0,2020-01-01
1,2019-04-01
2,2015-07-01
