# String Manipulations

In [1]:
# Set display of Notebook to Full Size
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Functions

### Function to get First and Last Names

In [2]:
def get_first_last_name(s):
    INVALID_NAME_PARTS = ["mr ", "ms ", "mrs",
        "dr ", "jr ", "sir "]
    parts = s.lower().replace(".","").strip().split()
    parts = [p for p in parts
        if p not in INVALID_NAME_PARTS]
    if len(parts)==0:
        raise ValueError(
             "Name %s is formatted wrong " % s)
    first, last = parts[0], parts[-1]
    first = first[0].upper() + first[1:]
    last = last[0].upper() + last[1:]
    return first, last

###  Function to format Age

In [3]:
def format_age(s):
    chars = list(s) # list of characters
    digit_chars = [c for c in chars if c.isdigit()]
    return int("".join(digit_chars))

### Function to format Dates

In [4]:
def format_date(s):
    MONTH_MAP = {
        "jan": "01", "feb": "02", "may": "03"}
    s = s.strip().lower().replace(",", "")
    m, d, y = s.split()
    if len(y) == 2: y = "19" + y
    if len(d) == 1: d = "0" + d
    return y + "-" + MONTH_MAP[m[:3]] + "-" + d

## Main Section

In [5]:
# Import Packages as needed
import pandas as pd
import os

In [6]:
# Read inpuut files and load data to dataframes
df = pd.read_csv(os.path.join('chapter04_string_manipulation_file.tsv'), sep="|")

In [7]:
# Info on the input dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
Name         3 non-null object
Age          3 non-null object
Birthdate    3 non-null object
dtypes: object(3)
memory usage: 152.0+ bytes


In [8]:
# Describe the input dataframe
df.describe()

Unnamed: 0,Name,Age,Birthdate
count,3,3,3
unique,3,3,3
top,Billy Ray Joel,74 Years,may 24 1941
freq,1,1,1


In [9]:
# Apply Functions to create new Attributes
df["First Name"] = df["Name"].apply(lambda s: get_first_last_name(s)[0])
df["Last Name"] = df["Name"].apply(lambda s: get_first_last_name(s)[1])
df["Age"] = df["Age"].apply(format_age)
df["Birthdate"] = df["Birthdate"].apply(format_date).astype(pd.datetime)

In [10]:
print(df)

                Name  Age   Birthdate First Name Last Name
0  Ms. Janice Joplin   65  1943-01-19         Ms    Joplin
1         Bob Dylan    74  1941-03-24        Bob     Dylan
2     Billy Ray Joel   66  1941-02-09      Billy      Joel
