## Letter Casing and Whitespace

In [1]:
import pandas as pd

In [2]:
inspections = pd.read_csv("chicago_food_inspections.csv")
inspections.head()

Unnamed: 0,Name,Risk
0,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
1,JET'S PIZZA,Risk 2 (Medium)
2,ROOM 1520,Risk 3 (Low)
3,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
4,CHARTWELLS,Risk 1 (High)


In [3]:
inspections["Name"].head()

0     MARRIOT MARQUIS CHICAGO   
1                   JET'S PIZZA 
2                     ROOM 1520 
3      MARRIOT MARQUIS CHICAGO  
4                  CHARTWELLS   
Name: Name, dtype: object

In [4]:
inspections["Name"].head().values

array([' MARRIOT MARQUIS CHICAGO   ', " JET'S PIZZA ", '   ROOM 1520 ',
       '  MARRIOT MARQUIS CHICAGO  ', ' CHARTWELLS   '], dtype=object)

In [5]:
dessert = "  cheesecake  "
dessert.lstrip()

'cheesecake  '

In [6]:
dessert.rstrip()

'  cheesecake'

In [7]:
dessert.strip()

'cheesecake'

In [8]:
inspections["Name"].str

<pandas.core.strings.accessor.StringMethods at 0x7fadb201f910>

In [9]:
inspections["Name"].str.lstrip().head()

0    MARRIOT MARQUIS CHICAGO   
1                  JET'S PIZZA 
2                    ROOM 1520 
3     MARRIOT MARQUIS CHICAGO  
4                 CHARTWELLS   
Name: Name, dtype: object

In [10]:
inspections["Name"].str.rstrip().head()

0      MARRIOT MARQUIS CHICAGO
1                  JET'S PIZZA
2                    ROOM 1520
3      MARRIOT MARQUIS CHICAGO
4                   CHARTWELLS
Name: Name, dtype: object

In [11]:
inspections["Name"].str.strip().head()

0    MARRIOT MARQUIS CHICAGO
1                JET'S PIZZA
2                  ROOM 1520
3    MARRIOT MARQUIS CHICAGO
4                 CHARTWELLS
Name: Name, dtype: object

In [12]:
inspections["Name"] = inspections["Name"].str.strip()

In [13]:
inspections.columns

Index(['Name', 'Risk'], dtype='object')

In [14]:
# Trim leading and trailing whitespace in every column. Each column is
# overwritten under its existing label, so the set of columns is unchanged
# while the frame is being iterated.
for label, values in inspections.items():
    inspections[label] = values.str.strip()

## Lowercase and Uppercase

In [15]:
inspections["Name"].str.lower().head()

0    marriot marquis chicago
1                jet's pizza
2                  room 1520
3    marriot marquis chicago
4                 chartwells
Name: Name, dtype: object

In [16]:
# Small lowercase sample Series used to demonstrate str.upper().
steak_cuts = ["porterhouse", "filet mignon", "ribeye"]
steaks = pd.Series(steak_cuts)
steaks

0     porterhouse
1    filet mignon
2          ribeye
dtype: object

In [17]:
steaks.str.upper()

0     PORTERHOUSE
1    FILET MIGNON
2          RIBEYE
dtype: object

In [18]:
inspections["Name"].str.capitalize().head()

0    Marriot marquis chicago
1                Jet's pizza
2                  Room 1520
3    Marriot marquis chicago
4                 Chartwells
Name: Name, dtype: object

In [19]:
inspections["Name"].str.title().head()

0    Marriot Marquis Chicago
1                Jet'S Pizza
2                  Room 1520
3    Marriot Marquis Chicago
4                 Chartwells
Name: Name, dtype: object

## String Slicing

In [20]:
inspections["Risk"].head()

0      Risk 1 (High)
1    Risk 2 (Medium)
2       Risk 3 (Low)
3      Risk 1 (High)
4      Risk 1 (High)
Name: Risk, dtype: object

In [21]:
len(inspections)

153810

In [22]:
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All', nan],
      dtype=object)

In [23]:
# Discard the rows whose Risk value is missing. The result is rebound to the
# same name rather than mutated with inplace=True, which keeps the step
# explicit and chainable and gives the same frame when the cell is re-run.
inspections = inspections.dropna(subset = ["Risk"])

In [24]:
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All'],
      dtype=object)

In [25]:
# Relabel the catch-all "All" category so every risk level follows the same
# "Risk <number> (<label>)" format as the other values.
# Only the Risk column is touched: DataFrame.replace on the whole frame would
# also rewrite any other cell whose entire value is "All" (for example, an
# establishment whose Name is exactly "All").
inspections["Risk"] = inspections["Risk"].replace(
    to_replace = "All", value = "Risk 4 (Extreme)"
)

In [26]:
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)',
       'Risk 4 (Extreme)'], dtype=object)

### String Slicing and Character Replacement

In [27]:
inspections["Risk"].str.slice(5, 6).head()

0    1
1    2
2    3
3    1
4    1
Name: Risk, dtype: object

In [28]:
inspections["Risk"].str[5:6].head()

0    1
1    2
2    3
3    1
4    1
Name: Risk, dtype: object

In [29]:
inspections["Risk"].str.slice(8).head()

0      High)
1    Medium)
2       Low)
3      High)
4      High)
Name: Risk, dtype: object

In [30]:
inspections["Risk"].str[8:].head()

0      High)
1    Medium)
2       Low)
3      High)
4      High)
Name: Risk, dtype: object

In [31]:
inspections["Risk"].str.slice(8, -1).head()

0      High
1    Medium
2       Low
3      High
4      High
Name: Risk, dtype: object

In [32]:
inspections["Risk"].str[8:-1].head()

0      High
1    Medium
2       Low
3      High
4      High
Name: Risk, dtype: object

## Boolean Methods

In [33]:
"Pizza" in "Jet's Pizza"

True

In [34]:
"pizza" in "Jet's Pizza"

False

In [35]:
inspections["Name"].str.lower().str.contains("pizza").head()

0    False
1     True
2    False
3    False
4    False
Name: Name, dtype: bool

In [36]:
# Lowercase the names first so the substring check ignores casing, then use
# the resulting Boolean Series to keep only the matching rows.
lowercase_names = inspections["Name"].str.lower()
has_pizza = lowercase_names.str.contains("pizza")
inspections.loc[has_pizza]

Unnamed: 0,Name,Risk
1,JET'S PIZZA,Risk 2 (Medium)
19,NANCY'S HOME OF STUFFED PIZZA,Risk 1 (High)
27,"NARY'S GRILL & PIZZA ,INC.",Risk 1 (High)
29,NARYS GRILL & PIZZA,Risk 1 (High)
68,COLUTAS PIZZA,Risk 1 (High)
...,...,...
153756,ANGELO'S STUFFED PIZZA CORP,Risk 1 (High)
153764,COCHIAROS PIZZA #2,Risk 1 (High)
153772,FERNANDO'S MEXICAN GRILL & PIZZA,Risk 1 (High)
153788,REGGIO'S PIZZA EXPRESS,Risk 1 (High)


In [37]:
inspections["Name"].str.lower().str.startswith("tacos").head()

0    False
1    False
2    False
3    False
4    False
Name: Name, dtype: bool

In [38]:
# Rows whose lowercased name begins with "tacos".
lowercase_names = inspections["Name"].str.lower()
starts_with_tacos = lowercase_names.str.startswith("tacos")
inspections.loc[starts_with_tacos]

Unnamed: 0,Name,Risk
69,TACOS NIETOS,Risk 1 (High)
556,TACOS EL TIO 2 INC.,Risk 1 (High)
675,TACOS DON GABINO,Risk 1 (High)
958,TACOS EL TIO 2 INC.,Risk 1 (High)
1036,TACOS EL TIO 2 INC.,Risk 1 (High)
...,...,...
143587,TACOS DE LUNA,Risk 1 (High)
144026,TACOS GARCIA,Risk 1 (High)
146174,Tacos Place's 1,Risk 1 (High)
147810,TACOS MARIO'S LIMITED,Risk 1 (High)


In [39]:
# Rows whose lowercased name finishes with "tacos".
lowercase_names = inspections["Name"].str.lower()
ends_with_tacos = lowercase_names.str.endswith("tacos")
inspections.loc[ends_with_tacos]

Unnamed: 0,Name,Risk
382,LAZO'S TACOS,Risk 1 (High)
569,LAZO'S TACOS,Risk 1 (High)
2652,FLYING TACOS,Risk 3 (Low)
3250,JONY'S TACOS,Risk 1 (High)
3812,PACO'S TACOS,Risk 1 (High)
...,...,...
151121,REYES TACOS,Risk 1 (High)
151318,EL MACHO TACOS,Risk 1 (High)
151801,EL MACHO TACOS,Risk 1 (High)
153087,RAYMOND'S TACOS,Risk 1 (High)


## Splitting Strings

In [40]:
customers = pd.read_csv("customers.csv")
customers.head()

Unnamed: 0,Name,Address
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire..."
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,..."
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495"
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991"
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7..."


In [41]:
customers["Name"].str.len().head()

0    13
1    17
2    15
3    19
4    14
Name: Name, dtype: int64

In [42]:
phone_number = "555-123-4567"
phone_number.split("-")

['555', '123', '4567']

In [43]:
# The two lines below are equivalent: the delimiter can be passed to
# str.split either by keyword (pat) or as the first positional argument.
# Only the second line's result is displayed, because a notebook cell
# renders just the value of its last expression.
customers["Name"].str.split(pat = " ").head()
customers["Name"].str.split(" ").head()

0           [Frank, Manning]
1       [Elizabeth, Johnson]
2         [Donald, Stephens]
3    [Michael, Vincent, III]
4          [Jasmine, Zamora]
Name: Name, dtype: object

In [44]:
customers["Name"].str.split(" ").str.len().head()

0    2
1    2
2    2
3    3
4    2
Name: Name, dtype: int64

In [45]:
customers["Name"].str.split(pat = " ", n = 1).head()

0          [Frank, Manning]
1      [Elizabeth, Johnson]
2        [Donald, Stephens]
3    [Michael, Vincent III]
4         [Jasmine, Zamora]
Name: Name, dtype: object

In [46]:
customers["Name"].str.split(pat = " ", n = 1).str.get(0).head()

0        Frank
1    Elizabeth
2       Donald
3      Michael
4      Jasmine
Name: Name, dtype: object

In [47]:
customers["Name"].str.split(pat = " ", n = 1).str.get(1).head()

0        Manning
1        Johnson
2       Stephens
3    Vincent III
4         Zamora
Name: Name, dtype: object

In [48]:
customers["Name"].str.split(pat = " ", n = 1).str.get(-1).head()

0        Manning
1        Johnson
2       Stephens
3    Vincent III
4         Zamora
Name: Name, dtype: object

In [49]:
customers["Name"].str.split(
    pat = " ", n = 1, expand = True
).head()

Unnamed: 0,0,1
0,Frank,Manning
1,Elizabeth,Johnson
2,Donald,Stephens
3,Michael,Vincent III
4,Jasmine,Zamora


In [50]:
# Split each name at its first space only (n = 1), so everything after the
# first word stays together in the second column, then store the two
# resulting columns under descriptive labels.
name_parts = customers["Name"].str.split(pat = " ", n = 1, expand = True)
customers[["First Name", "Last Name"]] = name_parts

In [51]:
customers

Unnamed: 0,Name,Address,First Name,Last Name
0,Frank Manning,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,Elizabeth Johnson,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,Donald Stephens,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,Michael Vincent III,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,Jasmine Zamora,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora
...,...,...,...,...
9956,Dana Browning,"762 Andrew Views Apt. 254, North Paul, New Mex...",Dana,Browning
9957,Amanda Anderson,"44188 Day Crest Apt. 901, Lake Marcia, Maine, ...",Amanda,Anderson
9958,Eric Davis,"73015 Michelle Squares, Watsonville, West Virg...",Eric,Davis
9959,Taylor Hernandez,"129 Keith Greens, Haleyfurt, Oklahoma, 98916",Taylor,Hernandez


In [52]:
# The combined Name column is redundant now that "First Name" and "Last Name"
# exist. Rebind the result instead of mutating with inplace=True so the
# change to customers is visible as an explicit assignment.
customers = customers.drop(labels = "Name", axis = "columns")

In [53]:
customers.head()

Unnamed: 0,Address,First Name,Last Name
0,"6461 Quinn Groves, East Matthew, New Hampshire...",Frank,Manning
1,"1360 Tracey Ports Apt. 419, Kyleport, Vermont,...",Elizabeth,Johnson
2,"19120 Fleming Manors, Prestonstad, Montana, 23495",Donald,Stephens
3,"441 Olivia Creek, Jimmymouth, Georgia, 82991",Michael,Vincent III
4,"4246 Chelsey Ford Apt. 310, Karamouth, Utah, 7...",Jasmine,Zamora


## Coding Challenge

In [54]:
customers["Address"].str.split(",").head()

0    [6461 Quinn Groves,  East Matthew,  New Hampsh...
1    [1360 Tracey Ports Apt. 419,  Kyleport,  Vermo...
2    [19120 Fleming Manors,  Prestonstad,  Montana,...
3    [441 Olivia Creek,  Jimmymouth,  Georgia,  82991]
4    [4246 Chelsey Ford Apt. 310,  Karamouth,  Utah...
Name: Address, dtype: object

In [55]:
customers["Address"].str.split(", ").head()

0    [6461 Quinn Groves, East Matthew, New Hampshir...
1    [1360 Tracey Ports Apt. 419, Kyleport, Vermont...
2    [19120 Fleming Manors, Prestonstad, Montana, 2...
3       [441 Olivia Creek, Jimmymouth, Georgia, 82991]
4    [4246 Chelsey Ford Apt. 310, Karamouth, Utah, ...
Name: Address, dtype: object

In [56]:
customers["Address"].str.split(", ", expand = True).head()

Unnamed: 0,0,1,2,3
0,6461 Quinn Groves,East Matthew,New Hampshire,16656
1,1360 Tracey Ports Apt. 419,Kyleport,Vermont,31924
2,19120 Fleming Manors,Prestonstad,Montana,23495
3,441 Olivia Creek,Jimmymouth,Georgia,82991
4,4246 Chelsey Ford Apt. 310,Karamouth,Utah,76252


In [57]:
# Break each address on ", " (comma followed by a space, so the pieces carry
# no leading whitespace) and attach the pieces as four labelled columns.
new_cols = ["Street", "City", "State", "Zip"]
address_parts = customers["Address"].str.split(pat = ", ", expand = True)
customers[new_cols] = address_parts

In [58]:
customers.drop(labels = "Address", axis = "columns").head()

Unnamed: 0,First Name,Last Name,Street,City,State,Zip
0,Frank,Manning,6461 Quinn Groves,East Matthew,New Hampshire,16656
1,Elizabeth,Johnson,1360 Tracey Ports Apt. 419,Kyleport,Vermont,31924
2,Donald,Stephens,19120 Fleming Manors,Prestonstad,Montana,23495
3,Michael,Vincent III,441 Olivia Creek,Jimmymouth,Georgia,82991
4,Jasmine,Zamora,4246 Chelsey Ford Apt. 310,Karamouth,Utah,76252


In [59]:
del customers["Address"]

In [60]:
customers.tail()

Unnamed: 0,First Name,Last Name,Street,City,State,Zip
9956,Dana,Browning,762 Andrew Views Apt. 254,North Paul,New Mexico,28889
9957,Amanda,Anderson,44188 Day Crest Apt. 901,Lake Marcia,Maine,37378
9958,Eric,Davis,73015 Michelle Squares,Watsonville,West Virginia,3933
9959,Taylor,Hernandez,129 Keith Greens,Haleyfurt,Oklahoma,98916
9960,Sherry,Nicholson,355 Griffin Valley,Davidtown,New Mexico,17581


## A Note on Regular Expressions

In [61]:
customers["Street"].head()

0             6461 Quinn Groves
1    1360 Tracey Ports Apt. 419
2          19120 Fleming Manors
3              441 Olivia Creek
4    4246 Chelsey Ford Apt. 310
Name: Street, dtype: object

In [62]:
# Replace every run of four or more consecutive digits in the street with "*".
# The pattern is a raw string so the backslash in \d reaches the regex engine
# untouched; in a plain string literal "\d" is an invalid escape sequence,
# which newer Python versions flag with a warning.
customers["Street"].str.replace(r"\d{4,}", "*", regex = True).head()

0             * Quinn Groves
1    * Tracey Ports Apt. 419
2           * Fleming Manors
3           441 Olivia Creek
4    * Chelsey Ford Apt. 310
Name: Street, dtype: object