# Chapter 6: Working with text data

In [1]:
import pandas as pd

## 6.1 Letter casing and whitespace

In [2]:
# Load the inspections CSV; the path is relative to the working directory.
inspections = pd.read_csv('data/ch06/chicago_food_inspections.csv')
# A bare last expression displays the loaded DataFrame.
inspections

Unnamed: 0,Name,Risk
0,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
1,JETS PIZZA,Risk 2 (Medium)
2,ROOM 1520,Risk 3 (Low)
3,MARRIOT MARQUIS CHICAGO,Risk 1 (High)
4,CHARTWELLS,Risk 1 (High)
...,...,...
153805,WOLCOTT'S,Risk 1 (High)
153806,DUNKIN DONUTS/BASKIN-ROBBINS,Risk 2 (Medium)
153807,Cafe 608,Risk 1 (High)
153808,mr.daniel's,Risk 1 (High)


In [3]:
# Preview the first five values of the Name column.
inspections["Name"].head()

0     MARRIOT MARQUIS CHICAGO   
1                    JETS PIZZA 
2                     ROOM 1520 
3      MARRIOT MARQUIS CHICAGO  
4                  CHARTWELLS   
Name: Name, dtype: object

In [4]:
# .values returns the underlying array; its repr quotes each string, which
# makes leading and trailing whitespace visible.
inspections["Name"].head().values

array([' MARRIOT MARQUIS CHICAGO   ', ' JETS PIZZA ', '   ROOM 1520 ',
       '  MARRIOT MARQUIS CHICAGO  ', ' CHARTWELLS   '], dtype=object)

In [5]:
# The .str attribute is a StringMethods accessor; string methods called on it
# are applied to each value in the Series.
inspections["Name"].str

<pandas.core.strings.accessor.StringMethods at 0x2460b766dd0>

In [6]:
# Plain Python string padded with spaces on both sides.
dessert = "   cheesecake    "
# lstrip() removes leading whitespace only.
dessert.lstrip()

'cheesecake    '

In [7]:
# rstrip() removes trailing whitespace only.
dessert.rstrip()

'   cheesecake'

In [8]:
# strip() removes whitespace from both ends.
dessert.strip()

'cheesecake'

In [9]:
# Series counterpart of str.lstrip(): removes leading whitespace from each name.
inspections["Name"].str.lstrip().head()

0    MARRIOT MARQUIS CHICAGO   
1                   JETS PIZZA 
2                    ROOM 1520 
3     MARRIOT MARQUIS CHICAGO  
4                 CHARTWELLS   
Name: Name, dtype: object

In [10]:
# Series counterpart of str.rstrip(): removes trailing whitespace from each name.
inspections["Name"].str.rstrip().head()

0      MARRIOT MARQUIS CHICAGO
1                   JETS PIZZA
2                    ROOM 1520
3      MARRIOT MARQUIS CHICAGO
4                   CHARTWELLS
Name: Name, dtype: object

In [11]:
# Series counterpart of str.strip(): removes whitespace from both ends of each name.
inspections["Name"].str.strip().head()

0    MARRIOT MARQUIS CHICAGO
1                 JETS PIZZA
2                  ROOM 1520
3    MARRIOT MARQUIS CHICAGO
4                 CHARTWELLS
Name: Name, dtype: object

In [12]:
# str.strip() returns a new Series, so assign it back to keep the cleaned names.
inspections["Name"] = inspections["Name"].str.strip()

In [13]:
# Inspect the column labels; they are iterated over in the next step.
inspections.columns

Index(['Name', 'Risk'], dtype='object')

In [14]:
# Apply the same whitespace cleanup to every column instead of naming each one.
# NOTE(review): the .str accessor only works on text values; this relies on
# every column holding strings -- confirm before reusing on other datasets.
for column in inspections.columns:
    inspections[column] = inspections[column].str.strip()

In [15]:
# Lowercase every character of each name.
inspections["Name"].str.lower().head()

0    marriot marquis chicago
1                 jets pizza
2                  room 1520
3    marriot marquis chicago
4                 chartwells
Name: Name, dtype: object

In [16]:
# Small all-lowercase sample Series used to demonstrate upper().
steaks = pd.Series(["porterhouse", "filet mignon", "ribeye"])
steaks

0     porterhouse
1    filet mignon
2          ribeye
dtype: object

In [17]:
# Uppercase every character of each string.
steaks.str.upper()

0     PORTERHOUSE
1    FILET MIGNON
2          RIBEYE
dtype: object

In [18]:
# capitalize() uppercases only the first character of each string and
# lowercases the rest.
inspections["Name"].str.capitalize().head()

0    Marriot marquis chicago
1                 Jets pizza
2                  Room 1520
3    Marriot marquis chicago
4                 Chartwells
Name: Name, dtype: object

In [19]:
# title() capitalizes the first letter of every word in each string.
inspections["Name"].str.title().head()

0    Marriot Marquis Chicago
1                 Jets Pizza
2                  Room 1520
3    Marriot Marquis Chicago
4                 Chartwells
Name: Name, dtype: object

## 6.2 String slicing

In [20]:
# Preview the first five values of the Risk column.
inspections["Risk"].head()

0      Risk 1 (High)
1    Risk 2 (Medium)
2       Risk 3 (Low)
3      Risk 1 (High)
4      Risk 1 (High)
Name: Risk, dtype: object

In [21]:
# Number of rows in the DataFrame.
len(inspections)

153810

In [22]:
# Distinct Risk values; besides the three numbered levels, the result also
# contains 'All' and NaN.
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All', nan],
      dtype=object)

In [23]:
# Drop rows whose Risk is missing; dropna() returns a new DataFrame, so the
# result is reassigned to the same name.
inspections = inspections.dropna(subset=["Risk"])

In [24]:
# Confirm that NaN no longer appears among the distinct Risk values.
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All'],
      dtype=object)

In [25]:
# Relabel the "All" category so it follows the same "Risk N (Label)" format
# as the other Risk values. The nested mapping limits the replacement to the
# Risk column; a bare to_replace="All" is applied to every column and would
# also overwrite any Name value that is exactly "All".
inspections = inspections.replace(
    to_replace={"Risk": {"All": "Risk 4 (Extreme)"}}
)

In [26]:
# Confirm that every distinct Risk value now follows the "Risk N (Label)" format.
inspections["Risk"].unique()

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)',
       'Risk 4 (Extreme)'], dtype=object)