In [1]:
# Imports
import pandas as pd
import numpy as np

# `.str` attribute in Pandas Data Structure
Provides vectorised string operations on a Series or Index (for example, a single DataFrame column); missing values such as `None` are passed through instead of raising an error
1. Alignment: `ljust()`, `rjust()`, `center()`, `strip()`, `rstrip()`
2. Find words: `find()`, `rfind()`, `index()`, `rindex()`, `startswith()`, `endswith()`
3. Modify words: `capitalize()`, `swapcase()`, `translate()`, `lower()`
4. Test: `isupper()`, `istitle()`
5. Tokenize: `split()`, `partition()`
6. Regex: `count()`, `replace()`, `match()`

In [3]:
# Capitalise every name in the list.
# A plain-Python pass fails on the missing entry (None has no
# .capitalize()), so fall back to the vectorised `.str` accessor,
# which leaves missing values untouched.
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
try:
    tuple(raw.capitalize() for raw in data)
except AttributeError:
    names = pd.Series(data)
    print(names.str.capitalize())

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object


In [5]:
# Sample Series of full names used by the `.str` examples
monte = pd.Series([
    'Graham Chapman',
    'John Cleese',
    'Terry Gilliam',
    'Eric Idle',
    'Terry Jones',
    'Michael Palin',
])

In [11]:
# Element-wise string methods; these three results are computed and then
# discarded, because only a cell's final expression is displayed
monte.str.lower()
monte.str.startswith('T')
monte.str.split()

# first name = the leading run of letters, pulled out by the single capture group
print(monte.str.extract(pat='([A-Za-z]+)'))
# names that start and end with a consonant: the first character must not be
# an upper-case vowel and the last must not be a lower-case vowel
print(monte.str.findall(pat=r'^[^AEIOU].*[^aeiou]$'))

# Indexing
monte.str.slice(0, 3)        # first three characters of every name
monte.str.split().str[-1]    # last whitespace-separated token of every name

         0
0   Graham
1     John
2    Terry
3     Eric
4    Terry
5  Michael
0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object


0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

# `get_dummies()` to build a 0/1 indicator DataFrame from a column whose values are joined by a separator

In [12]:
# Pair every name with a pipe-separated string of codes; this is the
# kind of column get_dummies() can split into one column per code
full_monte = pd.DataFrame(dict(
    name=monte,
    info=['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D'],
))
full_monte

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D


In [13]:
# One 0/1 indicator column per distinct code found in the '|'-separated `info` strings
x = full_monte.loc[:, 'info'].str.get_dummies(sep='|')
x

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


# Mock Test

In [42]:
# Attempt a plain read of the recipe dump; if pandas rejects the content,
# print the message instead of letting the traceback end the cell
try:
    recipes = pd.read_json(path_or_buf='recipeitems-latest.json')
except ValueError as err:
    print("ValueError:", err)

ValueError: Expected object or value


In [28]:
%%bash
# cannot be run anymore as file is non-existent
# -O saves the download under its remote file name; without it curl streams
# the compressed bytes to stdout and gunzip has no file to unpack
curl -O http://openrecipes.s3.amazonaws.com/recipeitems-latest.json.gz
gunzip recipeitems-latest.json.gz

‹ 3[          

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0    20    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100    20  100    20    0     0     28      0 --:--:-- --:--:-- --:--:-- 20000


In [None]:
# cannot be run as file is no longer there
# Assume that the ingredient is a field that has many words indicating the ingredients
# find the recipe name that has the longest ingredient list
# (idxmax returns an index label, so it pairs correctly with the label-based .loc lookup)
recipes.loc[recipes.ingredients.str.len().idxmax(), 'name']

# find the number of recipes that require cinnamon
# (na=False counts a missing ingredient string as "no match")
recipes.ingredients.str.contains('[Cc]innamon', na=False).sum()

# spices to look for; illustrative list, it only needs to contain every
# name used in the query below
spice_list = ['salt', 'pepper', 'oregano', 'sage', 'parsley',
              'rosemary', 'tarragon', 'thyme', 'paprika', 'cumin']

# build a dataframe where for each row (recipe), whether the ingredient (column) is present
# case=False gives the case-insensitive match; a regex flag passed as the second
# positional argument would land in `case` and leave the match case-sensitive.
# na=False keeps every column strictly boolean so the `&` query below is well defined.
spice_df = pd.DataFrame({spice: recipes.ingredients.str.contains(spice, case=False, na=False)
                         for spice in spice_list})
# then get those recipes that require parsley, paprika and tarragon and find the number
selection = spice_df.query('parsley & paprika & tarragon')
len(selection)