In [41]:
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one, so that cells with several bare expressions show all
# of their results.
InteractiveShell.ast_node_interactivity = 'all'

### Data Transformation

#### 7.2.1 Removing Duplicates

In [42]:
import pandas as pd
import numpy as np

In [43]:
# Small example frame for the de-duplication cells: the final row repeats
# the ('two', 4) pair of the row before it, so exactly one full duplicate exists.
df_i = pd.DataFrame({
    'k1': ['one', 'two'] * 3 + ['two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
df_i

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [44]:
# Boolean mask over the rows: True where a row repeats an earlier row across
# all columns (keep='first' leaves the first occurrence marked False).
df_i.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

> Important point: `drop_duplicates` returns a DataFrame containing only the rows for which the `duplicated` array is `False`.

In [45]:
# Add a running 1..N value column in which every row is distinct, for use by
# the subset-based de-duplication examples that follow.
# The range is sized from the frame itself rather than hard-coded, so the
# assignment does not fail with a length-mismatch error if the sample rows change.
df_i['v1'] = range(1, len(df_i) + 1)
df_i

Unnamed: 0,k1,k2,v1
0,one,1,1
1,two,1,2
2,one,2,3
3,two,3,4
4,one,3,5
5,two,4,6
6,two,4,7


In [46]:
# Judge duplicates on 'k1' only: one row survives per distinct k1 value.
df_i.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,1
1,two,1,2


In [47]:
# Judge duplicates on the ('k1', 'k2') pair; the added 'v1' column is ignored
# for the comparison.
df_i.drop_duplicates(subset=['k1', 'k2'])

Unnamed: 0,k1,k2,v1
0,one,1,1
1,two,1,2
2,one,2,3
3,two,3,4
4,one,3,5
5,two,4,6


In [48]:
# Judge duplicates on the ('k2', 'v1') pair; 'v1' differs on every row, so no
# pair repeats and nothing is dropped.
df_i.drop_duplicates(subset=['k2', 'v1'])

Unnamed: 0,k1,k2,v1
0,one,1,1
1,two,1,2
2,one,2,3
3,two,3,4
4,one,3,5
5,two,4,6
6,two,4,7


#### 7.2.2 Transforming data using a function or mapping

In [49]:
# Example frame for the mapping cells: food names with deliberately mixed
# capitalisation ('Pastrami', 'Bacon') alongside a weight in ounces.
df_ii = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon', 'Pastrami',
        'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
df_ii

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [50]:
# Lookup table from a (lower-case) food name to the animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

In [51]:
# Normalise the food names to lower case so that variants such as 'Bacon' and
# 'Pastrami' match the all-lower-case keys of meat_to_animal.
df_ii_lowercased_meat_type = df_ii['food'].str.lower()
df_ii_lowercased_meat_type

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [52]:
# Translate each lower-cased food name through the meat_to_animal dict and
# store the result as a new 'animal' column on df_ii.
df_ii['animal'] = df_ii_lowercased_meat_type.map(meat_to_animal)
df_ii

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [53]:
## We could also have passed a single function that does all the work:
## it lower-cases each food name and then looks it up in meat_to_animal.
df_ii['animal_dup'] = df_ii['food'].map(
    lambda food_name: meat_to_animal[food_name.lower()]
)
df_ii

Unnamed: 0,food,ounces,animal,animal_dup
0,bacon,4.0,pig,pig
1,pulled pork,3.0,pig,pig
2,bacon,12.0,pig,pig
3,Pastrami,6.0,cow,cow
4,corned beef,7.5,cow,cow
5,Bacon,8.0,pig,pig
6,pastrami,3.0,cow,cow
7,honey ham,5.0,pig,pig
8,nova lox,6.0,salmon,salmon
