In [None]:
import pandas as pd
import numpy as np

### 1. How to handle duplicate entries?

In [None]:
# Small demo frame: the last row repeats the (k1, k2) pair of the row before it.
data = pd.DataFrame(
    {
        'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)
data

In [None]:
# Boolean mask: True for every row that repeats an earlier row
# (the first occurrence of each row stays False).
data.duplicated(keep='first')

In [None]:
# Number of duplicated rows. Series.sum() reduces the boolean mask in one
# vectorised step instead of iterating it through Python's built-in sum().
data.duplicated().sum()

In [None]:
# New frame with repeated rows removed (first occurrence kept).
# The result is not assigned, so `data` itself is left as it was.
data.drop_duplicates(keep='first')

In [None]:
# Display `data` again — presumably to show that the unassigned
# drop_duplicates() call did not modify the original frame.
data

In [None]:
# Add a `v1` column to `data` (this mutates the existing frame), then display it.
data['v1'] = [1, 1, 3, 5, 6, 2, 7]
data

In [None]:
# Judge duplicates on the (k2, v1) pair only; k1 is ignored.
data.duplicated(subset=['k2', 'v1'])

In [None]:
# Keep the first row seen for each distinct k1 value.
data.drop_duplicates(subset=['k1'])

In [None]:
# Keep the first row seen for each distinct (k1, k2) combination.
data.drop_duplicates(subset=['k1', 'k2'])

In [None]:
# Same (k1, k2) de-duplication, but retain the last occurrence of each
# combination instead of the first.
data.drop_duplicates(
    subset=['k1', 'k2'],
    keep='last',
)

### 2. Transforming values with a technique called mapping

In [None]:
# Food names (note the inconsistent capitalisation) with a weight in ounces.
data = pd.DataFrame(
    dict(
        food=[
            'bacon', 'pulled pork', 'bacon',
            'Pastrami', 'corned beef', 'Bacon',
            'pastrami', 'honey ham', 'nova lox',
        ],
        ounces=[4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    )
)
data

In [None]:
# Lookup table from a (lower-case) food name to the animal it comes from.
meat_to_animal = dict(
    [
        ('bacon', 'pig'),
        ('pulled pork', 'pig'),
        ('pastrami', 'cow'),
        ('corned beef', 'cow'),
        ('honey ham', 'pig'),
        ('nova lox', 'salmon'),
    ]
)

In [None]:
# Lower-case every food name so that spelling variants such as
# 'Bacon' and 'bacon' become the same string.
lowercased = data.loc[:, 'food'].str.lower()
lowercased

In [None]:
# Overwrite the `food` column with its lower-cased version
# (this mutates `data`), then display the frame.
data['food'] = lowercased
data

In [None]:
# Look each lower-cased food name up in `meat_to_animal` and store the
# result as a new `animal` column, then display the frame.
data['animal'] = lowercased.map(meat_to_animal)
data

### 3. Replacing values

In [None]:
# Float series containing the -999 / -1000 entries that the
# following replace() calls operate on.
data = pd.Series(data=[1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data

In [None]:
# Swap every -999 for 999; returns a new Series.
data.replace(to_replace=-999, value=999)

In [None]:
# Swap several different values for the same replacement value.
data.replace(to_replace=[-999, -1000], value=999)

In [None]:
# Pairwise replacement: each key is replaced by its own value.
data.replace({-999: 999, -1000: 1000})

### 4. Axis Renaming

In [None]:
# 3 x 4 frame holding 0..11, labelled by state (rows) and number words (columns).
data = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

In [None]:
# Row labels of the frame.
data.index

In [None]:
# Column labels of the frame.
data.columns

In [None]:
# Upper-case the row labels in place. The vectorised `.str` accessor replaces
# the per-element lambda that was previously passed to Index.map().
data.index = data.index.str.upper()
data

In [None]:
# Rebuild the 3 x 4 frame with its original mixed-case row labels.
data = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

In [None]:
# Copy of `data` whose column labels are upper-cased; `data` is not modified.
d0 = data.rename(str.upper, axis='columns')
d0

In [None]:
# Copy of `data` with one row label renamed and the word columns
# relabelled as integers; labels missing from a mapping are left alone.
d1 = data.rename(
    index={'Ohio': 'California'},
    columns={
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
    },
)
d1

In [None]:
# Replace all four column labels at once by assigning to `.columns`
# (this mutates `data`).
data.columns = list('abcd')
data

### 5. Discretization and Binning

In [None]:
# Raw ages to be bucketed.
ages = [
    20, 22, 25, 27, 21, 23,
    37, 31, 61, 45, 41, 32,
]
# Bin edges: consecutive pairs delimit one bucket each.
bins = [18, 25, 35, 60, 100]

In [None]:
# Assign every age to one of the intervals delimited by `bins`
# (intervals are closed on the right edge).
cats = pd.cut(x=ages, bins=bins, right=True)
cats

In [None]:
# Integer code of the bin each age fell into (position within cats.categories).
cats.codes

In [None]:
# The distinct bins (categories) produced by pd.cut.
cats.categories

In [None]:
# One human-readable label per bin, in bin order.
groups = [
    'youth',
    'young adult',
    'middle aged',
    'senior',
]

In [None]:
# Same binning as before, but each bin is shown under its name from
# `groups` instead of as an interval.
cats = pd.cut(x=ages, bins=bins, labels=groups)
cats

In [None]:
# 1000 standard-normal samples. A locally seeded Generator is used so that a
# Restart & Run All reproduces the same numbers (and therefore the same
# quantile bins below) without touching NumPy's global random state.
data = np.random.default_rng(42).standard_normal(1000)

In [None]:
# Smallest sample, via the array's vectorised reduction rather than
# iterating the ndarray through Python's built-in min().
data.min()

In [None]:
# Largest sample, via the array's vectorised reduction rather than
# iterating the ndarray through Python's built-in max().
data.max()

In [None]:
# Cut the samples into four quantile-based bins (quartiles).
cats = pd.qcut(x=data, q=4)
cats

In [None]:
# Number of samples per quartile bin. The top-level pd.value_counts() is
# deprecated (pandas 2.1+); wrapping in a Series and calling the method is
# the documented replacement.
pd.Series(cats).value_counts()

In [None]:
# Re-bin into ten quantile-based bins (deciles) and count samples per bin.
# pd.Series(...).value_counts() replaces the deprecated top-level
# pd.value_counts().
cats = pd.qcut(data, 10)
pd.Series(cats).value_counts()

### 6. Outliers

In [None]:
# 1000 x 4 frame of standard-normal samples. A locally seeded Generator keeps
# the summary statistics and the outlier rows found later identical on every
# Restart & Run All, without touching NumPy's global random state.
data = pd.DataFrame(np.random.default_rng(42).standard_normal((1000, 4)))
data.describe()

In [None]:
# Peek at the first five rows.
data.head(n=5)

In [None]:
# Pull out the column labelled 2 as a Series for closer inspection.
col = data.loc[:, 2]
col

In [None]:
# Smallest value in the column, using the vectorised Series reduction
# instead of iterating through Python's built-in min().
col.min()

In [None]:
# Largest value in the column, using the vectorised Series reduction
# instead of iterating through Python's built-in max().
col.max()

In [None]:
(0.689721 - (-0.682376)) * 1.5 # Outlier values for column 2

In [None]:
data[data[2] > 2.05]

In [None]:
data[data[2] > 2.05].index

In [None]:
data.drop(data[data[2] > 2.05].index)

In [None]:
# Draw one box plot per column of `data`.
data.boxplot()