In [9]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

## Mapping

In [10]:
# Small example frame: a numeric column plus a column of short codes
# that can be translated into full names with Series.map.
# Plain int literals only: the Python 2 long suffix (e.g. ``234l``)
# is a SyntaxError on Python 3.
df = DataFrame({
    'altitude': [123, 234, 345],
    'name': ['a', 'b', 'c']
})
df

Unnamed: 0,altitude,name
0,123,a
1,234,b
2,345,c


In [11]:
# Lookup table: short code -> display name.
name_map = dict(a='Mayur', b='Naruto', c='Sasuke')

In [12]:
# Look every value of the 'name' column up in name_map; the dict
# values become the values of the returned Series. The result is
# only displayed here, not stored back into df.
df.loc[:, 'name'].map(name_map)

0     Mayur
1    Naruto
2    Sasuke
Name: name, dtype: object

## Replace

In [13]:
# Fresh frame for the replace examples: two integer columns ('a', 'c')
# on either side of a column of letters ('b').
df = DataFrame(dict(
    a=[1, 2, 3, 4],
    b=['A', 'B', 'C', 'D'],
    c=[3, 2, 1, 2],
))
df

Unnamed: 0,a,b,c
0,1,A,3
1,2,B,2
2,3,C,1
3,4,D,2


In [14]:
# Replace every occurrence of an old value with a new value, across
# all columns. replace() returns a new frame; the result is not
# stored here, so df itself is unchanged by this line.
df.replace(1, 'Sasuke')

# To restrict the replacement to a single column, call replace() on
# that column and assign the result back. The value to match must be
# the integer 1: column 'a' holds ints, and the string '1' would not
# match any of them.
df['a'] = df['a'].replace(1, 'Sasuke')
df

Unnamed: 0,a,b,c
0,Sasuke,A,3
1,2,B,2
2,3,C,1
3,4,D,2


## Renaming

In [15]:
# 3x4 frame holding the integers 0..11, with rows labelled by city
# code and columns labelled by letter.
df = DataFrame(
    data=np.arange(12).reshape(3, -1),
    index=['NY', 'LA', 'SF'],
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
NY,0,1,2,3
LA,4,5,6,7
SF,8,9,10,11


In [16]:
# Index.map applies a function to every index label and produces
# the mapped labels. The result is not assigned here, so df keeps
# its original upper-case index (as the displayed frame shows); use
# `df.index = df.index.map(str.lower)` to actually change it.
df.index.map(str.lower)
df

Unnamed: 0,A,B,C,D
NY,0,1,2,3
LA,4,5,6,7
SF,8,9,10,11


In [17]:
# rename() also accepts dicts mapping old labels to new labels, for
# the row index and the columns independently; labels that are not
# listed (here 'C' and 'D') are kept as they are.
# inplace=True modifies df directly, so no `df = ...` is needed.
df.rename(
    index=dict(NY='New York', LA='Los Angeles', SF='San Francisco'),
    columns=dict(A='Alpha', B='Beta'),
    inplace=True
)
df

Unnamed: 0,Alpha,Beta,C,D
New York,0,1,2,3
Los Angeles,4,5,6,7
San Francisco,8,9,10,11


## Binning

In [18]:
# Sample years to bin; 1857 lies below the lowest bin edge.
years = [1990, 1991, 1992, 2008, 2012, 2015, 1968, 1857]
# Bin edges: every tenth year from 1960 through 2020.
decades_bins = list(range(1960, 2021, 10))

In [19]:
# Assign each year to the decade interval it falls into
# (one bin label per input year).
decade_cat = pd.cut(years, bins=decades_bins)

# The result records, for every year, which category
# it belongs to.
decade_cat

# All the distinct categories (the decade intervals).
decade_cat.categories

# Years that do not fall into any of the categories
# are marked as NaN.

Index([u'(1960, 1970]', u'(1970, 1980]', u'(1980, 1990]', u'(1990, 2000]',
       u'(2000, 2010]', u'(2010, 2020]'],
      dtype='object')

In [20]:
# Count how many years landed in each category. Categories with no
# members are still listed, with a count of 0.
# Series.value_counts() is used because the top-level
# pd.value_counts() function is deprecated in recent pandas.
Series(decade_cat).value_counts()

(2010, 2020]    2
(1990, 2000]    2
(2000, 2010]    1
(1980, 1990]    1
(1960, 1970]    1
(1970, 1980]    0
dtype: int64

In [21]:
# Passing an integer instead of a list of edges asks for that many
# bins computed from the data; precision sets the number of decimals
# used when building the bin labels.
pd.cut(years, bins=5, precision=2)

[(1983.4, 2015], (1983.4, 2015], (1983.4, 2015], (1983.4, 2015], (1983.4, 2015], (1983.4, 2015], (1951.8, 1983.4], (1856.84, 1888.6]]
Categories (5, object): [(1856.84, 1888.6] < (1888.6, 1920.2] < (1920.2, 1951.8] < (1951.8, 1983.4] < (1983.4, 2015]]

## Outliers

In [22]:
# Fix the seed so the random draws are reproducible.
np.random.seed(12345)
# 1000 rows x 4 columns of standard-normal samples.
df = DataFrame(np.random.standard_normal(size=(1000, 4)))
df.head()

Unnamed: 0,0,1,2,3
0,-0.204708,0.478943,-0.519439,-0.55573
1,1.965781,1.393406,0.092908,0.281746
2,0.769023,1.246435,1.007189,-1.296221
3,0.274992,0.228913,1.352917,0.886429
4,-2.001637,-0.371843,1.669025,-0.43857


In [23]:
# Take the first column and keep only the entries whose magnitude
# exceeds 3 (the head() call is just a peek; only the last
# expression of a cell is displayed).
col = df[0]
col.head()
col[col.abs() > 3]

523   -3.428254
900    3.366626
Name: 0, dtype: float64

In [24]:
# Per-column summary statistics; the min/max rows show that values
# below -3 and above 3 are present in the data.
df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [25]:
# Cap the high outliers: every value above 3 is set to exactly 3.
# Only the upper tail is handled here; values below -3 are left
# as they are.
df[df.gt(3)] = 3

In [26]:
# Re-check the summary after capping: max is now at most 3 in every
# column, while the min row is unchanged.
df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.068051,0.067924,0.024912,-0.003226
std,0.996839,0.992106,1.004723,0.99356
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.0,2.653656,3.0,3.0


## Random ordering

In [27]:
# 4x4 frame holding the integers 0..15, with default integer labels.
df = DataFrame(data=np.arange(16).reshape(4, -1))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [28]:
# Random ordering of the integers 0..3, one entry per row position
# of the 4-row frame.
blender = np.random.permutation(4)
blender

array([1, 3, 2, 0])

In [29]:
# Pick the rows of df by position, in the order
# given by blender.
df.iloc[blender]

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
2,8,9,10,11
0,0,1,2,3


In [30]:
# Values to draw from.
box = np.arange(1, 4)
# Ten random positions into box, each in [0, len(box)); positions
# may repeat.
shaker = np.random.randint(len(box), size=10)
shaker

array([2, 0, 2, 0, 0, 2, 1, 1, 2, 1])

In [31]:
# Look up the box value at each drawn position.
box[shaker]

array([3, 1, 3, 1, 1, 3, 2, 2, 3, 2])

In [32]:
# A more concise way to shuffle the rows: DataFrame.sample.
# frac is the fraction of rows to return; frac=1 returns
# all of them (100%), in random order.
df.sample(frac=1)

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
0,0,1,2,3
1,4,5,6,7
