### Handling missing values

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Small Series of colour names with a missing entry (np.nan) at position 2.
s = pd.Series(
    data=["red", "green", np.nan, "blue"],
)

In [4]:
s

0      red
1    green
2      NaN
3     blue
dtype: object

In [5]:
s.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
# None is treated as missing too: the isnull() call two cells below reports True for row 0.
s[0] = None

In [7]:
s


0     None
1    green
2      NaN
3     blue
dtype: object

In [8]:
s.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [9]:
s.notnull()

0    False
1     True
2    False
3     True
dtype: bool

### Filter Missing Data

In [10]:
from numpy import nan as NA  # short alias for the missing-value marker, reused by later cells

# Float Series with gaps at positions 1 and 3.
data = pd.Series(
    data=[1, NA, 3.5, NA, 7],
)

In [11]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [12]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
# 4x3 frame covering the interesting cases: a complete row, two partially
# missing rows and one row that is missing everywhere.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [15]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [16]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
# axis=1 drops columns rather than rows. Every column of `data` holds at least
# one NaN, so no column survives and only the row index remains.
data.dropna(axis=1)

0
1
2
3


In [18]:
# 7x3 frame of random draws used as the base for the dropna/fillna examples.
# NOTE(review): no random seed is set, so the values shown below change on every run.
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,-0.788881,-0.622536,-1.004595
1,1.401004,-0.591761,0.756479
2,-1.729796,0.137143,0.875232
3,0.135509,-0.588091,0.970952
4,-1.645952,-1.987075,2.032141
5,-0.348537,-1.599274,2.319464
6,-0.375942,-0.134943,0.804107


In [19]:
# iloc is positional and takes [rows, columns]:
# blank out the first four rows of column 1 and the first two rows of column 2.
df.iloc[0:4, 1] = NA
df.iloc[0:2, 2] = NA

In [20]:
df

Unnamed: 0,0,1,2
0,-0.788881,,
1,1.401004,,
2,-1.729796,,0.875232
3,0.135509,,0.970952
4,-1.645952,-1.987075,2.032141
5,-0.348537,-1.599274,2.319464
6,-0.375942,-0.134943,0.804107


In [21]:
df.dropna()

Unnamed: 0,0,1,2
4,-1.645952,-1.987075,2.032141
5,-0.348537,-1.599274,2.319464
6,-0.375942,-0.134943,0.804107


In [22]:
# thresh=2 keeps only the rows that have at least two non-missing values,
# so rows 0 and 1 (one observed value each) are dropped.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.729796,,0.875232
3,0.135509,,0.970952
4,-1.645952,-1.987075,2.032141
5,-0.348537,-1.599274,2.319464
6,-0.375942,-0.134943,0.804107


### Fill in the missing data

In [23]:
df

Unnamed: 0,0,1,2
0,-0.788881,,
1,1.401004,,
2,-1.729796,,0.875232
3,0.135509,,0.970952
4,-1.645952,-1.987075,2.032141
5,-0.348537,-1.599274,2.319464
6,-0.375942,-0.134943,0.804107


In [24]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.788881,0.0,0.0
1,1.401004,0.0,0.0
2,-1.729796,0.0,0.875232
3,0.135509,0.0,0.970952
4,-1.645952,-1.987075,2.032141
5,-0.348537,-1.599274,2.319464
6,-0.375942,-0.134943,0.804107


In [25]:
df

Unnamed: 0,0,1,2
0,-0.788881,,
1,1.401004,,
2,-1.729796,,0.875232
3,0.135509,,0.970952
4,-1.645952,-1.987075,2.032141
5,-0.348537,-1.599274,2.319464
6,-0.375942,-0.134943,0.804107


In [26]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.788881,0.5,0.0
1,1.401004,0.5,0.0
2,-1.729796,0.5,0.875232
3,0.135509,0.5,0.970952
4,-1.645952,-1.987075,2.032141
5,-0.348537,-1.599274,2.319464
6,-0.375942,-0.134943,0.804107


In [27]:
# Fresh 6x3 random frame whose columns 1 and 2 end in runs of missing values,
# used to demonstrate forward filling.
df = pd.DataFrame(np.random.randn(6, 3))
# Row and column labels are the default 0..n-1 integers, so label-based slicing
# from 2 (resp. 4) to the end selects the trailing rows.
df.loc[2:, 1] = NA
df.loc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,-2.387293,0.171826,-0.538946
1,-1.791694,0.36927,0.502201
2,0.274701,,-1.390956
3,0.423131,,1.434692
4,0.444138,,
5,1.948019,,


In [28]:
# Forward-fill: propagate the last observed value in each column down into the
# gaps that follow it. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling and returns a new frame (df is unchanged).
df.ffill()

Unnamed: 0,0,1,2
0,-2.387293,0.171826,-0.538946
1,-1.791694,0.36927,0.502201
2,0.274701,0.36927,-1.390956
3,0.423131,0.36927,1.434692
4,0.444138,0.36927,1.434692
5,1.948019,0.36927,1.434692


In [29]:
# Forward-fill, but carry each value over at most two consecutive missing rows;
# longer gaps stay NaN beyond that limit. DataFrame.ffill() replaces the
# deprecated fillna(method='ffill') spelling.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-2.387293,0.171826,-0.538946
1,-1.791694,0.36927,0.502201
2,0.274701,0.36927,-1.390956
3,0.423131,0.36927,1.434692
4,0.444138,,1.434692
5,1.948019,,1.434692


In [31]:
df[1].mean()

0.2705481263625755

In [32]:
df.fillna({1: df[1].mean()})

Unnamed: 0,0,1,2
0,-2.387293,0.171826,-0.538946
1,-1.791694,0.36927,0.502201
2,0.274701,0.270548,-1.390956
3,0.423131,0.270548,1.434692
4,0.444138,0.270548,
5,1.948019,0.270548,


### Removing Duplicates

In [33]:
# Two key columns; the last row repeats the (k1, k2) pair of the row before it.
data = pd.DataFrame(
    {
        'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [34]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [35]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [36]:
data['v1'] = range(7) # Add v1 column filled with values 0 - 6 (range(7) stops before 7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [37]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [38]:
data.drop_duplicates(['k1', 'k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5


In [39]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### Transforming Values

In [40]:
# Food names arrive with inconsistent capitalisation ('bacon' / 'Bacon',
# 'Pastrami' / 'pastrami'); the cells below normalise them.
data = pd.DataFrame(
    {
        'food': [
            'bacon', 'pulled pork', 'bacon',
            'Pastrami', 'corned beef', 'Bacon',
            'pastrami', 'honey ham', 'nova lox',
        ],
        'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [41]:
lowercased = data['food'].str.lower()

In [42]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [45]:
data['food'] = lowercased

In [46]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [47]:
# Lookup table from (lowercased) food name to the animal it comes from.
meat_to_animal = dict(
    [
        ('bacon', 'pig'),
        ('pulled pork', 'pig'),
        ('pastrami', 'cow'),
        ('corned beef', 'cow'),
        ('honey ham', 'pig'),
        ('nova lox', 'salmon'),
    ]
)

In [48]:
# Look up each lowercased food name in meat_to_animal to derive the animal column.
data['animal'] = lowercased.map(meat_to_animal)

In [49]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


### Replacing Values

In [50]:
# Measurements in which -999 and -1000 act as sentinel codes, replaced in the cells below.
data = pd.Series(
    [1.0, -999.0, 2.0, -999.0, -1000.0, 3.0]
)
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [51]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [52]:
data.replace([-999, -1000], 0)

0    1.0
1    0.0
2    2.0
3    0.0
4    0.0
5    3.0
dtype: float64

In [53]:
# One-to-one substitution: each sentinel is paired with its own replacement value.
data.replace({-999: 999, -1000: 1000})

0       1.0
1     999.0
2       2.0
3     999.0
4    1000.0
5       3.0
dtype: float64

### Axis Renaming

In [54]:
# 3x4 integer grid (0..11) with string labels on both axes, used to demonstrate
# relabelling of the index and the columns.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'New York'],
)
data


Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [57]:
# Upper-case every row label and assign the result back as the new index.
data.index = data.index.map(str.upper)

In [58]:
data


Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [59]:
data.columns

Index(['one', 'two', 'three', 'four'], dtype='object')

In [60]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [64]:
# Relabel a single row and a single column. rename() returns a new frame, which
# is bound back to `data` instead of mutating with inplace=True; labels that are
# not listed in the mappings are left as they are.
data = data.rename(index={'OHIO': 'California'}, columns={'three': '3'})

In [65]:
data

Unnamed: 0,one,two,3,four
California,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [66]:
data

Unnamed: 0,one,two,3,four
California,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


### Discretization and Binning

In [67]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [68]:
bins = [18, 25, 35, 60, 100]

In [69]:
# Assign each age to one of the intervals delimited by `bins`. The intervals are
# closed on the right (see closed='right' in the cats.categories output below).
cats = pd.cut(ages, bins)

In [70]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [71]:
len(cats), len(ages)

(12, 12)

In [73]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [74]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [75]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [76]:
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [78]:
data = np.random.rand(20)
data

array([0.39137358, 0.32562135, 0.17817503, 0.88418648, 0.46091752,
       0.0716216 , 0.0761589 , 0.18764468, 0.86025206, 0.052738  ,
       0.46102532, 0.52166744, 0.17514266, 0.90804297, 0.24375602,
       0.44976688, 0.19233297, 0.74206954, 0.11693724, 0.40725658])

In [79]:
# Passing an integer instead of explicit edges asks for that many equal-width
# bins spanning the data's range; precision=2 limits the precision of the bin labels.
pd.cut(data, 4, precision=2)

[(0.27, 0.48], (0.27, 0.48], (0.052, 0.27], (0.69, 0.91], (0.27, 0.48], ..., (0.27, 0.48], (0.052, 0.27], (0.69, 0.91], (0.052, 0.27], (0.27, 0.48]]
Length: 20
Categories (4, interval[float64]): [(0.052, 0.27] < (0.27, 0.48] < (0.48, 0.69] < (0.69, 0.91]]

In [80]:
# Smallest sampled value; compare it with the left edge of the first bin above.
data.min()

0.052737997482741106

In [81]:
# Largest sampled value; compare it with the right edge of the last bin above.
data.max()

0.9080429729668491

In [82]:
data = np.random.randn(1000) 

In [83]:
cats = pd.qcut(data, 4)

In [84]:
cats

[(-3.056, -0.724], (-3.056, -0.724], (-0.00143, 0.641], (-0.724, -0.00143], (-0.00143, 0.641], ..., (-3.056, -0.724], (-3.056, -0.724], (-3.056, -0.724], (-0.00143, 0.641], (-0.00143, 0.641]]
Length: 1000
Categories (4, interval[float64]): [(-3.056, -0.724] < (-0.724, -0.00143] < (-0.00143, 0.641] < (0.641, 3.308]]

In [85]:
# Count the members of each quartile bin. The top-level pd.value_counts() is
# deprecated, so wrap the Categorical in a Series and use its value_counts().
pd.Series(cats).value_counts()

(0.641, 3.308]        250
(-0.00143, 0.641]     250
(-0.724, -0.00143]    250
(-3.056, -0.724]      250
dtype: int64

In [86]:
cats = pd.qcut(data, 10)

In [87]:
cats

[(-1.25, -0.876], (-3.056, -1.25], (-0.00143, 0.269], (-0.249, -0.00143], (0.269, 0.488], ..., (-3.056, -1.25], (-0.876, -0.547], (-0.876, -0.547], (-0.00143, 0.269], (0.269, 0.488]]
Length: 1000
Categories (10, interval[float64]): [(-3.056, -1.25] < (-1.25, -0.876] < (-0.876, -0.547] < (-0.547, -0.249] ... (0.269, 0.488] < (0.488, 0.794] < (0.794, 1.273] < (1.273, 3.308]]

In [88]:
# Count the members of each decile bin. The top-level pd.value_counts() is
# deprecated, so wrap the Categorical in a Series and use its value_counts().
pd.Series(cats).value_counts()

(1.273, 3.308]        100
(0.794, 1.273]        100
(0.488, 0.794]        100
(0.269, 0.488]        100
(-0.00143, 0.269]     100
(-0.249, -0.00143]    100
(-0.547, -0.249]      100
(-0.876, -0.547]      100
(-1.25, -0.876]       100
(-3.056, -1.25]       100
dtype: int64

### Outliers

In [89]:
# 1000 rows x 4 columns of random draws, matching the four columns (0-3) and
# count of 1000 in the describe() summary below.
# NOTE(review): no random seed is set, so the statistics change on every run.
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.052516,0.011445,0.055103,0.027236
std,1.000035,0.959096,1.00558,1.000849
min,-2.685299,-3.176574,-3.0317,-3.240697
25%,-0.653246,-0.654303,-0.614283,-0.616822
50%,0.065815,0.052543,0.038364,0.065011
75%,0.744633,0.642458,0.709707,0.679903
max,3.376638,2.814463,3.466778,3.555622


In [90]:
data.head()

Unnamed: 0,0,1,2,3
0,-0.420038,-0.902777,-0.501956,-0.636332
1,-0.420332,0.2823,1.914527,1.622519
2,-0.389003,-0.612554,-0.326241,-0.463935
3,0.85253,-0.593049,-0.303805,0.108105
4,0.536256,1.403874,0.392022,-0.316536


In [91]:
# Processing col 2
col = data[2]

In [92]:
col

0     -0.501956
1      1.914527
2     -0.326241
3     -0.303805
4      0.392022
         ...   
995   -1.636161
996    0.551222
997    0.477926
998    0.578428
999   -0.169835
Name: 2, Length: 1000, dtype: float64

In [93]:
col[np.abs(col) > 3]

159   -3.031700
472    3.029250
792    3.466778
Name: 2, dtype: float64

In [97]:
# Select every row in which at least one column exceeds 3 in absolute value.
# `axis` is passed by keyword: a positional axis for DataFrame.any is deprecated
# and is rejected by newer pandas releases.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
7,0.201727,1.267406,0.72348,-3.240697
159,-1.693286,1.299093,-3.0317,0.027632
166,0.819116,-3.070678,0.24397,-0.522025
251,3.334054,0.061102,-0.813408,-0.420633
275,3.376638,1.062267,0.711531,0.389533
327,-0.00771,0.020769,2.699066,3.555622
472,0.840976,-0.127505,3.02925,-2.03716
792,0.658525,0.242084,3.466778,-1.240321
796,0.907023,-3.176574,-1.937155,-0.723759
886,3.242616,1.07159,-0.306723,0.315886


In [98]:
# Row labels of the records that contain at least one value beyond +/-3.
# `axis` is passed by keyword: a positional axis for DataFrame.any is deprecated
# and is rejected by newer pandas releases.
data[(np.abs(data) > 3).any(axis=1)].index

Int64Index([7, 159, 166, 251, 275, 327, 472, 792, 796, 886, 935], dtype='int64')