# Missing Data

In [2]:
import numpy as np
import pandas as pd

In [3]:
# A Series of state names with one missing entry (np.nan) at position 2.
states = pd.Series(
    data=['alabama', 'california', np.nan, 'minnesota', 'virginia'],
)
states

0       alabama
1    california
2           NaN
3     minnesota
4      virginia
dtype: object

In [4]:
# Element-wise missing-value mask: True where the entry is NaN.
states.isnull()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [5]:
# True if at least one entry in the Series is missing.
states.isnull().any()

True

In [6]:
# Count the missing entries: each True in the mask counts as 1.
states.isnull().sum()

1

In [7]:
# Overwrite the NaN with None to check whether pandas also treats None as missing.
states[2] = None

In [8]:
# None is reported as missing, exactly like NaN.
states.isnull()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [9]:
# Return a new Series without the missing entry; the surviving rows keep their original labels.
states.dropna()

0       alabama
1    california
3     minnesota
4      virginia
dtype: object

In [16]:
# Inverse of isnull(): True where a value is present.
states.notnull()

0     True
1     True
2    False
3     True
4     True
dtype: bool

In [11]:
# Boolean-mask selection of the present values (same rows that dropna() returns).
states.loc[states.notnull()]

0       alabama
1    california
3     minnesota
4      virginia
dtype: object

In [21]:
# Look up the dropna documentation (axis / how / thresh are the options of interest).
# The original `?.dropna()` named no object before the dot, so the lookup failed
# with "Object `.dropna` not found."; help() works in any Python front end.
help(pd.DataFrame.dropna)


In [22]:
# Small table (a name plus two numeric columns) with deliberate gaps:
# row 0 is missing both numbers and row 3 is missing everything.
# np.nan is the canonical spelling; the upper-case np.NAN alias is not
# available in NumPy 2.x, and the rest of this notebook already uses np.nan.
data = pd.DataFrame([
    ['alabama', np.nan, np.nan],
    ['california', 39536653, 1246],
    ['minnesota', 5576606, 169],
    [np.nan, np.nan, np.nan],
    ['virginia', 8470020, 222]
])

In [23]:
# Display the frame; the whole-number inputs are shown as floats (e.g. 1246.0)
# because the NaN entries force those columns to a float dtype.
data

Unnamed: 0,0,1,2
0,alabama,,
1,california,39536653.0,1246.0
2,minnesota,5576606.0,169.0
3,,,
4,virginia,8470020.0,222.0


In [24]:
# With the default how='any', one NaN anywhere in a row is enough to remove it.
data.dropna() # drop any row containing NaN

Unnamed: 0,0,1,2
1,california,39536653.0,1246.0
2,minnesota,5576606.0,169.0
4,virginia,8470020.0,222.0


In [25]:
# how='all' keeps partially filled rows; only the completely empty row 3 is removed.
data.dropna(how='all') # only drop rows having all NaNs

Unnamed: 0,0,1,2
0,alabama,,
1,california,39536653.0,1246.0
2,minnesota,5576606.0,169.0
4,virginia,8470020.0,222.0


In [26]:
# Add a new column labelled 3 that is entirely NaN, to demonstrate column-wise dropping.
# Note: this mutates `data` in place, so earlier displays of the frame are now stale.
data[3] = np.nan
data

Unnamed: 0,0,1,2,3
0,alabama,,,
1,california,39536653.0,1246.0,
2,minnesota,5576606.0,169.0,
3,,,,
4,virginia,8470020.0,222.0,


In [27]:
# axis=1 switches dropna from rows to columns; only the all-NaN column 3 is removed.
data.dropna(axis=1, how='all') # drop columns having all NaNs

Unnamed: 0,0,1,2
0,alabama,,
1,california,39536653.0,1246.0
2,minnesota,5576606.0,169.0
3,,,
4,virginia,8470020.0,222.0


In [28]:
# Blank out the cell at row position 2, column position 2 (minnesota's last
# numeric value) so that row is left with exactly two non-NaN values.
data.iloc[2, 2] = np.nan
data

Unnamed: 0,0,1,2,3
0,alabama,,,
1,california,39536653.0,1246.0,
2,minnesota,5576606.0,,
3,,,,
4,virginia,8470020.0,222.0,


In [29]:
# thresh is the minimum number of non-NaN values a row needs in order to be kept.
data.dropna(thresh=2) # drop rows having fewer than 2 non-NaN values

Unnamed: 0,0,1,2,3
1,california,39536653.0,1246.0,
2,minnesota,5576606.0,,
4,virginia,8470020.0,222.0,


In [33]:
# Replace every NaN with 0. Note that this also puts a 0 into the string
# column (row 3), which is rarely what you want for text data.
data.fillna(0)

Unnamed: 0,0,1,2,3
0,alabama,0.0,0.0,0.0
1,california,39536653.0,1246.0,0.0
2,minnesota,5576606.0,0.0,0.0
3,0,0.0,0.0,0.0
4,virginia,8470020.0,222.0,0.0


In [34]:
# Per-column fill values: the dict keys are column labels, so only columns
# 1 and 2 are filled; columns 0 and 3 keep their NaNs.
data.fillna({1:1000000, 2:50})

Unnamed: 0,0,1,2,3
0,alabama,1000000.0,50.0,
1,california,39536653.0,1246.0,
2,minnesota,5576606.0,50.0,
3,,1000000.0,50.0,
4,virginia,8470020.0,222.0,


In [35]:
# Fill each numeric column's gaps with that column's mean.
# numeric_only=True is required because column 0 holds strings: since pandas 2.0
# DataFrame.mean() no longer silently skips non-numeric columns and raises
# TypeError instead. Column 3 is all-NaN, so its mean is NaN and it stays unfilled.
data.fillna(data.mean(numeric_only=True))

Unnamed: 0,0,1,2,3
0,alabama,17861093.0,734.0,
1,california,39536653.0,1246.0,
2,minnesota,5576606.0,734.0,
3,,17861093.0,734.0,
4,virginia,8470020.0,222.0,


In [36]:
# Rebuild the table, this time with repeated rows (california and virginia each
# appear twice) to demonstrate duplicate detection.
# np.nan is the canonical spelling; the upper-case np.NAN alias is not
# available in NumPy 2.x, and the rest of this notebook already uses np.nan.
data = pd.DataFrame([
    ['alabama', np.nan, np.nan],
    ['california', 39536653, 1246],
    ['virginia', 8470020, 222],
    ['california', 39536653, 1246],
    ['minnesota', 5576606, 169],
    [np.nan, np.nan, np.nan],
    ['virginia', 8470020, 222]
])

In [37]:
# Flag rows that repeat an earlier row; the first occurrence itself is not flagged.
data.duplicated()

0    False
1    False
2    False
3     True
4    False
5    False
6     True
dtype: bool

In [39]:
# Keep only the first occurrence of each row; surviving rows keep their original index labels.
data.drop_duplicates()


Unnamed: 0,0,1,2
0,alabama,,
1,california,39536653.0,1246.0
2,virginia,8470020.0,222.0
4,minnesota,5576606.0,169.0
5,,,


# Replacing Text

In [40]:
# Dollar amounts stored as text, with inconsistent '$' and ',' formatting.
dollars = pd.Series(
    data=['12', '-$10', '$10,000'],
)
dollars

0         12
1       -$10
2    $10,000
dtype: object

In [41]:
# Strip the literal '$' character. regex=False is stated explicitly because '$'
# is a regex metacharacter (end-of-string anchor) and the default value of
# `regex` differs between pandas 1.x and 2.x.
dollars.str.replace('$', '', regex=False)

0        12
1       -10
2    10,000
dtype: object

In [42]:
# str.replace returned a new Series; dollars itself still contains the '$' characters.
dollars

0         12
1       -$10
2    $10,000
dtype: object

In [43]:
# Assign the result back to keep the cleaned values.
# regex=False is stated explicitly because '$' is a regex metacharacter and the
# default value of `regex` differs between pandas 1.x and 2.x.
dollars = dollars.str.replace('$', '', regex=False)
dollars

0        12
1       -10
2    10,000
dtype: object

# Regular Expression (regex) Replace

In [44]:
import re

# Same clean-up using the standard-library regex engine, applied element by element.
# The pattern is a raw string: in a normal string literal '\$' is an invalid
# escape sequence, which newer Python versions warn about.
dollars = pd.Series(['12', '-$10', '$10,000'])
dollars.apply(lambda s: re.sub(r'\$', '', s))

0        12
1       -10
2    10,000
dtype: object

In [45]:
# Deliberately mangled state names: a doubled 'aa' or a '.' stands in for a single 'a'.
states = pd.Series(
    data=['Caaliforni.', 'Maaryl.nd', 'Al.baama', 'Virginiaa'],
)
states

0    Caaliforni.
1      Maaryl.nd
2       Al.baama
3      Virginiaa
dtype: object

In [46]:
# Replace either a doubled 'aa' or a literal '.' with a single 'a'.
# The pattern is a raw string: in a normal string literal '\.' is an invalid
# escape sequence, which newer Python versions warn about.
states.apply(lambda s: re.sub(r'aa|\.', 'a', s))

0    California
1      Maryland
2       Alabama
3      Virginia
dtype: object

In [47]:
# --- Binning / discretization ---
# Load the per-state table from a relative path; the displayed columns are
# State, Abbrev, Count and Population. This rebinds `data`, replacing the
# small hand-built frame used in the missing-data examples.
data = pd.read_csv('data/states.csv')
data
# any guesses as to what count represents?

Unnamed: 0,State,Abbrev,Count,Population
0,Alabama,AL,129,4874747.0
1,Alaska,AK,35,739795.0
2,American Samoa,AS,1,51504.0
3,Arizona,AZ,155,7016270.0
4,Arkansas,AR,108,3004279.0
5,California,CA,1246,39536653.0
6,Colorado,CO,171,5607154.0
7,Connecticut,CT,114,3588184.0
8,Delaware,DE,23,961939.0
9,District of Columbia,DC,33,693972.0


In [48]:
# Hand-picked bin edges for Count; consecutive pairs become the intervals
# (0, 10], (10, 100], (100, 500], (500, 1000] and (1000, 2000].
bins = [0, 10, 100, 500, 1000, 2000]

In [49]:
# Assign each row's Count to one of the bins; the resulting intervals are open
# on the left and closed on the right, e.g. (100, 500].
schools = pd.cut(data['Count'], bins)
schools

0       (100, 500]
1        (10, 100]
2          (0, 10]
3       (100, 500]
4       (100, 500]
5     (1000, 2000]
6       (100, 500]
7       (100, 500]
8        (10, 100]
9        (10, 100]
10         (0, 10]
11      (100, 500]
12      (100, 500]
13         (0, 10]
14       (10, 100]
15       (10, 100]
16      (100, 500]
17      (100, 500]
18      (100, 500]
19       (10, 100]
20      (100, 500]
21      (100, 500]
22       (10, 100]
23         (0, 10]
24      (100, 500]
25      (100, 500]
26      (100, 500]
27      (100, 500]
28       (10, 100]
29      (100, 500]
30       (10, 100]
31       (10, 100]
32       (10, 100]
33       (10, 100]
34      (100, 500]
35       (10, 100]
36     (500, 1000]
37      (100, 500]
38       (10, 100]
39         (0, 10]
40      (100, 500]
41      (100, 500]
42      (100, 500]
43         (0, 10]
44     (500, 1000]
45      (100, 500]
46       (10, 100]
47       (10, 100]
48       (10, 100]
49      (100, 500]
50     (500, 1000]
51       (10, 100]
52       (10

In [50]:
# Number of rows falling into each bin, most populated bin first.
# The top-level pd.value_counts() function is deprecated (pandas 2.1+);
# the Series method is the supported equivalent.
schools.value_counts()

(100, 500]      27
(10, 100]       21
(0, 10]          7
(500, 1000]      3
(1000, 2000]     1
Name: Count, dtype: int64

In [51]:
# Summary statistics for the numeric columns. Population has 54 non-null
# values versus 59 for Count, so some populations are missing.
data.describe()

Unnamed: 0,Count,Population
count,59.0,54.0
mean,159.949153,6097689.0
std,200.886005,7216798.0
min,1.0,51504.0
25%,36.0,1499889.0
50%,108.0,4036820.0
75%,181.5,6977157.0
max,1246.0,39536650.0


In [52]:
# Quantile-based binning: split Count into 4 bins whose edges are the quartiles
# reported by describe() (36, 108, 181.5), so each bin holds roughly the same number of rows.
pd.qcut(data['Count'], 4)

0      (108.0, 181.5]
1       (0.999, 36.0]
2       (0.999, 36.0]
3      (108.0, 181.5]
4       (36.0, 108.0]
5     (181.5, 1246.0]
6      (108.0, 181.5]
7      (108.0, 181.5]
8       (0.999, 36.0]
9       (0.999, 36.0]
10      (0.999, 36.0]
11    (181.5, 1246.0]
12    (181.5, 1246.0]
13      (0.999, 36.0]
14      (36.0, 108.0]
15      (0.999, 36.0]
16    (181.5, 1246.0]
17     (108.0, 181.5]
18      (36.0, 108.0]
19      (36.0, 108.0]
20     (108.0, 181.5]
21     (108.0, 181.5]
22      (36.0, 108.0]
23      (0.999, 36.0]
24     (108.0, 181.5]
25    (181.5, 1246.0]
26    (181.5, 1246.0]
27     (108.0, 181.5]
28      (36.0, 108.0]
29    (181.5, 1246.0]
30      (36.0, 108.0]
31      (36.0, 108.0]
32      (36.0, 108.0]
33      (36.0, 108.0]
34    (181.5, 1246.0]
35      (36.0, 108.0]
36    (181.5, 1246.0]
37    (181.5, 1246.0]
38      (0.999, 36.0]
39      (0.999, 36.0]
40    (181.5, 1246.0]
41     (108.0, 181.5]
42     (108.0, 181.5]
43      (0.999, 36.0]
44    (181.5, 1246.0]
45     (10

In [53]:
# Confirm that quantile bins are (nearly) equally populated.
# The top-level pd.value_counts() function is deprecated (pandas 2.1+);
# the Series method is the supported equivalent.
cats = pd.qcut(data['Count'], 4)
cats.value_counts()

(181.5, 1246.0]    15
(36.0, 108.0]      15
(0.999, 36.0]      15
(108.0, 181.5]     14
Name: Count, dtype: int64

In [54]:
# Populations above 5 million, selected with a boolean mask. With the default
# integer index the result does not show which state each value belongs to.
pop = data['Population']
pop.loc[pop > 5_000_000]

3      7016270.0
5     39536653.0
6      5607154.0
11    20984400.0
12    10429379.0
16    12802023.0
17     6666818.0
24     6052177.0
25     6859819.0
26     9962311.0
27     5576606.0
29     6113532.0
34     9005644.0
36    19849399.0
37    10273419.0
40    11658609.0
44    12805537.0
47     5024369.0
49     6715984.0
50    28304596.0
54     8470020.0
55     7405743.0
57     5795483.0
Name: Population, dtype: float64

In [55]:
# Use the state name as the row index so selections are labelled by state.
# The guard makes the cell safe to re-run: once 'State' has become the index it
# is no longer a column, and calling set_index('State') again would raise KeyError.
# Rebinding instead of inplace=True keeps the step explicit.
if 'State' in data.columns:
    data = data.set_index('State')

In [56]:
# State is now the index rather than a regular column.
data

Unnamed: 0_level_0,Abbrev,Count,Population
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,AL,129,4874747.0
Alaska,AK,35,739795.0
American Samoa,AS,1,51504.0
Arizona,AZ,155,7016270.0
Arkansas,AR,108,3004279.0
California,CA,1246,39536653.0
Colorado,CO,171,5607154.0
Connecticut,CT,114,3588184.0
Delaware,DE,23,961939.0
District of Columbia,DC,33,693972.0


In [57]:
# Populations above 5 million; because State is the index, each value in the
# result is labelled with its state name.
pop = data['Population']
pop.loc[pop > 5_000_000]

State
Arizona            7016270.0
California        39536653.0
Colorado           5607154.0
Florida           20984400.0
Georgia           10429379.0
Illinois          12802023.0
Indiana            6666818.0
Maryland           6052177.0
Massachusetts      6859819.0
Michigan           9962311.0
Minnesota          5576606.0
Missouri           6113532.0
New Jersey         9005644.0
New York          19849399.0
North Carolina    10273419.0
Ohio              11658609.0
Pennsylvania      12805537.0
South Carolina     5024369.0
Tennessee          6715984.0
Texas             28304596.0
Virginia           8470020.0
Washington         7405743.0
Wisconsin          5795483.0
Name: Population, dtype: float64