7.1 Handling Missing Data

In [4]:
import pandas as pd
import numpy as np


# Object Series with one missing entry; show the values and their null mask.
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
print(string_data)
print('\n')
null_mask = string_data.isna()
print(null_mask)

# Overwrite the first entry with None and show the mask again:
# position 0 is now reported as missing as well.
string_data[0] = None
print('\n')
null_mask = string_data.isna()
print(null_mask)

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object


0    False
1    False
2     True
3    False
dtype: bool


0     True
1    False
2     True
3    False
dtype: bool


In [6]:
# Filtering Out Missing Data

from numpy import nan as NA


# Two ways to keep only the non-missing entries of a Series:
# dropna() directly, or boolean indexing with the not-missing mask.
data = pd.Series([1, NA, 3.5, NA, 7])
is_present = data.notna()
print(data.dropna())
print('\n')
print(data[is_present])



0    1.0
2    3.5
4    7.0
dtype: float64


0    1.0
2    3.5
4    7.0
dtype: float64


In [9]:
# 4x3 frame whose rows range from fully populated to entirely missing.
rows = [
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
]
data = pd.DataFrame(rows)

# Plain dropna() keeps only the rows that contain no NaN at all.
cleaned = data.dropna()
print(data)
print('\n')
print(cleaned)
print('\n')
# how='all' removes only the rows in which every value is NaN.
rows_not_all_nan = data.dropna(how='all')
print(rows_not_all_nan)

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


     0    1    2
0  1.0  6.5  3.0


     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0


In [11]:
# Add a column (label 4) that is NaN in every row, then drop the columns
# that are NaN everywhere.
data[4] = NA
print('\n')
print(data)
print('\n')
without_empty_columns = data.dropna(axis='columns', how='all')
print(without_empty_columns)



     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN


     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


In [14]:
# Random 7x3 frame with NaNs written into the first rows of columns 1 and 2
# (four rows of column 1, two rows of column 2).
df = pd.DataFrame(np.random.randn(7, 3))
for column_pos, n_missing in ((1, 4), (2, 2)):
    df.iloc[:n_missing, column_pos] = NA
print(df)
print('\n')
complete_rows = df.dropna()
print(complete_rows)
print('\n')
# thresh=2 keeps the rows that still have at least two non-NaN values.
rows_with_two_values = df.dropna(thresh=2)
print(rows_with_two_values)

          0         1         2
0 -2.028763       NaN       NaN
1 -1.519579       NaN       NaN
2  1.335718       NaN  0.422142
3 -1.774236       NaN  1.076331
4  1.733116  0.957610  0.016238
5 -1.023956 -0.991103  1.137911
6  0.793807  0.261012  0.587224


          0         1         2
4  1.733116  0.957610  0.016238
5 -1.023956 -0.991103  1.137911
6  0.793807  0.261012  0.587224


          0         1         2
2  1.335718       NaN  0.422142
3 -1.774236       NaN  1.076331
4  1.733116  0.957610  0.016238
5 -1.023956 -0.991103  1.137911
6  0.793807  0.261012  0.587224


In [17]:
# Filling In Missing Data

# A scalar argument fills every NaN with the same value; the result is a new
# frame and df itself is left unchanged by this call.
print(df.fillna(0))
print('\n')
# A dict maps column label -> fill value, so each column gets its own filler.
print(df.fillna({1: 0.5, 2: 0}))
print('\n')

# Rebind df to the filled frame rather than calling fillna(..., inplace=True).
# The original bound the result of the in-place call to `_`, which shadows
# IPython's last-output variable; rebinding leaves `_` alone and still makes
# df hold the zero-filled values for the cells that follow.
df = df.fillna(0)

print(df)

          0         1         2
0 -2.028763  0.000000  0.000000
1 -1.519579  0.000000  0.000000
2  1.335718  0.000000  0.422142
3 -1.774236  0.000000  1.076331
4  1.733116  0.957610  0.016238
5 -1.023956 -0.991103  1.137911
6  0.793807  0.261012  0.587224


          0         1         2
0 -2.028763  0.500000  0.000000
1 -1.519579  0.500000  0.000000
2  1.335718  0.500000  0.422142
3 -1.774236  0.500000  1.076331
4  1.733116  0.957610  0.016238
5 -1.023956 -0.991103  1.137911
6  0.793807  0.261012  0.587224


          0         1         2
0 -2.028763  0.000000  0.000000
1 -1.519579  0.000000  0.000000
2  1.335718  0.000000  0.422142
3 -1.774236  0.000000  1.076331
4  1.733116  0.957610  0.016238
5 -1.023956 -0.991103  1.137911
6  0.793807  0.261012  0.587224


In [23]:
# Random 6x3 frame with NaNs written into the tail of columns 1 and 2
# (rows 2 onward in column 1, rows 4 onward in column 2).
df = pd.DataFrame(np.random.randn(6, 3))
for column_pos, first_missing_row in ((1, 2), (2, 4)):
    df.iloc[first_missing_row:, column_pos] = NA
print(df)
print('\n')
# Forward-fill: each NaN takes the last valid value above it.
fully_filled = df.ffill()
print(fully_filled)
print('\n')
# limit=2 fills at most two consecutive NaNs per gap.
partially_filled = df.ffill(limit=2)
print(partially_filled)


          0         1         2
0 -0.567748  0.384288  0.453660
1  0.232388 -0.070936  0.407820
2  0.622049       NaN  1.659861
3  1.029840       NaN -0.219562
4  0.012250       NaN       NaN
5  0.053815       NaN       NaN


          0         1         2
0 -0.567748  0.384288  0.453660
1  0.232388 -0.070936  0.407820
2  0.622049 -0.070936  1.659861
3  1.029840 -0.070936 -0.219562
4  0.012250 -0.070936 -0.219562
5  0.053815 -0.070936 -0.219562


          0         1         2
0 -0.567748  0.384288  0.453660
1  0.232388 -0.070936  0.407820
2  0.622049 -0.070936  1.659861
3  1.029840 -0.070936 -0.219562
4  0.012250       NaN -0.219562
5  0.053815       NaN -0.219562


In [24]:
# Replace the missing entries with the mean of the values that are present.
data = pd.Series([1., NA, 3.5, NA, 7])
mean_of_present = data.mean()
print(data.fillna(mean_of_present))

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64


7.2 Data Transformation

In [27]:
# Removing Duplicates

# Seven rows; the last one repeats the (k1, k2) pair of the row before it.
k1_values = ['one', 'two'] * 3 + ['two']
k2_values = [1, 1, 2, 3, 3, 4, 4]
data = pd.DataFrame({'k1': k1_values, 'k2': k2_values})
print(data)
print('\n')
# True marks a row that repeats an earlier row.
repeat_mask = data.duplicated()
print(repeat_mask)
print('\n')
deduplicated = data.drop_duplicates()
print(deduplicated)

    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4


0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool


    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4


In [29]:
# Add a running counter column so the surviving rows can be told apart.
data['v1'] = range(7)
# De-duplicate on k1 only (the first occurrence of each k1 is kept).
first_per_k1 = data.drop_duplicates(['k1'])
print(first_per_k1)
print('\n')
# De-duplicate on the (k1, k2) pair, keeping the last occurrence instead.
last_per_pair = data.drop_duplicates(['k1', 'k2'], keep='last')
print(last_per_pair)

    k1  k2  v1
0  one   1   0
1  two   1   1


    k1  k2  v1
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
6  two   4   6


In [30]:
# Transforming Data Using a Function or Mapping

# Nine food items with their weights in ounces.
foods = ["bacon", "pulled pork", "bacon",
         "pastrami", "corned beef", "bacon",
         "pastrami", "honey ham", "nova lox"]
ounces = [4, 3, 12, 6, 7.5, 8, 3, 5, 6]
data = pd.DataFrame({"food": foods, "ounces": ounces})

print(data)

          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     pastrami     6.0
4  corned beef     7.5
5        bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0


In [33]:
# Lookup table: lower-case food name -> animal it comes from.
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}


def food_to_animal(food):
    """Look up the animal for ``food`` after lower-casing the name."""
    return meat_to_animal[food.lower()]


# Approach 1: normalise the case first, then map through the dict.
lowercased = data['food'].str.lower()
print(lowercased)

data['animal'] = lowercased.map(meat_to_animal)
print('\n')
print(data)

# Approach 2: a single map() call with a function doing both steps.
print('\n')
print(data['food'].map(food_to_animal))

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object


          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     pastrami     6.0     cow
4  corned beef     7.5     cow
5        bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon


0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object


In [39]:
# Replacing Values

# -999 and -1000 appear among otherwise small values in this Series.
data = pd.Series([1., -999., 2., -999., -1000., 3.])

# Three call shapes of replace(): scalar -> scalar, list -> list (pairwise),
# and a dict of old -> new.
replacements = (
    data.replace(-999, np.nan),
    data.replace([-999, -1000], [np.nan, 0]),
    data.replace({-999: np.nan, -1000: 0}),
)

print(data)
for replaced in replacements:
    print('\n')
    print(replaced)

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64


0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64


0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64


0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64


In [44]:
# Renaming Axis Indexes

data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=["Ohio", "Colorado", "New York"],
                    columns=["one", "two", "three", "four"])


def transform(x):
    """Return the first four characters of ``x`` in upper case.

    Defined with ``def`` rather than a lambda bound to a name, so the
    function has a real ``__name__`` in tracebacks and reprs.
    """
    return x[:4].upper()


# Index.map returns a new Index; data is unchanged until it is assigned back.
print(data.index.map(transform))

data.index = data.index.map(transform)
print('\n')
print(data)

# rename() accepts functions applied to every label and returns a new frame.
print('\n')
print(data.rename(index=str.title, columns=str.upper))

# rename() also accepts dicts that remap only the listed labels.
print('\n')
print(data.rename(index={'OHIO': 'INDIANA'},
                  columns={'three': "peekaboo"}))

# Rebind data to the renamed frame instead of using inplace=True, so the
# change is an explicit assignment rather than a hidden mutation.
print('\n')
data = data.rename(index={'OHIO': 'INDIANA'})
print(data)


Index(['OHIO', 'COLO', 'NEW '], dtype='object')


      one  two  three  four
OHIO    0    1      2     3
COLO    4    5      6     7
NEW     8    9     10    11


      ONE  TWO  THREE  FOUR
Ohio    0    1      2     3
Colo    4    5      6     7
New     8    9     10    11


         one  two  peekaboo  four
INDIANA    0    1         2     3
COLO       4    5         6     7
NEW        8    9        10    11


         one  two  three  four
INDIANA    0    1      2     3
COLO       4    5      6     7
NEW        8    9     10    11


In [49]:
# Discretization and Binning

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

# Bin edges; the resulting intervals are (18, 25], (25, 35], (35, 60], (60, 100].
bins = [18, 25, 35, 60, 100]

cats = pd.cut(ages, bins)

print(cats)

# Show, in turn: the integer bin code of each age, then the bin intervals.
for attribute in (cats.codes, cats.categories):
    print('\n')
    print(attribute)

# Number of ages that fell into each bin.
print('\n')
bin_counts = pd.Series(cats).value_counts()
print(bin_counts)

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]


[0 0 0 1 0 0 2 1 3 2 2 1]


IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')


(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64


In [51]:
# right=False makes the intervals closed on the left: [a, b) instead of (a, b].
left_closed = pd.cut(ages, [18, 26, 36, 61, 100], right=False)
print(left_closed)

group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

# labels= replaces the interval notation with the given names, one per bin.
print('\n')
labelled = pd.cut(ages, bins, labels=group_names)
print(labelled)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]


['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']


In [53]:
# Passing an integer instead of explicit edges asks cut() for that many bins;
# precision=2 limits the digits shown in the bin labels.
data = np.random.rand(20)

four_bins = pd.cut(data, bins=4, precision=2)
print(four_bins)

[(0.49, 0.73], (0.25, 0.49], (0.011, 0.25], (0.73, 0.97], (0.25, 0.49], ..., (0.73, 0.97], (0.73, 0.97], (0.25, 0.49], (0.011, 0.25], (0.73, 0.97]]
Length: 20
Categories (4, interval[float64, right]): [(0.011, 0.25] < (0.25, 0.49] < (0.49, 0.73] < (0.73, 0.97]]


In [57]:
data = np.random.randn(1000)

# qcut with an integer splits on sample quantiles (here: four bins).
cats = pd.qcut(data, 4)

print(cats)

# Number of observations that landed in each bin.
print('\n')
per_bin = pd.Series(cats).value_counts()
print(per_bin)

# Explicit quantile edges (fractions between 0 and 1) are accepted too.
print('\n')
quantile_edges = [0, 0.1, 0.5, 0.9, 1.]
print(pd.qcut(data, quantile_edges))


[(-0.645, 0.014], (-3.671, -0.645], (-3.671, -0.645], (-0.645, 0.014], (-3.671, -0.645], ..., (0.652, 3.071], (-3.671, -0.645], (0.014, 0.652], (0.014, 0.652], (-3.671, -0.645]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.671, -0.645] < (-0.645, 0.014] < (0.014, 0.652] < (0.652, 3.071]]


(-3.671, -0.645]    250
(-0.645, 0.014]     250
(0.014, 0.652]      250
(0.652, 3.071]      250
Name: count, dtype: int64


[(-1.191, 0.014], (-3.671, -1.191], (-3.671, -1.191], (-1.191, 0.014], (-1.191, 0.014], ..., (1.296, 3.071], (-1.191, 0.014], (0.014, 1.296], (0.014, 1.296], (-1.191, 0.014]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.671, -1.191] < (-1.191, 0.014] < (0.014, 1.296] < (1.296, 3.071]]


In [63]:
# Detecting and Filtering Outliers

data = pd.DataFrame(np.random.randn(1000, 4))
summary = data.describe()
print(summary)

# Values in column 2 whose magnitude exceeds 3.
col = data[2]
print('\n')
col_is_outlier = col.abs() > 3
print(col[col_is_outlier])

# Rows in which at least one column has a magnitude above 3.
print('\n')
row_has_outlier = (data.abs() > 3).any(axis=1)
print(data[row_has_outlier])


                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.033010     0.016672     0.012142     0.022012
std       0.990666     1.012490     0.996610     1.005842
min      -3.263810    -3.843500    -3.065117    -3.010715
25%      -0.728516    -0.690827    -0.630444    -0.683149
50%      -0.015236     0.008915     0.023953     0.015071
75%       0.601612     0.658319     0.694385     0.698724
max       3.083048     3.251246     3.152629     3.988036


215    3.027740
447   -3.065117
809    3.152629
975    3.041676
Name: 2, dtype: float64


            0         1         2         3
19   3.083048  0.376179 -0.684811  1.322135
24  -3.255191 -0.954380 -0.198117  0.743064
34   1.133686  3.251246 -0.082881  1.252398
42  -0.669542  0.464018 -0.427926  3.134081
86   2.326094 -3.843500 -0.074533  0.857413
215  0.034498 -0.229680  3.027740  0.353640
428 -0.179374 -1.436013 -1.125471  3.988036
447 -0.069111 -1.328282 -3.065117 

In [65]:
# Cap every value whose magnitude exceeds 3 at +3 or -3, keeping its sign.
exceeds_limit = data.abs() > 3
data[exceeds_limit] = np.sign(data) * 3
print(data.describe())

# np.sign yields -1.0 / 1.0 according to the sign of each value.
print('\n')
signs = np.sign(data)
print(signs.head())

                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean     -0.032253     0.017175     0.011985     0.020900
std       0.987783     1.008578     0.995731     1.001982
min      -3.000000    -3.000000    -3.000000    -3.000000
25%      -0.728516    -0.690827    -0.630444    -0.683149
50%      -0.015236     0.008915     0.023953     0.015071
75%       0.601612     0.658319     0.694385     0.698724
max       3.000000     3.000000     3.000000     3.000000


     0    1    2    3
0  1.0  1.0 -1.0  1.0
1  1.0  1.0 -1.0 -1.0
2  1.0 -1.0  1.0 -1.0
3  1.0  1.0 -1.0  1.0
4  1.0  1.0 -1.0  1.0


In [69]:
# Permutation and Random Sampling

df = pd.DataFrame(np.arange(20).reshape(5, 4))

# A random ordering of the row positions 0..4.
sampler = np.random.permutation(5)

print(sampler)

print('\n')
print(df)

# take() picks rows by position, in the order given by sampler.
print('\n')
shuffled = df.take(sampler)
print(shuffled)

# sample(n=3) draws three rows at random.
print('\n')
three_rows = df.sample(n=3)
print(three_rows)

[0 3 2 1 4]


    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19


    0   1   2   3
0   0   1   2   3
3  12  13  14  15
2   8   9  10  11
1   4   5   6   7
4  16  17  18  19


    0   1   2   3
3  12  13  14  15
1   4   5   6   7
0   0   1   2   3


In [70]:
choices = pd.Series([5, 7, -1, 6, 4])
# replace=True samples with replacement, so ten draws can come from the
# five available values.
draws = choices.sample(10, replace=True)

print(draws)

2   -1
1    7
4    4
4    4
2   -1
2   -1
4    4
1    7
1    7
1    7
dtype: int64


In [73]:
# Computing Indicator/Dummy Variables

df = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"],
                   "data1": range(6)})

# One indicator column per distinct value of 'key'.
key_column = df['key']
print(pd.get_dummies(key_column))

# prefix='key' names the indicator columns key_a, key_b, key_c; they are
# then joined onto the data1 column.
dummies = pd.get_dummies(key_column, prefix='key')
data1_only = df[['data1']]
df_with_dummy = data1_only.join(dummies)

print('\n')
print(df_with_dummy)

       a      b      c
0  False   True  False
1  False   True  False
2   True  False  False
3  False  False   True
4   True  False  False
5  False   True  False


   data1  key_a  key_b  key_c
0      0  False   True  False
1      1  False   True  False
2      2   True  False  False
3      3  False  False   True
4      4   True  False  False
5      5  False   True  False


In [77]:
# Load the movies table: fields are separated by '::' and the file has no
# header row, so the column names are supplied explicitly. A multi-character
# separator is why engine='python' is requested.
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(
    'data/movielens/movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine='python',
)

print(movies.head(10))

   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy
5         6                         Heat (1995)         Action|Crime|Thriller
6         7                      Sabrina (1995)                Comedy|Romance
7         8                 Tom and Huck (1995)          Adventure|Children's
8         9                 Sudden Death (1995)                        Action
9        10                    GoldenEye (1995)     Action|Adventure|Thriller


In [81]:
# Flatten the '|'-separated genre strings of all movies into one list.
all_genres = [
    genre
    for genre_field in movies.genres
    for genre in genre_field.split('|')
]

# Distinct genre names, in order of first appearance.
genres = pd.unique(pd.Series(all_genres))

print(genres)

['Animation' "Children's" 'Comedy' 'Adventure' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Sci-Fi' 'Documentary' 'War'
 'Musical' 'Mystery' 'Film-Noir' 'Western']


In [84]:
# Start from an all-zero indicator table: one row per movie, one column per genre.
zero_matrix = np.zeros((len(movies), len(genres)))
dummies = pd.DataFrame(zero_matrix, columns=genres)

# Worked example on the first movie: its genre names and the positions of
# the matching columns in dummies.
gen = movies.genres[0]
first_movie_genres = gen.split('|')
print(first_movie_genres)

print('\n')
print(dummies.columns.get_indexer(first_movie_genres))

# For each movie, set the columns of its genres to 1 in that movie's row.
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

# Prefix the indicator columns with 'Genre_' and attach them to the movies table.
genre_indicators = dummies.add_prefix('Genre_')
movies_windic = movies.join(genre_indicators)
print('\n')
print(movies_windic.iloc[0])

['Animation', "Children's", 'Comedy']


[0 1 2]


movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                              1.0
Genre_Children's                             1.0
Genre_Comedy                                 1.0
Genre_Adventure                              0.0
Genre_Fantasy                                0.0
Genre_Romance                                0.0
Genre_Drama                                  0.0
Genre_Action                                 0.0
Genre_Crime                                  0.0
Genre_Thriller                               0.0
Genre_Horror                                 0.0
Genre_Sci-Fi                                 0.0
Genre_Documentary                            0.0
Genre_War                                    0.0
Genre_Musical                                0.0
Genre_Mystery                                0.0
Genre_Film-Noir    

In [87]:
# Fix the seed so the ten draws below are reproducible.
np.random.seed(12345)

values = np.random.rand(10)
print(values)

# Five equal-width bins covering (0, 1].
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

# Bin the values, then expand the bins into one indicator column per interval.
print('\n')
binned = pd.cut(values, bins)
print(pd.get_dummies(binned))

[0.92961609 0.31637555 0.18391881 0.20456028 0.56772503 0.5955447
 0.96451452 0.6531771  0.74890664 0.65356987]


   (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
0       False       False       False       False        True
1       False        True       False       False       False
2        True       False       False       False       False
3       False        True       False       False       False
4       False       False        True       False       False
5       False       False        True       False       False
6       False       False       False       False        True
7       False       False       False        True       False
8       False       False       False        True       False
9       False       False       False        True       False
