In [None]:
# 7.1 Handling Missing Data

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Float Series containing one missing entry, encoded as NaN.
float_data = pd.Series(data=[1.2, -3.5, np.nan, 0])

In [6]:
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [7]:
# Boolean mask that is True exactly where float_data holds a missing value.
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [8]:
# String Series with two kinds of missing marker: np.nan and None.
string_data = pd.Series(data=["aardvark", np.nan, None, "avocado"])

In [9]:
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [10]:
# Both np.nan and None are reported as missing.
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [11]:
# With an explicit float dtype the None entry is stored as NaN.
float_data = pd.Series(data=[1, 2, None], dtype="float64")

In [12]:
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [13]:
# The None passed to the float64 Series shows up as NaN and is flagged here.
float_data.isna()

0    False
1    False
2     True
dtype: bool

In [14]:
# Series with two gaps, used for the filtering examples that follow.
data = pd.Series(data=[1, np.nan, 3.5, np.nan, 7])

In [15]:
# Keep only the non-missing entries; their original labels (0, 2, 4) survive.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [16]:
# Same result as data.dropna(), written as boolean-mask filtering.
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [17]:
# 4x3 frame mixing a complete row, partially missing rows and an all-NaN row;
# the dropna() variants are contrasted on it.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)


In [18]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [19]:
# By default a row is dropped as soon as it contains any NaN; only row 0 is
# complete.
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [20]:
# how="all" drops only the rows in which every value is NaN (row 2 here).
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [21]:
 data[4] = np.nan

In [22]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [23]:
# Same rule along the other axis: drop only the columns that are entirely NaN
# (the column labelled 4).
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [24]:
# 7x3 frame of standard-normal draws. No seed is set before this point in the
# notebook as shown, so the values differ from run to run.
df = pd.DataFrame(np.random.standard_normal((7, 3)))

In [25]:
# Blank out the first four rows of column 1.
df.iloc[:4, 1] = np.nan

In [26]:
# Blank out the first two rows of column 2.
df.iloc[:2, 2] = np.nan

In [27]:
df

Unnamed: 0,0,1,2
0,-0.912653,,
1,1.760794,,
2,0.825719,,1.319391
3,-0.411429,,0.226936
4,0.297192,-1.426229,-0.044607
5,-0.66449,0.121837,1.190728
6,-0.509517,0.037788,0.314817


In [28]:
# Keep only the fully populated rows (rows 4-6 after the assignments above).
df.dropna()

Unnamed: 0,0,1,2
4,0.297192,-1.426229,-0.044607
5,-0.66449,0.121837,1.190728
6,-0.509517,0.037788,0.314817


In [29]:
# thresh=2 keeps the rows that have at least two non-NaN values.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.825719,,1.319391
3,-0.411429,,0.226936
4,0.297192,-1.426229,-0.044607
5,-0.66449,0.121837,1.190728
6,-0.509517,0.037788,0.314817


In [30]:
# Replace every NaN with the constant 0; the result is a new frame.
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.912653,0.0,0.0
1,1.760794,0.0,0.0
2,0.825719,0.0,1.319391
3,-0.411429,0.0,0.226936
4,0.297192,-1.426229,-0.044607
5,-0.66449,0.121837,1.190728
6,-0.509517,0.037788,0.314817


In [31]:
# A dict fills per column: NaNs in column 1 become 0.5, in column 2 become 0.
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.912653,0.5,0.0
1,1.760794,0.5,0.0
2,0.825719,0.5,1.319391
3,-0.411429,0.5,0.226936
4,0.297192,-1.426229,-0.044607
5,-0.66449,0.121837,1.190728
6,-0.509517,0.037788,0.314817


In [32]:
# Fresh 6x3 frame of standard-normal draws for the forward-fill examples.
df = pd.DataFrame(np.random.standard_normal((6, 3)))

In [33]:
# Column 1 is missing from row 2 onward.
df.iloc[2:, 1] = np.nan

In [34]:
# Column 2 is missing from row 4 onward.
df.iloc[4:, 2] = np.nan

In [None]:
df

In [None]:
# Forward fill: carry the last valid value of each column down into the NaNs.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
df.ffill()

In [None]:
# Forward fill, but fill at most two consecutive NaNs per gap.
# DataFrame.ffill(limit=...) replaces the deprecated
# fillna(method="ffill", limit=...) spelling.
df.ffill(limit=2)

In [None]:
# Series with two gaps, to be imputed with its own mean.
data = pd.Series(data=[1.0, np.nan, 3.5, np.nan, 7])

In [None]:
# Impute the gaps with the mean of the observed values (mean() skips NaN by
# default).
data.fillna(data.mean())

In [None]:
# 7.2 Data Transformation

In [None]:
# Two-column frame whose last row repeats the row before it, used for the
# duplicate-handling examples.
data = pd.DataFrame(
    {
        "k1": ["one", "two"] * 3 + ["two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)

In [None]:
data

In [None]:
# Boolean Series marking the rows that repeat an earlier row in full.
data.duplicated()

In [None]:
# Keep the first occurrence of each distinct row.
data.drop_duplicates()

In [None]:
# Extra column of distinct values 0..6, so no two full rows are equal any more.
data["v1"] = range(7)

In [None]:
# Judge duplicates on the "k1" column only.
data.drop_duplicates(subset=["k1"])

In [None]:
# Judge duplicates on ("k1", "k2") and keep the last row of each group rather
# than the first.
data.drop_duplicates(["k1", "k2"], keep="last")

In [None]:
# Food items with their weight in ounces; some foods appear more than once.
data = pd.DataFrame(
    {
        "food": [
            "bacon", "pulled pork", "bacon",
            "pastrami", "corned beef", "bacon",
            "pastrami", "honey ham", "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)

In [None]:
data

In [None]:
# Lookup table from each kind of meat to the animal it comes from.
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}

In [None]:
# Translate every food to its animal through the lookup dict and store the
# result as a new column.
data["animal"] = data["food"].map(meat_to_animal)

In [None]:
data

In [None]:
def get_animal(x):
    """Return the animal that the meat named ``x`` comes from.

    The name is looked up in the module-level ``meat_to_animal`` dict, so a
    name that is not a key of that dict raises ``KeyError``.
    """
    return meat_to_animal[x]

In [None]:
# map() also accepts a function; get_animal performs the same dict lookup.
data["food"].map(get_animal)

In [None]:
# Series in which -999 and -1000 act as sentinel values.
data = pd.Series(data=[1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])

In [None]:
data

In [None]:
# Turn the sentinel -999 into NaN.
data.replace(-999, np.nan)

In [None]:
# Several sentinels can be replaced by the same value in one call.
data.replace([-999, -1000], np.nan)

In [None]:
# Paired lists give one replacement per sentinel: -999 -> NaN, -1000 -> 0.
data.replace([-999, -1000], [np.nan, 0])

In [None]:
# The same per-value replacement expressed as a dict.
data.replace({-999: np.nan, -1000: 0})

In [None]:
# 3x4 frame of the integers 0..11 with string row and column labels, used for
# the axis-relabelling examples.
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)


In [None]:
def transform(x):
    """Return the first four characters of ``x`` converted to upper case.

    Inputs shorter than four characters are upper-cased in full.
    """
    return x[:4].upper()

In [None]:
# Apply transform to every row label; the result is a new Index and data is
# left unchanged.
data.index.map(transform)

In [None]:
# Assign the mapped labels back so that data itself carries the new index.
data.index = data.index.map(transform)

In [None]:
data

In [None]:
# rename() returns a relabelled copy: str.title is applied to the row labels
# and str.upper to the column names.
data.rename(index=str.title, columns=str.upper)

In [None]:
# With dicts only the listed labels are renamed; all other labels are kept.
data.rename(
    index={"OHIO": "INDIANA"},
    columns={"three": "peekaboo"},
)

In [None]:
# Sample ages to be grouped into discrete bins.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [None]:
# Bin edges; consecutive pairs delimit the intervals 18-25, 25-35, 35-60 and
# 60-100.
bins = [18, 25, 35, 60, 100]

In [None]:
# Assign every age to the interval between consecutive edges that contains it.
age_categories = pd.cut(ages, bins)

In [None]:
age_categories

In [None]:
# Only the last expression of a cell is echoed; the three attribute lookups
# below are evaluated but not displayed.
age_categories.codes
age_categories.categories
age_categories.categories[0]
# The top-level pd.value_counts() function is deprecated; wrap the
# categorical in a Series and call the value_counts() method instead.
pd.Series(age_categories).value_counts()

In [None]:
# right=False closes each interval on the left side instead of the right.
pd.cut(ages, bins, right=False)

In [None]:
# One readable label per interval, in bin order, used in place of the
# interval notation.
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
pd.cut(ages, bins, labels=group_names)

In [None]:
# An integer instead of explicit edges asks for that many equal-width bins
# over the range of the data; precision=2 limits the digits shown in the bin
# labels.
data = np.random.uniform(size=20)
pd.cut(data, 4, precision=2)

In [None]:
# qcut bins by sample quantiles, so the four bins hold equally many points.
data = np.random.standard_normal(1000)
quartiles = pd.qcut(data, 4, precision=2)
quartiles
# The top-level pd.value_counts() function is deprecated; count through a
# Series instead. (The original call was also missing its closing parenthesis.)
pd.Series(quartiles).value_counts()

In [None]:
# Explicit quantile edges (0-10%, 10-50%, 50-90%, 90-100%) instead of a bin
# count, followed by the number of points per bin.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

In [None]:
# 1000x4 frame of standard-normal draws for the outlier examples, followed by
# its summary statistics.
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()

In [None]:
# Values of column 2 whose magnitude exceeds 3.
col = data[2]
col[col.abs() > 3]

In [None]:
# Rows that contain at least one value whose magnitude exceeds 3.
data[(data.abs() > 3).any(axis="columns")]

In [None]:
# Cap the outliers in place: np.sign(data) * 3 is -3 or +3 depending on the
# sign, so every value beyond +/-3 is pulled back onto that boundary.
data[data.abs() > 3] = np.sign(data) * 3
data.describe()

In [None]:
# np.sign yields -1, 0 or 1 according to the sign of each value.
np.sign(data).head()

In [None]:
# 5x7 frame of the integers 0..34 and a random ordering of its five row
# positions. Only the last expression (sampler) is echoed.
df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))
df
sampler = np.random.permutation(5)
sampler

In [None]:
# Two ways to reorder the rows by integer position; only the second result is
# echoed because it is the cell's last expression.
df.take(sampler)
df.iloc[sampler]

In [None]:
# Same idea for the columns: a random ordering of the seven column positions,
# applied with take() along the column axis.
column_sampler = np.random.permutation(7)
column_sampler
df.take(column_sampler, axis="columns")

In [None]:
# Draw three rows at random (without replacement by default).
df.sample(n=3)

In [None]:
# replace=True allows repeats, so ten draws can be taken from five values.
choices = pd.Series([5, 7, -1, 6, 4])
choices.sample(n=10, replace=True)

In [None]:
# Small frame whose "key" column is expanded into indicator columns.
df = pd.DataFrame(
    data={
        "key": ["b", "b", "a", "c", "a", "b"],
        "data1": range(6),
    }
)
df
# One float indicator column per distinct key value.
pd.get_dummies(data=df["key"], dtype=float)

In [None]:
# prefix="key" puts that prefix on every indicator column name; the
# indicators are then joined onto the data1 column by index.
dummies = pd.get_dummies(df["key"], prefix="key", dtype=float)
df_with_dummy = df[["data1"]].join(dummies)
df_with_dummy

In [None]:
# Read the MovieLens movie table: "::"-separated fields, no header row, with
# column names supplied by mnames. engine="python" is passed because of the
# multi-character separator. The first ten rows are shown.
mnames = ["movie_id", "title", "genres"]
movies = pd.read_table("datasets/movielens/movies.dat", sep="::",
                       header=None, names=mnames, engine="python")
movies[:10]

In [None]:
# Split each genres string on "|" and expand it into one 0/1 column per genre;
# show the first ten rows and six columns.
dummies = movies["genres"].str.get_dummies("|")
dummies.iloc[:10, :6]

In [None]:
# Prefix the indicator columns with "Genre_", attach them to the movie table
# and inspect the first row.
movies_windic = movies.join(dummies.add_prefix("Genre_"))
movies_windic.iloc[0]

In [None]:
# Fixed seed so that the ten uniform draws, and hence the indicator matrix,
# are repeatable.
np.random.seed(12345)
values = np.random.uniform(size=10)
values
# Five equal-width intervals covering [0, 1].
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
# Bin the draws, then expand the bin membership into indicator columns.
pd.get_dummies(pd.cut(x=values, bins=bins))