In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 7.1 Handling Missing Data

In [15]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado', None])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
4         None
dtype: object

In [16]:
#isnull
#isna
#notnull
#notna
    #~isnull
    #~isna

In [17]:
print(string_data)
string_data.isnull()

0     aardvark
1    artichoke
2          NaN
3      avocado
4         None
dtype: object


0    False
1    False
2     True
3    False
4     True
dtype: bool

In [18]:
print(string_data)
string_data.isna()

0     aardvark
1    artichoke
2          NaN
3      avocado
4         None
dtype: object


0    False
1    False
2     True
3    False
4     True
dtype: bool

In [19]:
print(string_data)
string_data.notnull()

0     aardvark
1    artichoke
2          NaN
3      avocado
4         None
dtype: object


0     True
1     True
2    False
3     True
4    False
dtype: bool

In [20]:
print(string_data)
string_data.notna()

0     aardvark
1    artichoke
2          NaN
3      avocado
4         None
dtype: object


0     True
1     True
2    False
3     True
4    False
dtype: bool

In [21]:
print(string_data)
string_data[string_data.notnull()]
string_data[string_data.notna()]

string_data[~string_data.isnull()]
string_data[~string_data.isna()]

string_data[(string_data.notna()) | (string_data.notnull())]
string_data[(~string_data.isnull()) | (~string_data.isna())]

0     aardvark
1    artichoke
2          NaN
3      avocado
4         None
dtype: object


0     aardvark
1    artichoke
3      avocado
dtype: object

In [25]:
print(string_data)
string_data[string_data.isnull()]
string_data[string_data.isna()]

string_data[~string_data.notnull()]
string_data[~string_data.notna()]

string_data[(string_data.isna()) | (string_data.isnull())]
string_data[(~string_data.notnull()) | (~string_data.notna())]

0     aardvark
1    artichoke
2          NaN
3      avocado
4         None
dtype: object


2     NaN
4    None
dtype: object

# Table 7-1. NA handling methods

In [26]:
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [27]:
print(data)
data.dropna()

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64


0    1.0
2    3.5
4    7.0
dtype: float64

In [28]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [29]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [30]:
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [31]:
print(data)
data.dropna(how='all')

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [32]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [33]:
data.dropna(axis=1, how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [43]:
a = data.notnull()
a[1] = True
print(data)
data[a.any(1)]


     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN


Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [39]:
a[1]

0     True
1    False
2    False
3     True
Name: 1, dtype: bool

In [34]:
data.dropna(axis=1, how="any")

0
1
2
3


In [35]:
df = pd.DataFrame(np.random.randn(10, 3))
df

Unnamed: 0,0,1,2
0,0.665479,-0.060842,0.001493
1,-0.85079,-0.433324,-1.110633
2,0.130581,0.32917,-0.544605
3,1.066937,-0.531684,0.493165
4,-0.912124,0.558521,-0.082402
5,-0.293482,1.124538,-1.059928
6,-0.861221,0.088222,-0.363365
7,0.341094,0.464822,-1.077764
8,-0.47278,-0.959349,1.875812
9,-0.209755,1.792539,-1.048965


In [44]:
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,0.665479,,
1,-0.85079,,
2,0.130581,,-0.544605
3,1.066937,,0.493165
4,-0.912124,0.558521,-0.082402
5,-0.293482,1.124538,-1.059928
6,-0.861221,0.088222,-0.363365
7,0.341094,0.464822,-1.077764
8,-0.47278,-0.959349,1.875812
9,-0.209755,1.792539,-1.048965


In [45]:
print(df)
df.dropna()

          0         1         2
0  0.665479       NaN       NaN
1 -0.850790       NaN       NaN
2  0.130581       NaN -0.544605
3  1.066937       NaN  0.493165
4 -0.912124  0.558521 -0.082402
5 -0.293482  1.124538 -1.059928
6 -0.861221  0.088222 -0.363365
7  0.341094  0.464822 -1.077764
8 -0.472780 -0.959349  1.875812
9 -0.209755  1.792539 -1.048965


Unnamed: 0,0,1,2
4,-0.912124,0.558521,-0.082402
5,-0.293482,1.124538,-1.059928
6,-0.861221,0.088222,-0.363365
7,0.341094,0.464822,-1.077764
8,-0.47278,-0.959349,1.875812
9,-0.209755,1.792539,-1.048965


In [47]:
print(df)
df.dropna(thresh=2)

          0         1         2
0  0.665479       NaN       NaN
1 -0.850790       NaN       NaN
2  0.130581       NaN -0.544605
3  1.066937       NaN  0.493165
4 -0.912124  0.558521 -0.082402
5 -0.293482  1.124538 -1.059928
6 -0.861221  0.088222 -0.363365
7  0.341094  0.464822 -1.077764
8 -0.472780 -0.959349  1.875812
9 -0.209755  1.792539 -1.048965


Unnamed: 0,0,1,2
2,0.130581,,-0.544605
3,1.066937,,0.493165
4,-0.912124,0.558521,-0.082402
5,-0.293482,1.124538,-1.059928
6,-0.861221,0.088222,-0.363365
7,0.341094,0.464822,-1.077764
8,-0.47278,-0.959349,1.875812
9,-0.209755,1.792539,-1.048965


# Filling In Missing Data


In [48]:
print(df)

          0         1         2
0  0.665479       NaN       NaN
1 -0.850790       NaN       NaN
2  0.130581       NaN -0.544605
3  1.066937       NaN  0.493165
4 -0.912124  0.558521 -0.082402
5 -0.293482  1.124538 -1.059928
6 -0.861221  0.088222 -0.363365
7  0.341094  0.464822 -1.077764
8 -0.472780 -0.959349  1.875812
9 -0.209755  1.792539 -1.048965


In [49]:
print(df)
df.fillna(0)

          0         1         2
0  0.665479       NaN       NaN
1 -0.850790       NaN       NaN
2  0.130581       NaN -0.544605
3  1.066937       NaN  0.493165
4 -0.912124  0.558521 -0.082402
5 -0.293482  1.124538 -1.059928
6 -0.861221  0.088222 -0.363365
7  0.341094  0.464822 -1.077764
8 -0.472780 -0.959349  1.875812
9 -0.209755  1.792539 -1.048965


Unnamed: 0,0,1,2
0,0.665479,0.0,0.0
1,-0.85079,0.0,0.0
2,0.130581,0.0,-0.544605
3,1.066937,0.0,0.493165
4,-0.912124,0.558521,-0.082402
5,-0.293482,1.124538,-1.059928
6,-0.861221,0.088222,-0.363365
7,0.341094,0.464822,-1.077764
8,-0.47278,-0.959349,1.875812
9,-0.209755,1.792539,-1.048965


In [50]:
df.fillna({1:7, 2:8})

Unnamed: 0,0,1,2
0,0.665479,7.0,8.0
1,-0.85079,7.0,8.0
2,0.130581,7.0,-0.544605
3,1.066937,7.0,0.493165
4,-0.912124,0.558521,-0.082402
5,-0.293482,1.124538,-1.059928
6,-0.861221,0.088222,-0.363365
7,0.341094,0.464822,-1.077764
8,-0.47278,-0.959349,1.875812
9,-0.209755,1.792539,-1.048965


In [53]:
df[1].notnull().sum() / len(df)

0.6

In [54]:
df.mean()

0   -0.139606
1    0.511549
2   -0.226007
dtype: float64

In [55]:
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,0.665479,0.511549,-0.226007
1,-0.85079,0.511549,-0.226007
2,0.130581,0.511549,-0.544605
3,1.066937,0.511549,0.493165
4,-0.912124,0.558521,-0.082402
5,-0.293482,1.124538,-1.059928
6,-0.861221,0.088222,-0.363365
7,0.341094,0.464822,-1.077764
8,-0.47278,-0.959349,1.875812
9,-0.209755,1.792539,-1.048965


In [56]:
df.head()

Unnamed: 0,0,1,2
0,0.665479,,
1,-0.85079,,
2,0.130581,,-0.544605
3,1.066937,,0.493165
4,-0.912124,0.558521,-0.082402


In [57]:
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.665479,0.0,0.0
1,-0.85079,0.0,0.0
2,0.130581,0.0,-0.544605
3,1.066937,0.0,0.493165
4,-0.912124,0.558521,-0.082402
5,-0.293482,1.124538,-1.059928
6,-0.861221,0.088222,-0.363365
7,0.341094,0.464822,-1.077764
8,-0.47278,-0.959349,1.875812
9,-0.209755,1.792539,-1.048965


In [63]:
df

Unnamed: 0,A,B,C
0,-0.137414,0.665976,-1.379565
1,0.708705,-1.009767,0.246167
2,-0.867084,-1.28925,1.741698
3,-0.150554,0.265544,-0.726902
4,0.186725,0.214529,-0.435985
5,-0.069532,0.947609,-0.534237
6,0.977887,0.058692,1.034017
7,0.121569,0.460611,0.482037
8,1.218895,-2.004051,-0.096025


In [62]:
df = pd.DataFrame(np.random.randn(9, 3), columns=["A","B","C"])
df.iloc[1:6,0] # indexing with number like numpy slicing


1    0.708705
2   -0.867084
3   -0.150554
4    0.186725
5   -0.069532
Name: A, dtype: float64

In [64]:
df = pd.DataFrame(np.random.randn(9, 3), columns=["A","B","C"])
df.loc[1:6,["A","C"]] # indexing with number like numpy slicing


Unnamed: 0,A,C
1,-1.612874,-1.122245
2,-1.013422,-1.630173
3,1.473592,-0.593895
4,-1.460795,-0.522552
5,-0.367635,0.03824
6,1.542258,1.618051


In [65]:
df = pd.DataFrame(np.random.randn(9, 3))
df.iloc[2:-2, 1] = NA
df.iloc[4:-1, 2] = NA
df

Unnamed: 0,0,1,2
0,0.020316,-0.940747,0.368698
1,0.375648,0.310586,-0.701541
2,1.675981,,0.127723
3,-0.236656,,1.324719
4,-0.859853,,
5,-0.932076,,
6,-0.489084,,
7,1.012582,0.289574,
8,-0.578619,-0.438523,0.561989


In [66]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.020316,-0.940747,0.368698
1,0.375648,0.310586,-0.701541
2,1.675981,0.310586,0.127723
3,-0.236656,0.310586,1.324719
4,-0.859853,0.310586,1.324719
5,-0.932076,0.310586,1.324719
6,-0.489084,0.310586,1.324719
7,1.012582,0.289574,1.324719
8,-0.578619,-0.438523,0.561989


In [67]:
df.fillna(method='bfill')

Unnamed: 0,0,1,2
0,0.020316,-0.940747,0.368698
1,0.375648,0.310586,-0.701541
2,1.675981,0.289574,0.127723
3,-0.236656,0.289574,1.324719
4,-0.859853,0.289574,0.561989
5,-0.932076,0.289574,0.561989
6,-0.489084,0.289574,0.561989
7,1.012582,0.289574,0.561989
8,-0.578619,-0.438523,0.561989


In [68]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,0.020316,-0.940747,0.368698
1,0.375648,0.310586,-0.701541
2,1.675981,0.310586,0.127723
3,-0.236656,0.310586,1.324719
4,-0.859853,,1.324719
5,-0.932076,,1.324719
6,-0.489084,,
7,1.012582,0.289574,
8,-0.578619,-0.438523,0.561989


In [73]:
df.fillna(df.iloc[[0,-1]].mean())

Unnamed: 0,0,1,2
0,0.020316,-0.940747,0.368698
1,0.375648,0.310586,-0.701541
2,1.675981,-0.689635,0.127723
3,-0.236656,-0.689635,1.324719
4,-0.859853,-0.689635,0.465343
5,-0.932076,-0.689635,0.465343
6,-0.489084,-0.689635,0.465343
7,1.012582,0.289574,0.465343
8,-0.578619,-0.438523,0.561989


In [74]:
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# 7.2 Data Transformation


In [75]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [77]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [78]:
data[data.duplicated()]

Unnamed: 0,k1,k2
6,two,4


In [79]:
data[~data.duplicated()]

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [76]:
print(data)
data.drop_duplicates()

    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4


Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [80]:
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [81]:
print(data)
data.drop_duplicates(subset=["k1"])

    k1  k2  v1
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
5  two   4   5
6  two   4   6


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [83]:
print(data)
data.drop_duplicates(subset=["k1"], keep='last')

    k1  k2  v1
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
5  two   4   5
6  two   4   6


Unnamed: 0,k1,k2,v1
4,one,3,4
6,two,4,6


In [82]:
print(data)
data.drop_duplicates(subset=["k1","k2"])

    k1  k2  v1
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
5  two   4   5
6  two   4   6


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5


### Transforming Data Using a Function or Mapping

In [84]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [86]:
meat_to_animal = {
 'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'
}
data['animal'] = data.food.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,
4,corned beef,7.5,cow
5,Bacon,8.0,
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [87]:
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [88]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

# Replacing Values


In [89]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [90]:
print(data)
data.replace(-999, np.nan)

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64


0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [91]:
print(data)
data.replace([-999,-1000], np.nan)

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64


0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [93]:
print(data)
data.replace([-999,-1000], [100,200])

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64


0      1.0
1    100.0
2      2.0
3    100.0
4    200.0
5      3.0
dtype: float64

In [94]:
print(data)
data.replace({-999:100,-1000:300})

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64


0      1.0
1    100.0
2      2.0
3    100.0
4    300.0
5      3.0
dtype: float64

# Renaming Axis Indexes

In [103]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [104]:
transform = lambda x: x[:4].upper()
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [105]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [106]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [108]:
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [109]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)

In [110]:
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


# Discretization and Binning

In [112]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18,25,35,50,60,120]
cat = pd.cut(ages, bins)
cat

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 120], (35, 50], (35, 50], (25, 35]]
Length: 12
Categories (5, interval[int64]): [(18, 25] < (25, 35] < (35, 50] < (50, 60] < (60, 120]]

In [113]:
cat.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 4, 2, 2, 1], dtype=int8)

In [114]:
cat.categories

IntervalIndex([(18, 25], (25, 35], (35, 50], (50, 60], (60, 120]],
              closed='right',
              dtype='interval[int64]')

In [115]:
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 120), [35, 50), [35, 50), [25, 35)]
Length: 12
Categories (5, interval[int64]): [[18, 25) < [25, 35) < [35, 50) < [50, 60) < [60, 120)]

In [117]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [0,.5, .75, .9, 1]
pd.qcut(ages, bins).value_counts()

(19.999, 29.0]    6
(29.0, 38.0]      3
(38.0, 44.6]      1
(44.6, 61.0]      2
dtype: int64

## Detecting and Filtering Outliers


In [118]:
data = pd.DataFrame(np.random.randn(1000, 4))
data

Unnamed: 0,0,1,2,3
0,0.767712,-0.469772,0.249250,0.878506
1,1.794054,1.244375,-1.845974,-0.528469
2,-0.958615,-1.587007,0.340827,-0.059512
3,-0.275209,0.630122,0.171549,0.133010
4,-0.486673,-0.927338,-1.738145,0.624909
...,...,...,...,...
995,2.154025,-0.168481,-1.374168,-1.074865
996,0.644794,-0.826750,0.735990,0.765491
997,0.225223,0.502254,-0.375580,1.996605
998,0.913600,0.813377,1.914590,0.593962


In [119]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.004309,0.015545,-0.007318,-0.023916
std,1.000519,0.961561,1.017554,0.942982
min,-3.26038,-2.633195,-2.932966,-3.505991
25%,-0.689865,-0.630651,-0.733187,-0.670161
50%,0.003487,0.01852,-0.04119,-0.065816
75%,0.674793,0.671952,0.68991,0.635625
max,3.399816,2.689341,3.423626,3.028487


In [121]:
col = data[2]
col

0      0.249250
1     -1.845974
2      0.340827
3      0.171549
4     -1.738145
         ...   
995   -1.374168
996    0.735990
997   -0.375580
998    1.914590
999   -0.895059
Name: 2, Length: 1000, dtype: float64

In [122]:
col[np.abs(col) > 3]


119    3.078348
154    3.044130
312    3.423626
738    3.177045
Name: 2, dtype: float64

In [123]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
119,-1.000981,1.147991,3.078348,-0.117907
150,-3.26038,-0.501289,1.392055,-0.055032
154,0.67439,1.579572,3.04413,-1.197135
312,-0.670444,0.07374,3.423626,0.316307
384,-3.219875,-0.167007,-0.767962,0.15143
738,0.826466,-0.235991,3.177045,-0.291
786,3.399816,1.529809,-0.755585,-1.02871
889,0.325454,0.517962,-1.018337,-3.505991
919,0.760022,2.089573,1.658919,3.028487


In [124]:
data[np.abs(data) > 3] = np.sign(data) * 3

In [125]:
data

Unnamed: 0,0,1,2,3
0,0.767712,-0.469772,0.249250,0.878506
1,1.794054,1.244375,-1.845974,-0.528469
2,-0.958615,-1.587007,0.340827,-0.059512
3,-0.275209,0.630122,0.171549,0.133010
4,-0.486673,-0.927338,-1.738145,0.624909
...,...,...,...,...
995,2.154025,-0.168481,-1.374168,-1.074865
996,0.644794,-0.826750,0.735990,0.765491
997,0.225223,0.502254,-0.375580,1.996605
998,0.913600,0.813377,1.914590,0.593962


In [126]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.004228,0.015545,-0.008041,-0.023439
std,0.997736,0.961561,1.015304,0.941154
min,-3.0,-2.633195,-2.932966,-3.0
25%,-0.689865,-0.630651,-0.733187,-0.670161
50%,0.003487,0.01852,-0.04119,-0.065816
75%,0.674793,0.671952,0.68991,0.635625
max,3.0,2.689341,3.0,3.0


In [127]:
np.sign(data).head()


Unnamed: 0,0,1,2,3
0,1.0,-1.0,1.0,1.0
1,1.0,1.0,-1.0,-1.0
2,-1.0,-1.0,1.0,-1.0
3,-1.0,1.0,1.0,1.0
4,-1.0,-1.0,-1.0,1.0


# Permutation and Random Sampling


In [128]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [133]:
sampler = np.random.permutation(5)
sampler

array([1, 2, 0, 4, 3])

In [135]:
sampler = np.random.permutation(5)
sampler
df.take(sampler)

Unnamed: 0,0,1,2,3
2,8,9,10,11
0,0,1,2,3
4,16,17,18,19
1,4,5,6,7
3,12,13,14,15


In [136]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
2,8,9,10,11


In [137]:
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws

0    5
2   -1
0    5
1    7
0    5
3    6
1    7
1    7
1    7
3    6
dtype: int64

## Computing Indicator/Dummy Variables


In [138]:
a = pd.Series(['A',"C",'C',"A","B"])
a

0    A
1    C
2    C
3    A
4    B
dtype: object

In [139]:
pd.get_dummies(a)

Unnamed: 0,A,B,C
0,1,0,0
1,0,0,1
2,0,0,1
3,1,0,0
4,0,1,0


In [140]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [146]:
data.take(data.iloc[:,[0,1]].dropna().index)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
