In [2]:
# Load libraries 

import numpy as np
import pandas as pd

## Handling missing data

In [3]:
float_data = pd.Series([1,4,2.,np.nan,2])
float_data

0    1.0
1    4.0
2    2.0
3    NaN
4    2.0
dtype: float64

In [4]:
float_data.isna()

0    False
1    False
2    False
3     True
4    False
dtype: bool

In [5]:
float_data = pd.Series([1,4,2.,np.nan,2, None, 'NULL', 'NA'])
float_data

0       1
1       4
2     2.0
3     NaN
4       2
5    None
6    NULL
7      NA
dtype: object

In [6]:
float_data.isna()

0    False
1    False
2    False
3     True
4    False
5     True
6    False
7    False
dtype: bool

### Filtering out missing data

In [7]:
float_data.dropna()
# dropna() returns a new Series; float_data itself is not reassigned here.

0       1
1       4
2     2.0
4       2
6    NULL
7      NA
dtype: object

In [8]:
float_data[float_data.notna()]

0       1
1       4
2     2.0
4       2
6    NULL
7      NA
dtype: object

In [9]:
# Small frame whose rows range from fully populated to entirely missing,
# used below to demonstrate the different dropna() modes.
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                     [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [10]:
data.dropna() # If a row contains even a single NA, the whole row is dropped.

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [11]:
data.dropna(how='all') # Drop a row only when every value in it is NA.

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [12]:
df = pd.DataFrame(np.random.standard_normal((7,3)))
df

Unnamed: 0,0,1,2
0,0.241191,-0.12417,-0.422073
1,-0.820361,-0.36744,1.659685
2,-0.060324,0.362261,0.580583
3,-0.411538,0.069087,-0.77227
4,-0.895571,1.201073,-0.108876
5,1.903754,-1.103896,-0.033193
6,-0.012519,1.737178,-1.01591


In [13]:
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.241191,,
1,-0.820361,,
2,-0.060324,,0.580583
3,-0.411538,,-0.77227
4,-0.895571,1.201073,-0.108876
5,1.903754,-1.103896,-0.033193
6,-0.012519,1.737178,-1.01591


In [14]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.895571,1.201073,-0.108876
5,1.903754,-1.103896,-0.033193
6,-0.012519,1.737178,-1.01591


In [15]:
df.dropna( thresh=2)

Unnamed: 0,0,1,2
2,-0.060324,,0.580583
3,-0.411538,,-0.77227
4,-0.895571,1.201073,-0.108876
5,1.903754,-1.103896,-0.033193
6,-0.012519,1.737178,-1.01591


### Filling in the missing data

In [16]:
df

Unnamed: 0,0,1,2
0,0.241191,,
1,-0.820361,,
2,-0.060324,,0.580583
3,-0.411538,,-0.77227
4,-0.895571,1.201073,-0.108876
5,1.903754,-1.103896,-0.033193
6,-0.012519,1.737178,-1.01591


In [17]:
df.fillna(100.) # Fill with a scalar value

Unnamed: 0,0,1,2
0,0.241191,100.0,100.0
1,-0.820361,100.0,100.0
2,-0.060324,100.0,0.580583
3,-0.411538,100.0,-0.77227
4,-0.895571,1.201073,-0.108876
5,1.903754,-1.103896,-0.033193
6,-0.012519,1.737178,-1.01591


In [18]:
df.fillna({1:0.5, 2:6}) # fillna with dictionary

Unnamed: 0,0,1,2
0,0.241191,0.5,6.0
1,-0.820361,0.5,6.0
2,-0.060324,0.5,0.580583
3,-0.411538,0.5,-0.77227
4,-0.895571,1.201073,-0.108876
5,1.903754,-1.103896,-0.033193
6,-0.012519,1.737178,-1.01591


In [19]:
df = pd.DataFrame(np.random.standard_normal((6,3)))
df

Unnamed: 0,0,1,2
0,-0.803877,1.682091,-1.441161
1,-0.346045,1.540726,-2.456785
2,0.543731,0.991601,-0.096525
3,0.123042,-0.726019,-1.01886
4,-0.677942,0.367985,2.323189
5,-0.544242,-0.541476,2.626729


In [20]:
df.iloc[4:,1] = np.nan
df.iloc[2:,2] = np.nan
df

Unnamed: 0,0,1,2
0,-0.803877,1.682091,-1.441161
1,-0.346045,1.540726,-2.456785
2,0.543731,0.991601,
3,0.123042,-0.726019,
4,-0.677942,,
5,-0.544242,,


In [21]:
df.ffill() # Fill forward; replaces the deprecated fillna(method='ffill')

Unnamed: 0,0,1,2
0,-0.803877,1.682091,-1.441161
1,-0.346045,1.540726,-2.456785
2,0.543731,0.991601,-2.456785
3,0.123042,-0.726019,-2.456785
4,-0.677942,-0.726019,-2.456785
5,-0.544242,-0.726019,-2.456785


In [22]:
df.ffill(limit=2) # Fill forward, but at most 2 consecutive NA values per gap

Unnamed: 0,0,1,2
0,-0.803877,1.682091,-1.441161
1,-0.346045,1.540726,-2.456785
2,0.543731,0.991601,-2.456785
3,0.123042,-0.726019,-2.456785
4,-0.677942,-0.726019,
5,-0.544242,-0.726019,


In [23]:
df.fillna(df.mean()) # fill with mean

Unnamed: 0,0,1,2
0,-0.803877,1.682091,-1.441161
1,-0.346045,1.540726,-2.456785
2,0.543731,0.991601,-1.948973
3,0.123042,-0.726019,-1.948973
4,-0.677942,0.872099,-1.948973
5,-0.544242,0.872099,-1.948973


In [24]:
df.mean()

0   -0.284222
1    0.872099
2   -1.948973
dtype: float64

## Data Transformation

In [25]:
## Removing duplicates 
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"],"k2": [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [26]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [27]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [28]:
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [29]:
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [30]:
data.drop_duplicates(subset=['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


## Transforming Data Using a Function or Mapping

In [31]:
# Example frame of meats and their weights, used to demonstrate Series.map.
data = pd.DataFrame({
    "food": ["bacon", "pulled pork", "bacon",
             "pastrami", "corned beef", "bacon",
             "pastrami", "honey ham", "nova lox"],
    "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [32]:
meat_to_animal = {
 "bacon": "pig",
 "pulled pork": "pig",
 "pastrami": "cow",
 "corned beef": "cow",
 "honey ham": "pig",
 "nova lox": "salmon"
}

In [33]:
def f(x):
    """Return the animal that the meat named ``x`` comes from.

    The lookup goes through the module-level ``meat_to_animal`` dict, so a
    name that is not a key there raises ``KeyError``.
    """
    animal = meat_to_animal[x]
    return animal

In [34]:
data['animal'] = data['food'].map(f)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


## Replacing Values

In [35]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [36]:
data.replace(-999,10)

0       1.0
1      10.0
2       2.0
3      10.0
4   -1000.0
5       3.0
dtype: float64

In [37]:
data.replace([-999,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [38]:
data.replace([-999,-1000],[np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [39]:
data.replace({-999:100,-1000:np.nan})

0      1.0
1    100.0
2      2.0
3    100.0
4      NaN
5      3.0
dtype: float64

### Renaming Axis Indexes

In [43]:
# 3x4 frame of consecutive integers with string row/column labels,
# used to demonstrate renaming of axis indexes.
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [44]:
def transform(x):
    """Return the label ``x`` converted to upper case."""
    upper_cased = x.upper()
    return upper_cased

data.index.map(transform)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

In [45]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [46]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [47]:
data.rename(index=str.title, columns=str.upper) # Returns a renamed copy; data is not reassigned

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [48]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


### Discretization and Binning

In [49]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

In [51]:
ages_categories = pd.cut(ages, bins)
ages_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [53]:
ages_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [54]:
ages_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [56]:
ages_categories.categories[0] # "]" marks an inclusive bound, "(" an exclusive one

Interval(18, 25, closed='right')

In [57]:
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]

In [58]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [59]:
data = np.random.uniform(size = 20)
data

array([0.57533124, 0.40557723, 0.72361713, 0.86038129, 0.78235826,
       0.80854932, 0.50869853, 0.51494098, 0.19417043, 0.48543245,
       0.16877746, 0.20676666, 0.11226428, 0.91334758, 0.50339288,
       0.35309575, 0.9100039 , 0.7114949 , 0.82785088, 0.05730066])

In [61]:
nor = pd.cut(data, 4, precision=2)
nor

[(0.49, 0.7], (0.27, 0.49], (0.7, 0.91], (0.7, 0.91], (0.7, 0.91], ..., (0.27, 0.49], (0.7, 0.91], (0.7, 0.91], (0.7, 0.91], (0.056, 0.27]]
Length: 20
Categories (4, interval[float64, right]): [(0.056, 0.27] < (0.27, 0.49] < (0.49, 0.7] < (0.7, 0.91]]

In [62]:
nor.codes

array([2, 1, 3, 3, 3, 3, 2, 2, 0, 2, 0, 0, 0, 3, 2, 1, 3, 3, 3, 0],
      dtype=int8)

In [63]:
nor.categories

IntervalIndex([(0.056, 0.27], (0.27, 0.49], (0.49, 0.7], (0.7, 0.91]], dtype='interval[float64, right]')

In [64]:
data = np.random.standard_normal(1000)

# NOTE(review): despite the name, pd.cut with an integer bin count splits the
# value range into equal-width bins, not quartiles; pd.qcut is the
# quantile-based variant.
quartiles = pd.cut(data, 4, precision=2)
quartiles

[(1.64, 3.17], (-1.41, 0.12], (-1.41, 0.12], (-1.41, 0.12], (-2.94, -1.41], ..., (0.12, 1.64], (0.12, 1.64], (-1.41, 0.12], (0.12, 1.64], (0.12, 1.64]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.94, -1.41] < (-1.41, 0.12] < (0.12, 1.64] < (1.64, 3.17]]

In [65]:
# Count observations per bin, most frequent first.
# The top-level pd.value_counts() is deprecated; go through a Series instead.
pd.Series(quartiles).value_counts()

(-1.41, 0.12]     468
(0.12, 1.64]      390
(-2.94, -1.41]     97
(1.64, 3.17]       45
Name: count, dtype: int64

In [66]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

(-2.939, -1.392]     100
(-1.392, -0.0789]    400
(-0.0789, 1.275]     400
(1.275, 3.17]        100
Name: count, dtype: int64

### Detecting and filtering outliers

In [67]:
data = pd.DataFrame(np.random.standard_normal((1000,4)))
data

Unnamed: 0,0,1,2,3
0,-0.887064,-1.354290,1.254145,-0.187443
1,0.784040,-0.280997,0.565044,-0.290985
2,2.034066,0.279668,-1.063195,0.352107
3,0.670436,0.355248,-0.196630,-0.539345
4,0.236526,0.398929,1.650915,1.723530
...,...,...,...,...
995,-2.079873,0.719011,0.957993,0.526545
996,-1.117084,0.610638,0.742614,1.433616
997,-0.019564,-1.426104,-0.038725,1.188377
998,0.236351,0.842103,0.057925,-0.954968


In [68]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.034139,-0.003585,0.033945,-0.012461
std,0.981699,1.023555,1.007274,1.021784
min,-3.283594,-3.187698,-2.775576,-3.489003
25%,-0.657454,-0.701048,-0.648866,-0.681554
50%,0.021648,-0.049355,0.025576,0.020093
75%,0.68761,0.708669,0.705417,0.690886
max,2.870351,3.581964,3.550972,3.31011


In [69]:
col = data[2]
col

0      1.254145
1      0.565044
2     -1.063195
3     -0.196630
4      1.650915
         ...   
995    0.957993
996    0.742614
997   -0.038725
998    0.057925
999    0.077097
Name: 2, Length: 1000, dtype: float64

In [72]:
col[col.abs() >3]

512    3.192151
952    3.550972
Name: 2, dtype: float64

In [76]:
# Select every row containing at least one value greater than 3 or less than -3

data[(data.abs()>3).any(axis="columns")]

Unnamed: 0,0,1,2,3
216,-1.442182,-3.187698,0.310089,1.538104
298,0.668046,-0.350033,0.573433,-3.489003
306,0.482351,2.702099,1.104628,-3.212979
436,-1.183708,3.581964,0.507583,-0.423677
512,-1.207773,1.15003,3.192151,-3.347486
584,-3.038244,-0.50932,0.368957,-1.608802
808,-0.550706,-0.450768,-2.37118,3.31011
871,-3.283594,-0.788756,0.997972,0.78221
952,-0.184731,0.54082,3.550972,2.275421


In [80]:
# Cap outliers in place: every value whose magnitude exceeds 3 becomes -3 or 3.
data[data.abs()>3] = np.sign(data)*3 # np.sign is -1 or 1 for the masked values
data

Unnamed: 0,0,1,2,3
0,-0.887064,-1.354290,1.254145,-0.187443
1,0.784040,-0.280997,0.565044,-0.290985
2,2.034066,0.279668,-1.063195,0.352107
3,0.670436,0.355248,-0.196630,-0.539345
4,0.236526,0.398929,1.650915,1.723530
...,...,...,...,...
995,-2.079873,0.719011,0.957993,0.526545
996,-1.117084,0.610638,0.742614,1.433616
997,-0.019564,-1.426104,-0.038725,1.188377
998,0.236351,0.842103,0.057925,-0.954968


In [81]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.034461,-0.003979,0.033202,-0.011721
std,0.980661,1.02111,1.004911,1.017542
min,-3.0,-3.0,-2.775576,-3.0
25%,-0.657454,-0.701048,-0.648866,-0.681554
50%,0.021648,-0.049355,0.025576,0.020093
75%,0.68761,0.708669,0.705417,0.690886
max,2.870351,3.0,3.0,3.0


### Permutation and Random Sampling

In [82]:
data = pd.DataFrame(np.arange(35).reshape(5,7))
data

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [83]:
sampler = np.random.permutation(5)
sampler

array([2, 3, 4, 0, 1])

In [85]:
data.take(sampler)

Unnamed: 0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13


In [86]:
data.iloc[sampler]

Unnamed: 0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13


In [87]:
column_sampler = np.random.permutation(7)
column_sampler

array([4, 0, 3, 1, 6, 2, 5])

In [89]:
data.take(column_sampler, axis="columns")

Unnamed: 0,4,0,3,1,6,2,5
0,4,0,3,1,6,2,5
1,11,7,10,8,13,9,12
2,18,14,17,15,20,16,19
3,25,21,24,22,27,23,26
4,32,28,31,29,34,30,33


In [90]:
choices = pd.Series([5,7,-1,6,4])
choices

0    5
1    7
2   -1
3    6
4    4
dtype: int64

In [92]:
choices.sample(n=10, replace=True)

2   -1
4    4
0    5
2   -1
3    6
2   -1
3    6
0    5
1    7
1    7
dtype: int64

In [95]:
# Converting categorical variable into dummy or indicator matrix

data = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"],
                   "values":range(6)})
data

Unnamed: 0,key,values
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [102]:
out = pd.get_dummies(data['key'],)
out

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [114]:
def f(x):
    """Convert a single value to a built-in ``int`` (``True`` -> 1, ``False`` -> 0).

    NOTE(review): this reuses the name ``f`` and shadows the meat-lookup
    helper defined earlier in the notebook.
    """
    as_integer = int(x)
    return as_integer


In [115]:
# Show the boolean indicators as 0/1 integers. DataFrame.applymap is
# deprecated; a vectorised astype(int) gives the same result.
out.astype(int)

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [118]:
# Indicator columns named "key_<value>"; dummies keeps its boolean dtype and
# only the displayed result is cast to 0/1 integers (applymap is deprecated).
dummies = pd.get_dummies(data['key'], prefix="key")
dummies.astype(int)

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [119]:
# Attach the 0/1 indicator columns to the numeric column (astype replaces the deprecated applymap).
data[['values']].join(dummies.astype(int))

Unnamed: 0,values,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [120]:
mnames = ['movie_id', 'title', 'genres']
mnames

['movie_id', 'title', 'genres']

In [122]:
# Load the "::"-delimited movies file; a multi-character separator requires
# the python parsing engine.
movies = pd.read_table(
    'movies.dat',
    sep="::",
    header=None,
    names=mnames,
    engine="python",
    encoding="ISO-8859-1",
)
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [123]:
dummies = movies['genres'].str.get_dummies("|")

In [125]:
dummies.iloc[:10, :6]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime
0,0,0,1,1,1,0
1,0,1,0,1,0,0
2,0,0,0,0,1,0
3,0,0,0,0,1,0
4,0,0,0,0,1,0
5,1,0,0,0,0,1
6,0,0,0,0,1,0
7,0,1,0,1,0,0
8,1,0,0,0,0,0
9,1,1,0,0,0,0


In [126]:
# Join the prefixed genre indicator columns onto the movies frame.
# NOTE(review): "Gener_" looks like a typo for "Genre_"; left unchanged so the
# column names keep matching the rendered output below — confirm before renaming.
movies_windic = movies.join(dummies.add_prefix("Gener_"))
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Gener_Action                                   0
Gener_Adventure                                0
Gener_Animation                                1
Gener_Children's                               1
Gener_Comedy                                   1
Gener_Crime                                    0
Gener_Documentary                              0
Gener_Drama                                    0
Gener_Fantasy                                  0
Gener_Film-Noir                                0
Gener_Horror                                   0
Gener_Musical                                  0
Gener_Mystery                                  0
Gener_Romance                                  0
Gener_Sci-Fi                                   0
Gener_Thriller                                 0
Gener_War                                      0
Gener_Western       