In [1]:
import numpy as np
import pandas as pd


### Handling Missing Data

isnull(), notnull()

In [3]:
s = pd.Series(["red", "blue", np.nan, "green"])
s

0      red
1     blue
2      NaN
3    green
dtype: object

In [4]:
s.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
s[0] = None
s

0     None
1     blue
2      NaN
3    green
dtype: object

In [6]:
s.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### Filtering Missing Data

dropna()

In [8]:
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])

In [9]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [10]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [13]:
data.dropna(how='all') # Drops rows which contains all NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [14]:
data[4] = NA # Makes entire column NA

In [15]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [16]:
data.dropna(how='all', axis=1) # Drop column wise

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,-0.82022,-0.954165,0.397929
1,-0.146679,0.770131,0.444475
2,0.605091,1.915129,-1.265928
3,1.435799,-0.052863,0.659507
4,-1.852353,-1.42112,-0.565956
5,1.861511,1.660407,0.212956
6,-0.914062,-0.08798,-0.035469


In [21]:
df.iloc[:4, 1] = NA  # [row, col]
df.iloc[:2, 2] = NA

In [22]:
df

Unnamed: 0,0,1,2
0,-0.82022,,
1,-0.146679,,
2,0.605091,,-1.265928
3,1.435799,,0.659507
4,-1.852353,-1.42112,-0.565956
5,1.861511,1.660407,0.212956
6,-0.914062,-0.08798,-0.035469


In [23]:
df.dropna()

Unnamed: 0,0,1,2
4,-1.852353,-1.42112,-0.565956
5,1.861511,1.660407,0.212956
6,-0.914062,-0.08798,-0.035469


In [24]:
df.dropna(thresh=2) # Keeps only rows that have at least 2 non-NA values

Unnamed: 0,0,1,2
2,0.605091,,-1.265928
3,1.435799,,0.659507
4,-1.852353,-1.42112,-0.565956
5,1.861511,1.660407,0.212956
6,-0.914062,-0.08798,-0.035469


### Filling Missing Data

fillna()

In [25]:
df

Unnamed: 0,0,1,2
0,-0.82022,,
1,-0.146679,,
2,0.605091,,-1.265928
3,1.435799,,0.659507
4,-1.852353,-1.42112,-0.565956
5,1.861511,1.660407,0.212956
6,-0.914062,-0.08798,-0.035469


In [26]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.82022,0.0,0.0
1,-0.146679,0.0,0.0
2,0.605091,0.0,-1.265928
3,1.435799,0.0,0.659507
4,-1.852353,-1.42112,-0.565956
5,1.861511,1.660407,0.212956
6,-0.914062,-0.08798,-0.035469


In [27]:
df.fillna({1: 0.5, 2: 0}) # Fill NA in column 1 with 0.5 and in column 2 with 0

Unnamed: 0,0,1,2
0,-0.82022,0.5,0.0
1,-0.146679,0.5,0.0
2,0.605091,0.5,-1.265928
3,1.435799,0.5,0.659507
4,-1.852353,-1.42112,-0.565956
5,1.861511,1.660407,0.212956
6,-0.914062,-0.08798,-0.035469


In [28]:
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,0.732576,1.16007,0.335322
1,0.605153,0.272527,-1.945175
2,0.594569,,-0.197785
3,0.609251,,0.168507
4,-0.550128,,
5,-0.185741,,


In [29]:
# Forward-fill: carry the last valid observation down each column.
# fillna(method='ffill') is deprecated in recent pandas; ffill() is the
# direct equivalent and works on old and new versions alike.
df.ffill()

Unnamed: 0,0,1,2
0,0.732576,1.16007,0.335322
1,0.605153,0.272527,-1.945175
2,0.594569,0.272527,-0.197785
3,0.609251,0.272527,0.168507
4,-0.550128,0.272527,0.168507
5,-0.185741,0.272527,0.168507


In [30]:
# Forward-fill, but propagate each valid value across at most 2
# consecutive missing entries (replaces the deprecated
# fillna(method='ffill', limit=2) spelling).
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.732576,1.16007,0.335322
1,0.605153,0.272527,-1.945175
2,0.594569,0.272527,-0.197785
3,0.609251,0.272527,0.168507
4,-0.550128,,0.168507
5,-0.185741,,0.168507


In [31]:
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## Data Transformation

### 1. Removing Duplicates

duplicated(), drop_duplicates()

In [32]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [33]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [34]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [36]:
data['v1'] = range(7) # Add v1 column filled with values 0 - 6
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [37]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


duplicated and drop_duplicates by default keep the first observed value combination.
Passing keep='last' will return the last one

In [38]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### 2. Transforming Data Using a Function or Mapping

In [39]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [40]:
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [42]:
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [43]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### 3. Replacing Values

In [44]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [45]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [46]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [47]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [48]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### 4. Renaming Axis

In [58]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [52]:
def transform(x):
    """Return the first four characters of ``x``, upper-cased."""
    return x[:4].upper()


data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [59]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [55]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [60]:
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [61]:
# inplace=True renames the index label on data itself instead of
# returning a renamed copy, so the change is visible when data is shown below
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### 5. Discretization and Binning

In [62]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [63]:
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

###### cut() => Bin values into discrete intervals.

Use cut when you need to segment and sort data values into bins. This function is also useful for going from a continuous variable to a categorical variable.

Internally it contains a categories array specifying the distinct
category names along with a labeling for the ages data in the codes attribute

In [64]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [65]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [66]:
# Number of ages falling in each bin, most populated bin first.
# The top-level pd.value_counts() is deprecated; wrapping the Categorical
# in a Series and calling .value_counts() keeps the same count-sorted result.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [67]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [69]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names) # Assigning group names to bins

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

###### If you pass an integer number of bins to cut instead of explicit bin edges, it will compute equal-length bins based on the minimum and maximum values in the data.

In [70]:
data = np.random.rand(20)
pd.cut(data, 4, precision=2)

[(0.041, 0.28], (0.75, 0.98], (0.041, 0.28], (0.75, 0.98], (0.041, 0.28], ..., (0.041, 0.28], (0.28, 0.51], (0.041, 0.28], (0.28, 0.51], (0.75, 0.98]]
Length: 20
Categories (4, interval[float64]): [(0.041, 0.28] < (0.28, 0.51] < (0.51, 0.75] < (0.75, 0.98]]

##### qcut() => bins the data based on sample quantiles. Depending on the distribution of the data, using cut will not usually result in each bin having the same number of data points. Since qcut uses sample quantiles instead, by definition you will obtain roughly equal-size bins

In [71]:
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
cats

[(-0.678, 0.0161], (-3.553, -0.678], (-3.553, -0.678], (-0.678, 0.0161], (-0.678, 0.0161], ..., (0.716, 2.876], (0.0161, 0.716], (0.716, 2.876], (0.0161, 0.716], (0.0161, 0.716]]
Length: 1000
Categories (4, interval[float64]): [(-3.553, -0.678] < (-0.678, 0.0161] < (0.0161, 0.716] < (0.716, 2.876]]

In [72]:
# Observations per quartile bin (replaces the deprecated top-level
# pd.value_counts(); result is still sorted by count, descending).
pd.Series(cats).value_counts()

(0.716, 2.876]      250
(0.0161, 0.716]     250
(-0.678, 0.0161]    250
(-3.553, -0.678]    250
dtype: int64

In [73]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(-1.301, 0.0161], (-1.301, 0.0161], (-1.301, 0.0161], (-1.301, 0.0161], (-1.301, 0.0161], ..., (0.0161, 1.324], (0.0161, 1.324], (1.324, 2.876], (0.0161, 1.324], (0.0161, 1.324]]
Length: 1000
Categories (4, interval[float64]): [(-3.553, -1.301] < (-1.301, 0.0161] < (0.0161, 1.324] < (1.324, 2.876]]

In [75]:
# Observations per custom-quantile bin (replaces the deprecated top-level
# pd.value_counts(); result is still sorted by count, descending).
pd.Series(pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])).value_counts()

(0.0161, 1.324]     400
(-1.301, 0.0161]    400
(1.324, 2.876]      100
(-3.553, -1.301]    100
dtype: int64

### 6. Detecting and Filtering Outliers

In [90]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.025669,-0.008248,-0.010171,-0.032281
std,1.024333,1.027746,1.016287,0.999231
min,-3.339777,-3.013357,-3.751245,-3.011659
25%,-0.621404,-0.666743,-0.667329,-0.733172
50%,-0.009581,-0.009284,0.01655,-0.050103
75%,0.701485,0.669457,0.674834,0.682599
max,3.764847,3.080145,3.824436,2.97675


In [91]:
# In the column labelled 2, find all values whose absolute value exceeds 3
col = data[2]
col[np.abs(col) > 3]

12    -3.182508
106    3.824436
294   -3.057516
802   -3.751245
Name: 2, dtype: float64

In [96]:
# To select all rows having a value exceeding 3 or –3,
# you can use the any method on a
# boolean DataFrame
# any() -> Return whether any element is True, potentially over an axis.
# axis{0 or ‘index’, 1 or ‘columns’, None}, default 0
# axis is passed by keyword: pandas 2.x rejects the positional form any(1)
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
4,-0.027231,3.080145,-1.869086,-0.665262
12,0.671267,0.393647,-3.182508,0.751017
24,-3.135884,-2.860011,0.29198,-1.424511
106,-1.87236,0.936764,3.824436,1.017961
294,0.632313,-0.068452,-3.057516,2.136404
354,3.062224,-1.057773,0.601253,0.902287
399,3.031616,-2.914104,-0.417865,-1.222047
469,1.178973,-0.416887,-1.118398,-3.011659
693,3.764847,-1.249547,1.395939,0.995098
781,-3.339777,-1.191097,-0.284009,0.715537


In [84]:
# Setting values based on the criteria
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.000567,-0.006499,-0.02749,0.034511
std,1.01429,0.989089,0.989007,0.992705
min,-3.0,-3.0,-3.0,-3.0
25%,-0.677504,-0.685314,-0.693564,-0.660747
50%,0.020641,-0.001631,-0.075216,0.069413
75%,0.702423,0.6603,0.633756,0.705102
max,3.0,2.873886,3.0,3.0


The statement np.sign(data) produces 1 and –1 values based on whether the values
in data are positive or negative

In [85]:
# Rows still holding a value strictly beyond ±3 (axis given by keyword,
# since pandas 2.x rejects the positional form any(1))
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3


In [86]:
# Rows holding at least one value whose magnitude is 3 or more (axis given
# by keyword, since pandas 2.x rejects the positional form any(1))
data[(np.abs(data) >= 3).any(axis=1)]

Unnamed: 0,0,1,2,3
71,0.431576,0.109594,-3.0,-0.314405
89,-3.0,-0.67853,0.045164,1.112314
102,0.150408,-0.137839,-3.0,-0.428524
149,-0.163738,-0.220292,-3.0,0.272643
270,0.575194,-3.0,1.591793,-0.690046
296,-1.936641,-3.0,-0.103986,-0.12331
405,-0.296659,1.599311,-0.82316,3.0
423,-0.127465,-0.75313,1.569067,-3.0
484,0.658366,-0.552557,3.0,0.501649
535,2.195036,-3.0,1.17984,0.650094


In [87]:
data.head()

Unnamed: 0,0,1,2,3
0,-1.169203,0.739242,-0.476072,-0.553315
1,-0.650998,0.485911,-0.633149,0.097673
2,-1.301411,-0.592976,0.88921,-0.059069
3,-0.520799,-0.542223,-0.346549,-1.234744
4,1.385731,1.752408,-1.279156,1.298148


### 7. Permutations and Random Sampling

In [98]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


Permuting (randomly reordering) a Series or the rows in a DataFrame is easy to do
using the numpy.random.permutation function. Calling permutation with the length
of the axis you want to permute produces an array of integers indicating the new
ordering

In [99]:
sampler = np.random.permutation(5) # Can be used with iloc[]
sampler

array([2, 3, 4, 0, 1])

In [101]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3
1,4,5,6,7


In [102]:
df.take(sampler)

Unnamed: 0,0,1,2,3
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3
1,4,5,6,7


To select a random subset without replacement, you can use the sample method on
Series and DataFrame

In [103]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
2,8,9,10,11
0,0,1,2,3
4,16,17,18,19


To generate a sample with replacement (to allow repeat choices), pass replace=True
to sample

In [104]:
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws

1    7
4    4
0    5
1    7
2   -1
2   -1
2   -1
4    4
1    7
2   -1
dtype: int64

### 8. Computing Indicator/Dummy Variables

Another type of transformation for statistical modeling or machine learning applications
is converting a categorical variable into a “dummy” or “indicator” matrix. If a
column in a DataFrame has k distinct values, you would derive a matrix or DataFrame
with k columns containing all 1s and 0s. pandas has a get_dummies function
for doing this, though devising one yourself is not difficult

In [106]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [107]:
# One 0/1 indicator column per distinct value of 'key'.
# The dtype is pinned because newer pandas versions default to boolean
# (True/False) indicators rather than the 0/1 integers described here.
pd.get_dummies(df['key'], dtype=np.uint8)

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [108]:
# Indicator columns named key_<value>; dtype pinned so the result is 0/1
# integers regardless of the pandas version's default (bool in newer releases)
dummies = pd.get_dummies(df['key'], prefix='key', dtype=np.uint8)
df_with_dummy = df[['data1']].join(dummies) # Join two dfs
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [109]:
# Real Example

In [111]:
mnames = ['movie_id', 'title', 'genres']
# The '::' separator is longer than one character, which only the Python
# parsing engine supports. Requesting it explicitly avoids the ParserWarning
# pandas emits when it has to fall back from the C engine on its own.
movies = pd.read_table(r'datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')
movies[:10]

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [112]:
# Gather every genre label from every movie ('|' separates the genres of
# one movie), then reduce the list to its distinct values.
all_genres = []
for x in movies.genres:
    all_genres.extend(x.split('|'))
# pd.unique() on a plain Python list is deprecated in recent pandas, so hand
# it an object ndarray; the result is still an object array of unique labels.
genres = pd.unique(np.array(all_genres, dtype=object))

In [113]:
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [120]:
# One way to construct the indicator DataFrame is to start 
# with a DataFrame of all zeros
zero_matrix = np.zeros((len(movies), len(genres)))
zero_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [121]:
dummies = pd.DataFrame(zero_matrix, columns=genres)
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
# Now, iterate through each movie and set entries in each row of dummies to 1. 
# To do this, we use the dummies.columns to compute the column indices for 
# each genre
gen = movies.genres[0]
gen.split('|')
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2], dtype=int64)

In [116]:
# Then, we can use .iloc to set values based on these indices
for i, gen in enumerate(movies.genres):
    # Positions, within dummies.columns, of the genres listed for this movie
    indices = dummies.columns.get_indexer(gen.split('|'))
    # Flag those genre columns for row i
    dummies.iloc[i, indices] = 1

In [118]:
# Combine them with movies
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

In [119]:
# A useful recipe for statistical applications is to combine get_dummies
# with a discretization function like cut

np.random.seed(12345)  # fixed seed so the sampled values are reproducible
values = np.random.rand(10)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
# Bin the values, then emit one 0/1 indicator column per bin. The dtype is
# pinned because newer pandas versions default to boolean indicators.
pd.get_dummies(pd.cut(values, bins), dtype=np.uint8)

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


## String Manipulation

All of Python's built-in string methods can be used.
Regular expressions can be used as well.


### Vectorized String Functions in Pandas

In [122]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [123]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [124]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [126]:
import re
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [133]:
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [135]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object