In [1]:
# Notebook setup: imports plus display / RNG configuration.
import numpy as np
import pandas as pd
# Remember the row limit that was active before it is overridden below
# (no code restoring it is visible in this part of the notebook).
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 10  # truncate long frames in rendered output
np.random.seed(12345)  # seed NumPy's global RNG so the random examples repeat
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))  # default figure size
np.set_printoptions(precision=4, suppress=True)  # 4 decimals, no scientific notation for small floats

In [15]:
# Small frame with missing values, used to demonstrate dropna / fillna.
NA = np.nan
data = pd.DataFrame(
    [
        [1.0, 6.5, NA],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, NA],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,
1,1.0,,
2,,,
3,,6.5,


In [16]:
# Default behaviour spelled out: drop every row that contains any NA.
data.dropna(axis=0, how="any")

Unnamed: 0,0,1,2


In [17]:
# Drop only the rows in which every value is NA.
data.dropna(axis="index", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,
1,1.0,,
3,,6.5,


In [18]:
# Drop only the columns in which every value is NA.
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1
0,1.0,6.5
1,1.0,
2,,
3,,6.5


In [20]:
data.dropna(thresh = 2) # Keep only rows that have at least 2 non-NA values

Unnamed: 0,0,1,2
0,1.0,6.5,


In [21]:
# Replace every NA with the constant 0.
data.fillna(value=0)

Unnamed: 0,0,1,2
0,1.0,6.5,0.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,0.0


In [23]:
# Forward fill: each NA takes the last observed value above it in its column.
data.ffill(axis=0)

Unnamed: 0,0,1,2
0,1.0,6.5,
1,1.0,6.5,
2,1.0,6.5,
3,1.0,6.5,


In [30]:
# Rows whose column-0 value already appeared in an earlier row.
data.loc[data[0].duplicated(keep="first")]

Unnamed: 0,0,1,2
1,1.0,,
3,,6.5,


In [31]:
# De-duplicate on column 0, keeping the first row of each group.
data.drop_duplicates(subset=[0])

Unnamed: 0,0,1,2
0,1.0,6.5,
2,,,


In [32]:
# De-duplicate on column 0, keeping the last row of each group.
data.drop_duplicates(subset=[0], keep="last")

Unnamed: 0,0,1,2
1,1.0,,
3,,6.5,


In [35]:
# Meat purchases; the food labels mix upper and lower case.
foods = ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
         'Bacon', 'pastrami', 'honey ham', 'nova lox']
ounces = [4, 3, 12, 6, 7.5, 8, 3, 5, 6]
data = pd.DataFrame({'food': foods, 'ounces': ounces})

# Lookup from lower-case meat name to source animal.
meat_to_animal = {
    'bacon': 'pig', 'pulled pork': 'pig',
    'pastrami': 'cow', 'corned beef': 'cow',
    'honey ham': 'pig', 'nova lox': 'salmon',
}

# Normalise case so the labels line up with the lookup keys.
lowercased = data['food'].map(str.lower)

data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [36]:
# Translate each lower-cased food label to its animal through the dict lookup.
animal_by_row = lowercased.map(meat_to_animal)
data['animal'] = animal_by_row
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [41]:
# Same mapping done with a function instead of passing the dict to map().
# dict.get leaves an unmapped food as NaN (matching what Series.map does with
# a dict) instead of raising KeyError.
a = data['food'].map(lambda food: meat_to_animal.get(food.lower(), np.nan))
data["new_animal"] = a
data

Unnamed: 0,food,ounces,animal,new_animal
0,bacon,4.0,pig,pig
1,pulled pork,3.0,pig,pig
2,bacon,12.0,pig,pig
3,Pastrami,6.0,cow,cow
4,corned beef,7.5,cow,cow
5,Bacon,8.0,pig,pig
6,pastrami,3.0,cow,cow
7,honey ham,5.0,pig,pig
8,nova lox,6.0,salmon,salmon


In [42]:
# Series in which -999 and -1000 act as sentinel values.
sentinel_values = [1.0, -999.0, 2.0, -999.0, -1000.0, 3.0]
data = pd.Series(sentinel_values)
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [43]:
# Per-value substitution: -999 becomes NaN, -1000 becomes 0.
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [53]:
# 3x4 integer frame labelled by state (rows) and number words (columns).
state_rows = ['Ohio', 'Colorado', 'New York']
number_cols = ['one', 'two', 'three', 'four']
data = pd.DataFrame(np.arange(12).reshape(3, 4),
                    index=state_rows, columns=number_cols)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [57]:
# Abbreviate row labels to their first four characters, upper-cased, and
# upper-case the column labels as well. The column step was previously missing:
# the displayed output and the later rename of 'THREE' both rely on upper-case
# columns, which only existed through leftover kernel state.
# rename() returns a new frame, so re-running this cell is harmless.
data = data.rename(index=lambda label: label[:4].upper(), columns=str.upper)
data

Unnamed: 0,ONE,TWO,THREE,FOUR
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [60]:
# Rename one row label and one column label. rename() returns a new frame, so
# rebind `data` rather than mutating it with inplace=True.
# Labels that are not present in the frame are silently left unchanged.
data = data.rename(index={'OHIO': 'INDIANA'},
                   columns={'THREE': 'peekaboo'})
data

Unnamed: 0,ONE,TWO,peekaboo,FOUR
Indiana,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [61]:
# Title-case the row labels and upper-case the column labels. Rebinding the
# result avoids inplace=True and keeps the cell safe to re-run.
data = data.rename(index=str.title, columns=str.upper)
data

Unnamed: 0,ONE,TWO,PEEKABOO,FOUR
Indiana,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [62]:
# Ages to be bucketed into groups with pd.cut.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [128]:
# Bucket the ages into labelled groups. Bin edges are right-inclusive by
# default, so an age of 25 falls in the first group.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
bins = [18, 25, 35, 60, 100]
cats_age = pd.cut(x=ages, bins=bins, labels=group_names)
cats_age

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [129]:
# Integer code per observation, then the category labels those codes index into.
print(cats_age.codes, cats_age.categories, sep="\n")

[0 0 0 1 0 0 2 1 3 2 2 1]
Index(['Youth', 'YoungAdult', 'MiddleAged', 'Senior'], dtype='object')


In [130]:
# Observations per age group. The top-level pd.value_counts() is deprecated;
# wrapping in a Series and calling .value_counts() gives the same result.
pd.Series(cats_age).value_counts()

Youth         5
YoungAdult    3
MiddleAged    3
Senior        1
Name: count, dtype: int64

In [135]:
# One indicator column per age group.
dummy_age = pd.get_dummies(data=cats_age)
dummy_age

Unnamed: 0,Youth,YoungAdult,MiddleAged,Senior
0,True,False,False,False
1,True,False,False,False
2,True,False,False,False
3,False,True,False,False
4,True,False,False,False
...,...,...,...,...
7,False,True,False,False
8,False,False,False,True
9,False,False,True,False
10,False,False,True,False


In [131]:
# Passing an integer asks pd.cut for that many equal-length bins over the data
# range; precision=2 limits the bin edges to two decimal digits.
data = np.random.rand(20)
cats = pd.cut(data, bins=4, precision=2)

In [132]:
# Observations per bin. The bins are equal-width: each covers a quarter of the
# observed data range. pd.value_counts() is deprecated, so count via a Series.
pd.Series(cats).value_counts()

(0.76, 0.97]    6
(0.15, 0.36]    5
(0.56, 0.76]    5
(0.36, 0.56]    4
Name: count, dtype: int64

In [76]:
# 1000 draws from a standard normal, split at the sample quartiles so that each
# bin receives the same number of observations.
data = np.random.randn(1000)
cats = pd.qcut(data, q=4)
cats

[(-0.0171, 0.624], (0.624, 3.928], (-2.9499999999999997, -0.691], (-2.9499999999999997, -0.691], (-2.9499999999999997, -0.691], ..., (-0.0171, 0.624], (-0.0171, 0.624], (-2.9499999999999997, -0.691], (0.624, 3.928], (0.624, 3.928]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.9499999999999997, -0.691] < (-0.691, -0.0171] < (-0.0171, 0.624] < (0.624, 3.928]]

In [77]:
# Observations per quartile bin. pd.value_counts() is deprecated; the Series
# method returns the same counts.
pd.Series(cats).value_counts()

(-2.9499999999999997, -0.691]    250
(-0.691, -0.0171]                250
(-0.0171, 0.624]                 250
(0.624, 3.928]                   250
Name: count, dtype: int64

In [82]:
# Re-seed so the outlier examples that follow are reproducible, then draw a
# 1000x4 frame of standard-normal values.
np.random.seed(12345)
data = pd.DataFrame(np.random.standard_normal(size=(1000, 4)))
data

Unnamed: 0,0,1,2,3
0,-0.204708,0.478943,-0.519439,-0.555730
1,1.965781,1.393406,0.092908,0.281746
2,0.769023,1.246435,1.007189,-1.296221
3,0.274992,0.228913,1.352917,0.886429
4,-2.001637,-0.371843,1.669025,-0.438570
...,...,...,...,...
995,1.089085,0.251232,-1.451985,1.653126
996,-0.478509,-0.010663,-1.060881,-1.502870
997,-1.946267,1.013592,0.037333,0.133304
998,-1.293122,-0.322542,-0.782960,-0.303340


In [83]:
# Values in column 2 whose absolute value exceeds 3.
col2 = data[2]
col2[col2.abs() > 3]

5      3.248944
102    3.176873
324    3.260383
499   -3.056990
586   -3.184377
Name: 2, dtype: float64

In [85]:
# Rows in which at least one value lies outside [-3, 3].
data[(data.abs() > 3).any(axis="columns")]

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.565230,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
...,...,...,...,...
499,-0.293333,-0.242459,-3.056990,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


In [86]:
# Rows in which every value lies outside [-3, 3].
data[(data.abs() > 3).all(axis="columns")]

Unnamed: 0,0,1,2,3


In [87]:
# Cap values outside the interval -3 to 3: each outlier is replaced by 3 with
# its original sign.
outliers = data.abs() > 3
data[outliers] = np.sign(data) * 3

In [88]:
# np.sign(data) yields 1 for positive entries and -1 for negative ones
# (and 0 for exact zeros).
np.sign(data).head(n=5)

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,-1.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,-1.0
3,1.0,1.0,1.0,1.0
4,-1.0,-1.0,1.0,-1.0


In [91]:
# 5x4 frame of consecutive integers plus a random ordering of its row positions.
n_rows, n_cols = 5, 4
df = pd.DataFrame(np.arange(n_rows * n_cols).reshape(n_rows, n_cols))
sampler = np.random.permutation(n_rows)
sampler

array([1, 3, 4, 0, 2])

In [92]:
# Show the frame in its original row order.
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [94]:
# Reorder the rows by the positions held in sampler.
df.take(sampler, axis=0)

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3
2,8,9,10,11


In [95]:
# Same reordering expressed through positional indexing.
df.iloc[sampler]

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3
2,8,9,10,11


In [97]:
# Random subset of 3 rows, drawn without replacement (the default).
df.sample(n=3)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
4,16,17,18,19


In [98]:
# Categorical key column alongside a running integer column.
keys = list('bbacab')
df = pd.DataFrame({'key': keys, 'data1': np.arange(len(keys))})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [104]:
# One indicator column per distinct key, with names prefixed by "key_".
dummies = pd.get_dummies(df.loc[:, "key"], prefix="key")
dummies

Unnamed: 0,key_a,key_b,key_c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False


In [105]:
# Keep data1 as a one-column frame and attach the indicator columns by index.
df_with_dummy = df.loc[:, ['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,False,True,False
1,1,False,True,False
2,2,True,False,False
3,3,False,False,True
4,4,True,False,False
5,5,False,True,False


In [107]:
# Load the MovieLens movie table: '::'-delimited, no header row, ISO-8859-1 text.
mnames = ['movie_id', 'title', 'genres']
# A multi-character separator is only supported by the python parser engine.
# Requesting it explicitly avoids the ParserWarning pandas emits when it has to
# fall back from the C engine.
movies = pd.read_table('../data/movielens/movies.dat', sep='::', encoding='ISO-8859-1',
                       header=None, names=mnames, engine='python')
print(movies.shape)
movies[:10]

  movies = pd.read_table('../data/movielens/movies.dat', sep='::', encoding='ISO-8859-1',


(3883, 3)


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [111]:
# Flatten every movie's '|'-separated genre string into one list of labels.
all_genres = [label for entry in movies.genres for label in entry.split('|')]

# Distinct genres in order of first appearance. pd.unique is handed an ndarray
# because passing a plain Python list is deprecated in recent pandas releases.
genres = pd.unique(np.array(all_genres, dtype=object))

In [112]:
# Distinct genre labels collected from movies.genres.
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [114]:
# All-zero indicator frame: one row per movie, one column per genre.
n_movies, n_genres = len(movies), len(genres)
zero_matrix = np.zeros((n_movies, n_genres))
dummies = pd.DataFrame(zero_matrix, columns=genres)
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [117]:
# Genre string of the first movie, split into its individual labels.
gen = movies["genres"][0]
gen.split(sep='|')

['Animation', "Children's", 'Comedy']

In [118]:
# IPython help lookup for Index.get_indexer (interactive aid, not part of the analysis).
?pd.Index.get_indexer

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mIndex[0m[0;34m.[0m[0mget_indexer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mself[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtarget[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmethod[0m[0;34m:[0m [0;34m'str_t | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlimit[0m[0;34m:[0m [0;34m'int | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtolerance[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'npt.NDArray[np.intp]'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Compute indexer and mask for new index given the current index.

The indexer should be then used as an input to ndarray.take to align the
current data to the new index.

Parameters
----------
target : Index
method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional
    * default: exact matches only.
    * pad / ffill: find the PR

In [124]:
# For each movie, find the column positions of its genre labels and set those
# cells in its row to 1.
for row_pos, genre_field in enumerate(movies["genres"]):
    col_positions = dummies.columns.get_indexer(genre_field.split("|"))
    dummies.iloc[row_pos, col_positions] = 1

dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [126]:
# Prefix the indicator columns with "Genre_" and attach them to the movie table
# by index, then inspect the first movie.
prefixed_dummies = dummies.add_prefix('Genre_')
movies_windic = movies.join(prefixed_dummies)
movies_windic.iloc[0]

movie_id                                      1
title                          Toy Story (1995)
genres              Animation|Children's|Comedy
Genre_Animation                             1.0
Genre_Children's                            1.0
                               ...             
Genre_War                                   0.0
Genre_Musical                               0.0
Genre_Mystery                               0.0
Genre_Film-Noir                             0.0
Genre_Western                               0.0
Name: 0, Length: 21, dtype: object