In [1]:
#Discretization and Binning

import pandas as pd
import numpy as np 
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [2]:
cats.codes


array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [3]:
cats.categories


IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [4]:
pd.value_counts(cats)

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [5]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [6]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [7]:
data = np.random.rand(20)
pd.cut(data, 4, precision=2)    

[(0.05, 0.28], (0.05, 0.28], (0.74, 0.97], (0.51, 0.74], (0.05, 0.28], ..., (0.51, 0.74], (0.05, 0.28], (0.74, 0.97], (0.05, 0.28], (0.74, 0.97]]
Length: 20
Categories (4, interval[float64, right]): [(0.05, 0.28] < (0.28, 0.51] < (0.51, 0.74] < (0.74, 0.97]]

In [8]:
data = np.random.randn(1000) # Normally distributed
cats = pd.qcut(data, 4) # Cut into quartiles
cats

[(0.663, 3.433], (-0.585, 0.0925], (0.663, 3.433], (-0.585, 0.0925], (0.663, 3.433], ..., (0.0925, 0.663], (0.663, 3.433], (0.663, 3.433], (-0.585, 0.0925], (0.663, 3.433]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.9419999999999997, -0.585] < (-0.585, 0.0925] < (0.0925, 0.663] < (0.663, 3.433]]

In [9]:
pd.value_counts(cats)


(-2.9419999999999997, -0.585]    250
(-0.585, 0.0925]                 250
(0.0925, 0.663]                  250
(0.663, 3.433]                   250
dtype: int64

In [10]:
 pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(1.149, 3.433], (-1.202, 0.0925], (1.149, 3.433], (-1.202, 0.0925], (0.0925, 1.149], ..., (0.0925, 1.149], (0.0925, 1.149], (0.0925, 1.149], (-1.202, 0.0925], (1.149, 3.433]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.9419999999999997, -1.202] < (-1.202, 0.0925] < (0.0925, 1.149] < (1.149, 3.433]]

In [11]:
#Detecting and Filtering Outliers

data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
    

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.003969,0.007114,0.007126,-0.034339
std,0.995189,0.993916,1.073593,0.973401
min,-3.542798,-2.988488,-3.267778,-2.977303
25%,-0.697899,-0.674655,-0.71117,-0.683199
50%,0.058551,-0.039955,0.029689,-0.041912
75%,0.6963,0.722444,0.739216,0.61876
max,2.857698,3.304442,3.185261,2.994982


In [12]:
col = data[2]
col[np.abs(col) > 3]


37     3.185261
46     3.181765
131    3.116926
804   -3.267778
Name: 2, dtype: float64

In [13]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
37,-0.406002,0.711512,3.185261,-0.554913
46,-0.548756,0.112043,3.181765,-0.624179
131,-0.496762,0.457059,3.116926,0.196264
423,2.150817,3.304442,0.171775,-0.555612
481,-3.203305,1.391415,-0.0192,-0.288135
804,-0.631854,-0.081769,-3.267778,1.233406
993,-3.542798,-1.058203,-0.525558,-0.262642


In [14]:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()


Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.004715,0.006809,0.00691,-0.034339
std,0.992763,0.992951,1.071418,0.973401
min,-3.0,-2.988488,-3.0,-2.977303
25%,-0.697899,-0.674655,-0.71117,-0.683199
50%,0.058551,-0.039955,0.029689,-0.041912
75%,0.6963,0.722444,0.739216,0.61876
max,2.857698,3.0,3.0,2.994982


In [15]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,-1.0,-1.0
1,1.0,-1.0,1.0,-1.0
2,-1.0,1.0,1.0,1.0
3,-1.0,1.0,-1.0,-1.0
4,1.0,1.0,1.0,1.0


In [16]:
#Permutation and Random Sampling

df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
sampler

array([4, 0, 3, 2, 1])

In [17]:
df.take(sampler)

Unnamed: 0,0,1,2,3
4,16,17,18,19
0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
1,4,5,6,7


In [18]:
df.sample(n=3)


Unnamed: 0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
0,0,1,2,3


In [20]:
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws

2   -1
4    4
1    7
1    7
4    4
4    4
0    5
0    5
4    4
1    7
dtype: int64

In [21]:
# Computing Indicator/Dummy Variables

df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
     'data1': range(6)})
pd.get_dummies(df['key'])


Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [22]:
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [23]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('datasets/movielens/movies.dat', sep='::',
header=None, names=mnames)
movies[:10]

  movies = pd.read_table('datasets/movielens/movies.dat', sep='::',


FileNotFoundError: [Errno 2] No such file or directory: 'datasets/movielens/movies.dat'

In [24]:
all_genres = []
for x in movies.genres:
    all_genres.extend(x.split('|'))
genres = pd.unique(all_genres)
genres


NameError: name 'movies' is not defined

In [25]:
#7.3 String Manipulation
#String Object Methods

val = 'a,b, guido'
val.split(',')

['a', 'b', ' guido']

In [26]:
pieces = [x.strip() for x in val.split(',')]  #split is often combined with strip to trim whitespace
pieces

['a', 'b', 'guido']

In [27]:
first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

In [28]:
'::'.join(pieces)


'a::b::guido'

In [29]:
'guido' in val

True

In [30]:
val.index(',')

1

In [31]:
val.find(':')


-1

In [32]:
val.count(',')

2

In [33]:
val.replace(',', '::')


'a::b:: guido'

In [34]:
val.replace(',', '')

'ab guido'

In [35]:
# Regular Expressions

import re
text = "foo bar\t baz \tqux"
re.split('\s+', text)


['foo', 'bar', 'baz', 'qux']

In [36]:
regex = re.compile('\s+')
regex.split(text)


['foo', 'bar', 'baz', 'qux']

In [37]:
regex.findall(text)  # to get a list of all patterns matching the regex


[' ', '\t ', ' \t']

In [38]:
# 7.4 Vectorized String Functions in pandas

data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
     'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data


Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [39]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [40]:
data.str.contains('gmail')


Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [46]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
#regex = re.compile(pattern, flags=re.IGNORECASE)
#m = regex.match('wesm@bright.net')
#m.groups()


In [47]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [48]:
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [49]:
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [50]:
matches.str.get(1)


AttributeError: Can only use .str accessor with string values!

In [51]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object