In [1]:
import pandas as pd

In [2]:
# Data Selection in Series

In [None]:
# As we saw in the previous section, a Series object acts in many ways like a one-dimensional NumPy array, 
# and in many ways like a standard Python dictionary. If we keep these two overlapping analogies in mind, 
# it will help us to understand the patterns of data indexing and selection in these arrays.

In [None]:
# Series as dictionary

In [2]:
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [4]:
data['b']

0.5

In [7]:
# We can also use dictionary-like Python expressions and methods to examine the keys/indices and values:
'a' in data

False

In [18]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [10]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [12]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [19]:
# Series objects can even be modified with a dictionary-like syntax. 
# Just as you can extend a dictionary by assigning to a new key, you can extend a Series by assigning to a new index value:
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [None]:
# Series as one-dimensional array

In [None]:
#A Series builds on this dictionary-like interface and provides array-style item selection via the same basic 
# mechanisms as NumPy arrays – that is, slices, masking, and fancy indexing. Examples of these are as follows:

In [20]:
# slicing by explicit index
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [21]:
# slicing by implicit integer index
data[0:2]

a    0.25
b    0.50
dtype: float64

In [26]:
# masking
data[(data > 0.25) & (data < 1.25)]

b    0.50
c    0.75
d    1.00
dtype: float64

In [27]:
# fancy indexing
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

In [34]:
# Among these, slicing may be the source of the most confusion. 
# Notice that when slicing with an explicit index (i.e., data['a':'c']), the final index is included in the slice, 
# while when slicing with an implicit index (i.e., data[0:2]), the final index is excluded from the slice.

In [None]:
# Indexers: loc, iloc, and ix

In [None]:
# These slicing and indexing conventions can be a source of confusion. 
# For example, if your Series has an explicit integer index, an indexing operation such as data[1] will use the 
# explicit indices, while a slicing operation like data[1:3] will use the implicit Python-style index.

In [28]:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [31]:
# explicit index when indexing
data[1]

'a'

In [32]:
# implicit index when slicing
data[1:3]

3    b
5    c
dtype: object

In [33]:
# Because of this potential confusion in the case of integer indexes, Pandas provides some special indexer attributes 
# that explicitly expose certain indexing schemes. These are not functional methods, but attributes that expose 
# a particular slicing interface to the data in the Series.

In [35]:
# First, the loc attribute allows indexing and slicing that always references the explicit index:
data.loc[1]

'a'

In [38]:
data.loc[1:3]

1    a
3    b
dtype: object

In [39]:
# The iloc attribute allows indexing and slicing that always references the implicit Python-style index:
data.iloc[1]

'b'

In [41]:
data.iloc[1:3]

3    b
5    c
dtype: object

In [None]:
# A third indexing attribute, ix, is a hybrid of the two, and for Series objects is equivalent to standard []-based indexing. 
# The purpose of the ix indexer will become more apparent in the context of DataFrame objects, which we will discuss shortly.

In [None]:
# One guiding principle of Python code is that "explicit is better than implicit." 
# The explicit nature of loc and iloc make them very useful in maintaining clean and readable code; 
# especially in the case of integer indexes, I recommend using these both to make code easier to read and understand, 
# and to prevent subtle bugs due to the mixed indexing/slicing convention.

In [None]:
# Data Selection in DataFrame

In [None]:
# Recall that a DataFrame acts in many ways like a two-dimensional or structured array, 
# and in other ways like a dictionary of Series structures sharing the same index. 
# These analogies can be helpful to keep in mind as we explore data selection within this structure.

In [None]:
# DataFrame as a dictionary

In [3]:
# let's begin with the area and population data from the previous notes:
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [5]:
# The individual Series that make up the columns of the DataFrame can be accessed via dictionary-style indexing of the 
# column name:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [7]:
# note that the dictionary style indexing returns a series. to return a dataframe, use the fancy indexing method:
data[['area']]

Unnamed: 0,area
California,423967
Texas,695662
New York,141297
Florida,170312
Illinois,149995


In [49]:
# Equivalently, we can use attribute-style access with column names that are strings:
data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [56]:
data.area is data['area']

True

In [57]:
# Though this is a useful shorthand, keep in mind that it does not work for all cases! 
# For example, if the column names are not strings, or if the column names conflict with methods of the DataFrame, 
# this attribute-style access is not possible. 
# For example, the DataFrame has a pop() method, so data.pop will point to this rather than the "pop" column:
data.pop is data['pop']

False

In [None]:
# In particular, you should avoid the temptation to try column assignment via attribute 
# (i.e., use data['pop'] = z rather than data.pop = z).

In [11]:
# Like with the Series objects discussed earlier, this dictionary-style syntax can also be used to modify the object, 
# in this case adding a new column:
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [None]:
# DataFrame as two-dimensional array

In [66]:
# We can examine the raw underlying data array using the values attribute:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [69]:
# With this picture in mind, many familiar array-like observations can be done on the DataFrame itself. 
# For example, we can transpose the full DataFrame to swap rows and columns:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [72]:
# When it comes to indexing of DataFrame objects, however, it is clear that the dictionary-style indexing of columns precludes 
# our ability to simply treat it as a NumPy array. 

In [73]:
# In particular, passing a single index to an array accesses a row:
data.values[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

In [74]:
# and passing a single "index" to a DataFrame accesses a column:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [77]:
# Thus for array-style indexing, we need another convention. 
# Here Pandas again uses the loc, iloc, and ix indexers mentioned earlier. Using the iloc indexer, 
# we can index the underlying array as if it is a simple NumPy array (using the implicit Python-style index), 
# but the DataFrame index and column labels are maintained in the result:
data.iloc[:3, :2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [78]:
# Similarly, using the loc indexer we can index the underlying data in an array-like style 
# but using the explicit index and column names:
data.loc[:'Illinois', :'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [82]:
# The ix indexer allows a hybrid of these two approaches:
data.ix[:3, :'pop']
# EDIT: the ix indexer was deprecated and has been removed as from version 0.2

AttributeError: 'DataFrame' object has no attribute 'ix'

In [None]:
# Keep in mind that for integer indices, the ix indexer is subject to the same potential sources of confusion as discussed 
# for integer-indexed Series objects.

In [15]:
# Any of the familiar NumPy-style data access patterns can be used within these indexers. 
# For example, in the loc indexer we can combine masking and fancy indexing as in the following:

data.loc[data.density > 100, ['pop', 'density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [87]:
# Any of these indexing conventions may also be used to set or modify values; this is done in the standard way that you 
# might be accustomed to from working with NumPy:
data.iloc[0, 2] = 92.3
data

Unnamed: 0,area,pop,density
California,423967,38332521,92.3
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [None]:
# To build up your fluency in Pandas data manipulation, I suggest spending some time with a simple DataFrame and exploring 
# the types of indexing, slicing, masking, and fancy indexing that are allowed by these various indexing approaches.

In [None]:
# Additional indexing conventions

In [94]:
# There are a couple extra indexing conventions that might seem at odds with the preceding discussion, 
# but nevertheless can be very useful in practice. First, while indexing refers to columns, slicing refers to rows:
data['Florida':'Illinois']

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [95]:
# Such slices can also refer to rows by number rather than by index:
data[1:3]

Unnamed: 0,area,pop,density
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746


In [17]:
# Similarly, direct masking operations are also interpreted row-wise rather than column-wise:
data[data.density > 100]

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
