# Lesson 2
* In this lesson we deepen our working knowledge by reviewing a number of fundamental operations, including:
    * Creating Series and DataFrames
    * Using the bracket operator on a DataFrame
    * Using and chaining accessor methods
    * Using logical accessors with boolean arrays
# Setup

In [1]:
import numpy as np
import pandas as pd

# Constructors
* Pandas Series and DataFrames have a wide variety of constructors
### Creating Series

In [2]:
# Series built from a plain list - pandas supplies a default 0..n-1 integer index
ser = pd.Series(data=[1, 2, 3, 4, 5])
ser

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [3]:
# Series built from a list, with our own labels supplied through index=
ser = pd.Series(data=[1, 2, 3, 4, 5], index=list('abcde'))
ser

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [4]:
# Series built from a dict - the keys become the index labels, the values the data
ser = pd.Series({
    'a': 1,
    'b': 2,
    'c': 3,
    'd': 4,
    'e': 5,
})
ser

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [5]:
# Accessing an item (notice the dictionary-like semantics) - one label returns a scalar
ser['a']

1

In [6]:
# Accessing multiple items - a list of labels returns a Series
ser[['a', 'b']]

a    1
b    2
dtype: int64

In [9]:
# A Series can carry a name, supplied through name= (shown in the output footer)
ser = pd.Series(data=[1, 2, 3, 4, 5], name='Sample')
ser

0    1
1    2
2    3
3    4
4    5
Name: Sample, dtype: int64

### Creating DataFrames

In [11]:
# An empty DataFrame - column labels only, no rows - can be a handy starting point
df = pd.DataFrame(columns=list('abc'))
df

Unnamed: 0,a,b,c


In [12]:
# DataFrames are collections of Series
# Note: a list of Series is stacked row-wise - each Series becomes one row
# labelled by its name, and the Series index supplies the column labels
ser_a = pd.Series(data=[1, 2, 3], name='a')
ser_b = pd.Series(data=[4, 5, 6], name='b')
ser_c = pd.Series(data=[7, 8, 9], name='c')
df = pd.DataFrame(data=[ser_a, ser_b, ser_c])
df

Unnamed: 0,0,1,2
a,1,2,3
b,4,5,6
c,7,8,9


In [14]:
# DataFrame built from a dictionary of lists - each key is a column label
# and its list holds that column's values
df = pd.DataFrame(data={
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
})
df

Unnamed: 0,a,b,c
0,1,4,7
1,2,5,8
2,3,6,9


In [17]:
# DataFrame built from a list of dictionaries - each dictionary is one row,
# with its keys giving the column labels
df = pd.DataFrame(data=[
    dict(a=1, b=4, c=7),
    dict(a=2, b=5, c=8),
    dict(a=3, b=6, c=9),
])
df

Unnamed: 0,a,b,c
0,1,4,7
1,2,5,8
2,3,6,9


In [19]:
# DataFrame built from a list of lists - each inner list is one row,
# and columns= supplies the column labels
list_of_lists = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
df = pd.DataFrame(data=list_of_lists, columns=list('abc'))
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


# Bracket Operator
* Bracket operators invoke the `__getitem__` method

In [20]:
# Bracket operator on a DataFrame with a single column label selects that column
# Note: a Series object is returned, named after the column
df['a']

0    1
1    4
2    7
Name: a, dtype: int64

In [21]:
# Brackets can also take a list of column labels - a DataFrame holding just
# those columns is returned
df[['b', 'c']]

Unnamed: 0,b,c
0,2,3
1,5,6
2,8,9


# Accessor Methods
* `.ix` has been deprecated (and removed as of pandas 1.0) - use `.loc` and `.iloc` instead
### loc
* loc is used to select rows by their index name

In [22]:
# Show the current DataFrame for reference
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [23]:
# Select the second row - note its label is 1
# Note: the row is returned as a Series whose name is that label (1)
df.loc[1]

a    4
b    5
c    6
Name: 1, dtype: int64

In [24]:
# Multiple rows - loc accepts slice syntax, but the slice is over labels:
# both endpoints are included, so 1:2 returns the rows labelled 1 and 2
df.loc[1:2]

Unnamed: 0,a,b,c
1,4,5,6
2,7,8,9


In [25]:
# If the DataFrame index is the default RangeIndex (0 .. len-1), labels and
# positions coincide, so loc and iloc return the same items
print(df.loc[1])
print(df.iloc[1])

a    4
b    5
c    6
Name: 1, dtype: int64
a    4
b    5
c    6
Name: 1, dtype: int64


In [26]:
# Set the index to some other numbering so labels no longer match positions
df.index = [10, 11, 12]
df

Unnamed: 0,a,b,c
10,1,2,3
11,4,5,6
12,7,8,9


In [27]:
# df.loc[1] will fail because 1 is no longer a label in the index,
# but iloc still selects the row at position 1 (its label is now 11)
df.iloc[1]

a    4
b    5
c    6
Name: 11, dtype: int64

In [29]:
# iloc returns a DataFrame when more than one row is selected
# Note: the second value of the slice is exclusive (positions 1 and 2 here)!
df.iloc[1:3]

Unnamed: 0,a,b,c
11,4,5,6
12,7,8,9


### at & iat
* Select one value

In [30]:
# Show the current DataFrame for reference
df

Unnamed: 0,a,b,c
10,1,2,3
11,4,5,6
12,7,8,9


In [32]:
# Select a single value using the row's index label and the column's name
df.at[10, 'a']

1

In [33]:
# Select a single value using the row's and the column's integer positions
df.iat[1, 1]

5

# Chaining
* We can chain brackets, loc, and iloc to pick both columns and rows

In [34]:
# Chained: loc selects the row labelled 10 as a Series, then brackets pick its 'a' entry
df.loc[10]['a']

1

In [35]:
# The same selection in a single loc call: row label first, then column label
df.loc[10, 'a']

1

In [36]:
# Unlike .at, loc can also select multiple rows and columns via lists of labels
df.loc[[10, 11], ['a', 'b']]

Unnamed: 0,a,b
10,1,2
11,4,5


In [38]:
# Select the row at position 1 (iloc is positional - that row's label is 11)
# and then its 'b' entry
df.iloc[1]['b']

5

In [39]:
# Unlike .iat, iloc can also select multiple rows and columns
# Here we take rows 1 & 2 and columns 1 & 2 (0-based positions, slice end excluded)
df.iloc[1:3, 1:3]

Unnamed: 0,b,c
11,5,6
12,8,9


In [40]:
# We can select everything along an axis using a bare :
# Here both axes are spelled out - all rows, all columns
df.loc[:, :]

Unnamed: 0,a,b,c
10,1,2,3
11,4,5,6
12,7,8,9


# Boolean Indexing
* We can use boolean arrays to select rows and columns

In [41]:
# Selecting items can be done using boolean masks - one flag per row and one
# per column; True keeps the row/column at that position
df.iloc[[True, False, True], [True, False, False]]

Unnamed: 0,a
10,1
12,7


In [42]:
# A comparison on a DataFrame is applied element-wise and yields a boolean DataFrame
df > 4

Unnamed: 0,a,b,c
10,False,False,False
11,False,True,True
12,True,True,True


In [43]:
# A boolean DataFrame can be used as the condition for selection
# Note: the shape is kept and a NaN is returned for every non-selected item
# (the surviving values are displayed as floats)
df[df > 4]

Unnamed: 0,a,b,c
10,,,
11,,5.0,6.0
12,7.0,8.0,9.0


In [45]:
# Boolean conditions can be chained, but you have to remember boolean algebra:
# each condition is parenthesised and the masks are combined with &
# Here we add a second, explicit boolean condition to find odd numbers greater than 4
df[(df > 4) & (df % 2 == 1)]

Unnamed: 0,a,b,c
10,,,
11,,5.0,
12,7.0,,9.0


# Query

In [46]:
# query() is a simple way to find rows in a DataFrame, using a string
# expression written over the column names
df.query('a > 3')

Unnamed: 0,a,b,c
11,4,5,6
12,7,8,9


In [47]:
# Conditions can be chained inside the expression, here with and
df.query('a > 3 and b < 7')

Unnamed: 0,a,b,c
11,4,5,6


In [48]:
# Referencing external variables - prefix the Python variable's name with @
val = 3
df.query('a > @val')

Unnamed: 0,a,b,c
11,4,5,6
12,7,8,9


In [49]:
# Alternatively, format the value into the expression text with an f-string
# (the expression passed to query is then the literal text 'a > 3')
val = 3
df.query(f'a > {val}')

Unnamed: 0,a,b,c
11,4,5,6
12,7,8,9


In [50]:
# Irregular column names? Wrap them in backticks inside the expression
# (shown here with the regular name a)
df.query('`a` > 3')

Unnamed: 0,a,b,c
11,4,5,6
12,7,8,9
