# Setup

In [32]:
"""
Python Data Science Handbook Study Guide
Chapter 3 - Manipulation with Pandas

Author: Nigel Deen
"""

# import the libraries
import pandas as pd
import numpy as np


# Series

In [33]:
# check the version
pd.__version__


'1.3.4'

In [34]:
# pandas series is a 1D array of indexed data
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data


0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [35]:
# pandas series have 'values'
data.values


array([0.25, 0.5 , 0.75, 1.  ])

In [36]:
# and 'index' attributes (type 'pd.Index')
data.index


RangeIndex(start=0, stop=4, step=1)

In [37]:
# you can use the index to slice and access parts of the pandas series
data[1]


0.5

In [38]:
data[1:3]


1    0.50
2    0.75
dtype: float64

The main difference between a NumPy array and a Pandas series object is the index.
* NumPy arrays have **implicitly defined** integer indices.
* Pandas series have **explicitly defined** indices.

In [39]:
# pd.Series(data=data, index=index)
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=["a", "b", "c", "d"])
data


a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [40]:
data["b"]


0.5

In [41]:
# note, you can also use noncontiguous or nonsequential indices
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[3, 5, 6, 1])
data


3    0.25
5    0.50
6    0.75
1    1.00
dtype: float64

In [42]:
data[1]


1.0

In [43]:
test = pd.Series(["Frog", 1, True])


In [44]:
# you can think of series (typed keys) as a specialized version of the dictionary (arbitrary keys)
population_dict = {
    "California": 120,
    "Texas": 253,
    "New York": 196,
    "Florida": 102,
    "Illinois": 100,
}

population = pd.Series(population_dict)
population


California    120
Texas         253
New York      196
Florida       102
Illinois      100
dtype: int64

In [45]:
# the index is drawn from sorted keys
population["California"]


120

In [46]:
population["California":"New York"]


California    120
Texas         253
New York      196
dtype: int64

# DataFrames

In [47]:
area_dict = {
    "California": 420,
    "Texas": 120,
    "New York": 1209,
    "Florida": 1092,
    "Illinois": 1092,
}
area_dict


{'California': 420,
 'Texas': 120,
 'New York': 1209,
 'Florida': 1092,
 'Illinois': 1092}

In [48]:
area = pd.Series(area_dict)
area

California     420
Texas          120
New York      1209
Florida       1092
Illinois      1092
dtype: int64

In [49]:
# let's make a dataframe from two series
states = pd.DataFrame({"population": population, "area": area})
states

Unnamed: 0,population,area
California,120,420
Texas,253,120
New York,196,1209
Florida,102,1092
Illinois,100,1092


In [50]:
# dataframe object has an index 
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [51]:
# it also has a columns attribute for column labels
states.columns

Index(['population', 'area'], dtype='object')

In [52]:
# a dataframe maps a column name to a series object of column data
states['area']

California     420
Texas          120
New York      1209
Florida       1092
Illinois      1092
Name: area, dtype: int64

In [53]:
# dataframes can be constructed from a list of dictionaries
data = [{'a': i, 'b': 2 * i} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [54]:
# if some keys in the dictionary are missing, pandas will fill with NaN
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}]) # each item represents an observation

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


# Indices

In [55]:
# index can be thought of as an immutable array or as an ordered set
ind = pd.Index([2,3,4,5])
ind

Int64Index([2, 3, 4, 5], dtype='int64')

In [56]:
ind[1]

3

In [57]:
ind[::2]

Int64Index([2, 4], dtype='int64')

In [58]:
# indices have attributes similar to NumPy arrays
print(ind.size, ind.shape, ind.ndim, ind.dtype)


4 (4,) 1 int64


In [60]:
# index values cannot be modified by assignment (immutable)
# ind[1] = 12

In [61]:
# index as an ordered set 
indA = pd.Index([3,5,7,9])
indB = pd.Index([2,3,5,7,11])


In [62]:
# intersection of two indicies
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [63]:
# union of two indices
indA.union(indB)

Int64Index([2, 3, 5, 7, 9, 11], dtype='int64')

In [64]:
# symmetric difference of indices
indA.symmetric_difference(indB)

Int64Index([2, 9, 11], dtype='int64')

# Indexing and Selection

## Data Selection in Series

### Series as dictionary

In [65]:
# map a collection of keys (index) to values
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=["a", "b", "c", "d"])
data


a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [66]:
# access values through keys
data["b"]

0.5

In [67]:
# check if something exists
'a' in data

True

In [68]:
# look at the keys
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [69]:
# list the key-value pairs
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [70]:
# modify series objects
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

### Series as a 1D array

In [71]:
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [72]:
# slice by explicit index
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [73]:
# slice by implicit integer index
data[:2]

a    0.25
b    0.50
dtype: float64

**NOTE** 

When you are slicing with an ***explicit*** index (data['a':'c']), the final index is ***included***.

When you are slicing with an ***implicit*** index (data[0:4]), the final index is ***excluded***.

In [74]:
# masking
data[(data > 0.3) % (data < 0.8)]

a    0.25
a    0.25
a    0.25
a    0.25
a    0.25
dtype: float64

In [75]:
# fancy indexing
data[['a','e']]

a    0.25
e    1.25
dtype: float64

### Indexers: loc, iloc, and ix

In [76]:
data = pd.Series(['a','b','c'], index=[1,3,5])
data

1    a
3    b
5    c
dtype: object

In [77]:
# explicit index when indexing
data[1]

'a'

In [78]:
# implicit index when slicing
data[1:3]

3    b
5    c
dtype: object

Because of this confusion, Pandas provides special indexer attributes that exposes a particular slicing interface to the data in the Series.

In [79]:
# loc - explicit index
data.loc[1]

'a'

In [80]:
# loc - explicit index
data.loc[1:3]

1    a
3    b
dtype: object

In [81]:
# iloc - implicit integer index
data.iloc[1]

'b'

In [82]:
# iloc - implicit integer index
data.iloc[1:3]

3    b
5    c
dtype: object

## Data Selection in DataFrames

### DataFrame as dictionary

In [83]:
area = pd.Series(
    {
        "California": 423967,
        "Texas": 695662,
        "New York": 141297,
        "Florida": 170312,
        "Illinois": 149995,
    }
)


In [84]:
pop = pd.Series(
    {
        "California": 38332521,
        "Texas": 26448193,
        "New York": 19651127,
        "Florida": 19552860,
        "Illinois": 12882135
    }
)


In [85]:
data = pd.DataFrame({"area": area, "pop": pop})
data


Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [86]:
# access individual series dictionary style
data["area"]

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [87]:
# access individual series attribute style
data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [88]:
# modify the object
data["density"] = data["pop"] / data["area"]
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


### DataFrame as 2D array

In [91]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [89]:
# look at the underlying data
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [90]:
# transpose the data
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [92]:
# index rows and columns by implicit integer index (exclusive)
data.iloc[:3, :2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [93]:
# index rows and columns by explicit index (inclusive)
data.loc[:'Illinois', :'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [None]:
# hybrid approach - however, this be confusing with integer indices
data.ix[:3, :'pop']