In [41]:
# import the library
import pandas as pd
import numpy as np


# Series

In [3]:
# check the version
pd.__version__


'1.3.4'

In [5]:
# pandas series is a 1D array of indexed data
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data


0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [8]:
# pandas series have 'values'
data.values


array([0.25, 0.5 , 0.75, 1.  ])

In [9]:
# and 'index' attributes (type 'pd.Index')
data.index


RangeIndex(start=0, stop=4, step=1)

In [11]:
# you can use the index to slice and access parts of the pandas series
data[1]


0.5

In [12]:
data[1:3]


1    0.50
2    0.75
dtype: float64

The main difference between a NumPy array and a Pandas series object is the index.
* NumPy arrays have **implicitly defined** integer indices.
* Pandas series have **explicitly defined** indices.

In [14]:
# pd.Series(data=data, index=index)
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=["a", "b", "c", "d"])
data


a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [15]:
data["b"]


0.5

In [16]:
# note, you can also use noncontiguous or nonsequential indices
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[3, 5, 6, 1])
data


3    0.25
5    0.50
6    0.75
1    1.00
dtype: float64

In [17]:
data[1]


1.0

In [20]:
test = pd.Series(["Frog", 1, True])


In [23]:
# you can think of series (typed keys) as a specialized version of the dictionary (arbitrary keys)
population_dict = {
    "California": 120,
    "Texas": 253,
    "New York": 196,
    "Florida": 102,
    "Illinois": 100,
}

population = pd.Series(population_dict)
population


California    120
Texas         253
New York      196
Florida       102
Illinois      100
dtype: int64

In [24]:
# the index is drawn from sorted keys
population["California"]


120

In [25]:
population["California":"New York"]


California    120
Texas         253
New York      196
dtype: int64

# DataFrames

In [28]:
area_dict = {
    "California": 420,
    "Texas": 120,
    "New York": 1209,
    "Florida": 1092,
    "Illinois": 1092,
}
area_dict


{'California': 420,
 'Texas': 120,
 'New York': 1209,
 'Florida': 1092,
 'Illinois': 1092}

In [29]:
area = pd.Series(area_dict)
area

California     420
Texas          120
New York      1209
Florida       1092
Illinois      1092
dtype: int64

In [31]:
# let's make a dataframe from two series
states = pd.DataFrame({"population": population, "area": area})
states

Unnamed: 0,population,area
California,120,420
Texas,253,120
New York,196,1209
Florida,102,1092
Illinois,100,1092


In [35]:
# dataframe object has an index 
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [36]:
# it also has a columns attribute for column labels
states.columns

Index(['population', 'area'], dtype='object')

In [37]:
# a dataframe maps a column name to a series object of column data
states['area']

California     420
Texas          120
New York      1209
Florida       1092
Illinois      1092
Name: area, dtype: int64

In [38]:
# dataframes can be constructed from a list of dictionaries
data = [{'a': i, 'b': 2 * i} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [40]:
# if some keys in the dictionary are missing, pandas will fill with NaN
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}]) # each item represents an observation

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


# Indices

In [42]:
# index can be thought of as an immutable array or as an ordered set
ind = pd.Index([2,3,4,5])
ind

Int64Index([2, 3, 4, 5], dtype='int64')

In [43]:
ind[1]

3

In [44]:
ind[::2]

Int64Index([2, 4], dtype='int64')

In [45]:
# indices have attributes similar to NumPy arrays
print(ind.size, ind.shape, ind.ndim, ind.dtype)


4 (4,) 1 int64


In [47]:
# index values cannot be modified by assignment (immutable)
ind[1] = 12

TypeError: Index does not support mutable operations

In [53]:
# index as an ordered set 
indA = pd.Index([3,5,7,9])
indB = pd.Index([2,3,5,7,11])


In [54]:
# intersection of two indicies
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [55]:
# union of two indices
indA.union(indB)

Int64Index([2, 3, 5, 7, 9, 11], dtype='int64')

In [56]:
# symmetric difference of indices
indA.symmetric_difference(indB)

Int64Index([2, 9, 11], dtype='int64')

# Indexing and Selection

## Data Selection in Series

### Series as dictionary

In [60]:
# map a collection of keys (index) to values
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=["a", "b", "c", "d"])
data


a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [62]:
# access values through keys
data["b"]

0.5

In [63]:
# check if something exists
'a' in data

True

In [64]:
# look at the keys
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [65]:
# list the key-value pairs
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [66]:
# modify series objects
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

### Series as a 1D array

In [68]:
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [67]:
# slice by explicit index
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [69]:
# slice by implicit integer index
data[:2]

a    0.25
b    0.50
dtype: float64

**NOTE** 

When you are slicing with an ***explicit*** index (data['a':'c']), the final index is ***included***.

When you are slicing with an ***implicit*** index (data[0:4]), the final index is ***excluded***.

In [70]:
# masking
data[(data > 0.3) % (data < 0.8)]

a    0.25
a    0.25
a    0.25
a    0.25
a    0.25
dtype: float64

In [71]:
# fancy indexing
data[['a','e']]

a    0.25
e    1.25
dtype: float64

### Indexers: loc, iloc, and ix

In [73]:
data = pd.Series(['a','b','c'], index=[1,3,5])
data

1    a
3    b
5    c
dtype: object

In [74]:
# explicit index when indexing
data[1]

'a'

In [75]:
# implicit index when slicing
data[1:3]

3    b
5    c
dtype: object

Because of this confusion, Pandas provides special indexer attributes that exposes a particular slicing interface to the data in the Series.

In [None]:
# loc - explicit index