# Pandas

In [26]:
# we need the usual imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd # then pd.Series, pd.DataFrame etc.
from pandas import Series, DataFrame

### CAREFUL
- Make sure to run all the cells above (starting with the imports) before running any other cell!

In [4]:
# A Series is a one-dimensional labelled array whose elements share one dtype.
obj = pd.Series([4, 7, -5, 3])
obj # no index was given, so the labels default to 0..3

0    4
1    7
2   -5
3    3
dtype: int64

In [6]:
# Only the last expression of a cell is echoed, so just obj.index appears below.
obj.values # a numpy array holding the data
obj.index # the labels; a default RangeIndex here

RangeIndex(start=0, stop=4, step=1)

In [17]:
# Supply explicit labels instead of the default 0..n-1 positions.
obj2 = pd.Series(data=[4, 7, -5, 3], index=list('dbac'))
obj2['a']          # look a value up by its label
obj2['c'] = 99     # existing label -> the value is overwritten in place
obj2['e'] = 88     # unknown label -> a new entry is appended
obj2[['c', 'a']]   # a list of labels selects several members at once
obj2[obj2 > 6]     # boolean mask keeps only the matching members
obj2 * 2           # arithmetic is applied element-wise
'b' in obj2        # membership is tested against the labels, not the values

True

### using series

In [23]:
# A dict maps straight onto a Series: keys become labels, values become data.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [24]:
# Passing an index alongside a dict keeps only the listed labels, in that order.
states = ['Ohio', 'Texas', 'Oregon', 'California']
obj4 = pd.Series(data=sdata, index=states)
obj4 # 'Utah' is dropped (not listed); 'California' has no value, so it becomes NaN

Ohio          35000.0
Texas         71000.0
Oregon        16000.0
California        NaN
dtype: float64

In [27]:
# Element-wise missing-value test: returns a boolean Series with obj4's labels.
pd.isnull(obj4) # identify members which evaluate as null (NaN)

Ohio          False
Texas         False
Oregon        False
California     True
dtype: bool

In [28]:
# Arithmetic aligns the two Series on their labels first. A label that is
# missing from one operand, or whose value is NaN, yields NaN in the result
# (here: Utah and California).
obj3 + obj4 # any maths on NaN gives NaN

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

## DataFrame

In [None]:
# the data frame is a collection of series
# and each series is backed by a numpy array
# data frames are similar to spreadsheets

In [45]:
# Column-oriented input: each dict key becomes a column, each list its values.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
frame          # rendered as a table when it is the cell's last expression
frame.head(3)  # first 3 rows (head() on its own returns 5)
frame.tail()   # last 5 rows by default

# Same data again, this time with an explicit column order and row labels.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'pop', 'state'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
frame.columns  # the column labels; each column is a Series object
frame['pop']   # pick one column (a Series) out of the data frame

0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
5    3.2
Name: pop, dtype: float64

In [47]:
# .loc selects by index LABEL; a single label returns that row as a Series
# whose own index is the frame's column names.
frame2.loc['three'] # pick a ROW (a location)

year     2002
pop       3.6
state    Ohio
Name: three, dtype: object

In [49]:
# .iloc selects by integer POSITION (0-based), regardless of the index labels.
frame2.iloc[0] # iloc is integer-location, i.e. rows by position; 0 is the row labelled 'one'

year     2000
pop       1.5
state    Ohio
Name: one, dtype: object

In [60]:
# Assigning to an unknown column name creates that column (a new Series).
frame2['debt'] = np.arange(6.0)
# We can pick and choose which rows receive a value: assigning a Series aligns
# it on the row labels, and rows it does not mention end up as NaN.
values = pd.Series({'two': -1.2, 'four': 1.5, 'five': -1.7})
frame2['debt'] = values
frame2.T # transposed view: rows and columns swap places

Unnamed: 0,one,two,three,four,five,six
year,2000,2001,2002,2001,2002,2003
pop,1.5,1.7,3.6,2.4,2.9,3.2
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
debt,,-1.2,,1.5,-1.7,


### Indexing

In [63]:
# Rebinds obj3 to a small Series holding 0, 1, 2 under the labels 'a', 'b', 'c'.
obj3 = pd.Series(range(3), index=list('abc'))
obj3.index # the labels live in their own Index object

Index(['a', 'b', 'c'], dtype='object')

In [64]:
# An Index can also be built on its own, here from the integers 0, 1, 2.
labels = pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [75]:
# we can reindex: reindex returns a NEW Series arranged to match the requested
# labels and leaves the Series it was called on untouched
obj3.reindex(labels) # result discarded; presumably all NaN, since obj3 is labelled 'a'..'c' rather than 0..2
obj3 = obj3.reindex(['c', 'a', 'b']) # the changes only persist if you assign them to something
obj3

c    2
a    0
b    1
dtype: int64

### filtering

In [78]:
# Rebinds obj4 to a float Series 0.0..3.0 labelled 'a'..'d' to demonstrate the
# different ways of selecting members.
obj4 = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
# Positional access goes through .iloc. A bare integer key such as obj4[1] on a
# string-labelled Series is deprecated (FutureWarning since pandas 2.1) because
# [] is meant to look up labels, so it must not be relied on.
obj4.iloc[1] # by position; the same element as obj4['b'] (by label)
obj4.iloc[2:4] # positional slice, end exclusive -> the members labelled 'c' and 'd'
obj4[obj4<2] # boolean mask: keep only the values below 2
# A label slice walks the index in its stored order. Assuming obj3 still has the
# index ['c', 'a', 'b'] from the reindex cell, 'b' sits after 'c', so the slice
# 'b':'c' selects nothing and the result is an empty Series.
obj3['b':'c'] # there is no series from b to c

Series([], dtype: int64)

In [80]:
# Label slices include BOTH endpoints, so 'a', 'b' and 'c' are all overwritten
# with 5; only 'd' keeps its value.
obj4['a':'c'] = 5
obj4

a    5.0
b    5.0
c    5.0
d    3.0
dtype: float64

### filtering data frame

In [87]:
# NOTE: rebinds `data` (the column dict used in the DataFrame section) to a
# 4x4 frame holding 0..15 with named rows and columns.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data[['two', 'three']]      # careful with brackets here: the inner list names several columns
data[data['three'] > 5]     # filtering: keep the rows whose 'three' value exceeds 5
# Can be useful for dealing with outlier data: a boolean row mask on the
# left-hand side overwrites the matching rows across ALL columns.
data.loc[data['three'] > 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,0,0,0,0
Utah,0,0,0,0
New York,0,0,0,0
