# Introduction to Pandas

In [1]:
# we need to import the tools
import numpy as np
import pandas as pd

In [5]:
# Pandas Series: a one-dimensional, labelled array whose values share a single data type
o = pd.Series([4, 7, -5, 3])
o         # the values shown alongside their default integer index
o.values  # just the values
o.index   # just the index - as the last expression, this is what the cell displays

RangeIndex(start=0, stop=4, step=1)

In [15]:
# A Series with an explicit label for each value
o2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
o2['a'] = -7  # assigning to an existing label overwrites that value
o2['e'] = 99  # assigning to a new label appends it to the Series
o2[['c', 'a']]  # careful - to select several labels, pass them as a list
o2[o2 > 6]  # boolean mask: keep only the values greater than 6
o2 ** 3  # element-wise arithmetic returns a new Series
o2  # selecting and arithmetic left o2 untouched; only the two assignments changed it

d     4
b     7
a    -7
c     3
e    99
dtype: int64

In [25]:
# Building a Series from a dict: the keys become the index labels
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
o3 = pd.Series(sdata)
o3
# or pass an explicit index to choose the labels and their order;
# a label with no entry in the dict ('California') gets NaN
states = ['Ohio', 'Texas', 'California', 'Utah', 'Oregon']
o4 = pd.Series(sdata, index=states)
o4
o4.isnull()  # True wherever a value is missing
o3 + o4  # values are matched up by label; any maths involving NaN results in NaN

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah           10000.0
dtype: float64

## The Pandas Data Frame

In [None]:
# the Data Frame is a collection of Pandas Series
# each Series is built on a NumPy array
# Data Frame is a bit like a spreadsheet
# each column is a Pandas Series


In [26]:
# Some sample data: one dict key per column, each holding that column's values
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
frame  # every column holds values of a single data type

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns;
# the text column 'state' does not appear in the result.
frame.describe() # get some statistics from our data

Unnamed: 0,year,pop
count,6.0,6.0
mean,2001.5,2.55
std,1.048809,0.836062
min,2000.0,1.5
25%,2001.0,1.875
50%,2001.5,2.65
75%,2002.0,3.125
max,2003.0,3.6


In [34]:
# head(2) returns the first two rows - nothing is displayed for it, because only the
# last expression in a cell is shown
frame.head(2)
# tail(3) returns the last three rows - this is the cell's output
frame.tail(3)

Unnamed: 0,state,year,pop
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [50]:
# Choose which columns to keep (and in what order) and give every row a label
frame2 = pd.DataFrame(
    data,
    columns=['year', 'pop', 'state'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
frame2.columns  # see what columns we have
frame2['pop']  # just the pop column - which is a Series
frame2

Unnamed: 0,year,pop,state
one,2000,1.5,Ohio
two,2001,1.7,Ohio
three,2002,3.6,Ohio
four,2001,2.4,Nevada
five,2002,2.9,Nevada
six,2003,3.2,Nevada


### Using loc and iloc to access rows

In [51]:
# loc selects by index *label* - here the row labelled 'three'
frame2.loc['three']
# iloc selects by integer *position* - here the first row, whatever its label is
frame2.iloc[0]

year     2000
pop       1.5
state    Ohio
Name: one, dtype: object

### Manipulating Data Frames

In [53]:
# we can inject new columns to augment our data
frame2['Debt'] = np.arange(6.) # nb the trailing dot in 6. makes arange produce floats
frame2
# we can also assign a Series to 'Debt': its values are matched to rows by index label
values = pd.Series([-1.2, 1.5, -2.7], index=['two', 'four', 'five'])
frame2['Debt'] = values # replaces the whole column - rows with no entry in `values` become NaN
frame2.T # transpose: rows become columns and columns become rows

Unnamed: 0,one,two,three,four,five,six
year,2000,2001,2002,2001,2002,2003
pop,1.5,1.7,3.6,2.4,2.9,3.2
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
Debt,,-1.2,,1.5,-2.7,


## Indexing Series

In [63]:
# A small Series with string labels to practise indexing on
o3 = pd.Series(data=range(3), index=list('abc'))
o3  # inspect o3.index to see just the labels

a    0
b    1
c    2
dtype: int64

In [57]:
# An Index can be built directly from an array of integers
labels = pd.Index(data=np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [65]:
# reindex returns a NEW Series arranged to match the labels we pass in; o3 itself is not changed
o4 = o3.reindex(labels) # none of the integer labels exist in o3's index, so every value comes back as NaN
o4 = o3.reindex(['c', 'a', 'b']) # these labels match the existing index, so the values are kept and simply reordered
o4

c    2
a    0
b    1
dtype: int64

In [69]:
# we can filter and slice Series
o4[ o4<2 ] # boolean mask: keeps the values below 2 (not displayed - it is not the last expression)
o4[0:3:1] # start:stop:step slice; on this string-labelled Series the integers are positions, so all three values come back

c    2
a    0
b    1
dtype: int64

## Indexing and Filtering Data Frames

In [83]:
# Build a 4x4 frame holding the integers 0-15, with labelled rows and columns.
# NB: this rebinds `data`, which earlier in the notebook held a plain dict.
i = ['Ohio', 'Colorado', 'Utah', 'New York']
c = ['one', 'two', 'three', 'four']
data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=i, columns=c)
data
# selecting a list of columns returns a new frame; `data` itself is unchanged
data[['two', 'three']]
# a boolean mask picks out rows; assigning through .loc makes it explicit that every
# column of the matching rows is overwritten in place (here only 'New York', where
# 'three' is 14)
data.loc[data['three'] > 12, :] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,0,0,0,0
