## Pandas: Python And Numeric Data Analysis Services

In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
np.random.seed(12345)
import matplotlib.pyplot as plt

In [2]:
# Build a Series from a plain Python list (no index given, so a default one is used)
s = pd.Series(data=[4, 7, -5, 3])
# Show the Series itself, its underlying value array, and its index in one tuple
(s, s.values, s.index)

(0    4
 1    7
 2   -5
 3    3
 dtype: int64,
 array([ 4,  7, -5,  3], dtype=int64),
 RangeIndex(start=0, stop=4, step=1))

In [3]:
# Index labels need not be integers: here each value gets a string label
s2 = pd.Series(data=[4, 7, -5, 3], index=list('dbac'))
(s2, s2.index)

(d    4
 b    7
 a   -5
 c    3
 dtype: int64,
 Index(['d', 'b', 'a', 'c'], dtype='object'))

In [4]:
# Series are mutable: assigning to an existing label overwrites that entry in place
s2.loc['a'] = -99
s2

d     4
b     7
a   -99
c     3
dtype: int64

In [5]:
# Boolean-mask selection: keep only the strictly positive entries
s2[s2.gt(0)]

d    4
b    7
c    3
dtype: int64

In [6]:
s2.pow(3)   # element-wise cube; evaluated but not displayed (only the last expression is shown)
np.exp(s2)  # NumPy functions apply element-wise and the index labels are kept

d    5.459815e+01
b    1.096633e+03
a    1.011221e-43
c    2.008554e+01
dtype: float64

In [7]:
# Membership on a Series tests the index labels, not the values
'b' in s2.index  # True (not displayed: only the last expression is shown)
'f' in s2.index

False

## Working with Series

In [8]:
# Build a Series straight from a dict: the keys become the index labels
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
s3 = pd.Series(data=sdata)
s3
# An explicit index selects and orders the entries: a label that is not a key of
# the dict ('California') comes out as NaN, and a key that is not listed ('Utah')
# is left out
states = ['California', 'Ohio', 'Oregon', 'Texas']
s4 = pd.Series(data=sdata, index=states)
s4


California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [9]:
# NaN is itself a numeric value: the missing entry is stored as a float64
type(s4.loc['California'])

numpy.float64

In [10]:
# data cleansing: locate missing values with element-wise null tests
s4.isnull()   # True where the value is missing (not displayed: only the last expression is shown)
s4.notnull()  # the complementary mask

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [11]:
# Combine two Series: values are matched by index label, and a label that has no
# usable value in both operands yields NaN.
# The sum is only displayed, not stored: neither s3 nor s4 is modified.
s4.add(s3)

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [12]:
# Label the structure itself: both the index and the Series can carry a name,
# and both names appear in the printed representation
s3.index.name = 'State'
s3.name = 'Efficiency'
s3

State
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
Name: Efficiency, dtype: int64

## Pandas also has a DataFrame structure

In [13]:
# A DataFrame is a collection of one or more Pandas series
# any column of a DataFrame MUST contain a single data type


In [14]:
# Column-oriented input: each dict key names a column, each list holds that column's values
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
# Turn the dict into a DataFrame
df = pd.DataFrame(data=data)
# Quick-look helpers (evaluated but not displayed: only the last expression is shown)
(df.head(3), df.tail())  # first three rows / trailing rows
df.count()               # also min, max, mean etc.
df.describe()            # describe can be called on any data frame
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [15]:
# Re-build the frame with an explicit column order and custom row labels.
# 'debt' is requested as a column although `data` has no such key.
df2 = DataFrame(
    data=data,
    index=['oldest', 'old', 'middle', 'recent', 'penultimate', 'latest'],
    columns=['year', 'pop', 'state', 'debt'],
)
df2
# df2.year  # attribute access returns that column as a Series

Unnamed: 0,year,pop,state,debt
oldest,2000,1.5,Ohio,
old,2001,1.7,Ohio,
middle,2002,3.6,Ohio,
recent,2001,2.4,Nevada,
penultimate,2002,2.9,Nevada,
latest,2003,3.2,Nevada,


In [16]:
# Rows are selected by label with 'loc'; a single row comes back as a Series
# whose index is the frame's column names
df2.loc['middle', :]

year     2002
pop       3.6
state    Ohio
debt      NaN
Name: middle, dtype: object

In [17]:
# Populate the 'debt' column.
# Bracket notation is used instead of `df2.debt = ...`: attribute assignment only
# updates a column that already exists; if the column were absent, pandas would set
# a plain Python attribute on the object rather than create a column.
df2['debt'] = 26.5           # a scalar is broadcast to every row
df2['debt'] = np.arange(6.)  # one value per row; we have floats as the data type
df2

Unnamed: 0,year,pop,state,debt
oldest,2000,1.5,Ohio,0.0
old,2001,1.7,Ohio,1.0
middle,2002,3.6,Ohio,2.0
recent,2001,2.4,Nevada,3.0
penultimate,2002,2.9,Nevada,4.0
latest,2003,3.2,Nevada,5.0


In [18]:
# Selectively populate values: assigning a Series to a column aligns on index labels,
# so rows whose label is absent from `vals` ('old', 'middle', 'penultimate') become NaN
vals = pd.Series([-1.2, -1.5, -1.7], index=['oldest', 'latest', 'recent'])
vals
# Bracket notation instead of `df2.debt = vals`: attribute assignment would only
# set a plain attribute (not a column) if 'debt' did not already exist
df2['debt'] = vals
df2

Unnamed: 0,year,pop,state,debt
oldest,2000,1.5,Ohio,-1.2
old,2001,1.7,Ohio,
middle,2002,3.6,Ohio,
recent,2001,2.4,Nevada,-1.7
penultimate,2002,2.9,Nevada,
latest,2003,3.2,Nevada,-1.5


In [23]:
# Insert additional data columns; new columns MUST be created with square-bracket notation
df2['eastern'] = df2['state'].eq('Ohio')  # boolean flag: True where the state is Ohio
df2
# mini-challenge - how to show debt per population
# careful: `pop` has to be reached with brackets, because `df2.pop` is a DataFrame method
df2['ratio'] = df2['debt'].div(df2['pop'])
df2

Unnamed: 0,year,pop,state,debt,eastern,ratio
oldest,2000,1.5,Ohio,-1.2,True,-0.8
old,2001,1.7,Ohio,,True,
middle,2002,3.6,Ohio,,True,
recent,2001,2.4,Nevada,-1.7,False,-0.708333
penultimate,2002,2.9,Nevada,,False,
latest,2003,3.2,Nevada,-1.5,False,-0.46875


In [24]:
# A DataFrame can be transposed: row labels become column names and vice versa
df2.transpose()

Unnamed: 0,oldest,old,middle,recent,penultimate,latest
year,2000,2001,2002,2001,2002,2003
pop,1.5,1.7,3.6,2.4,2.9,3.2
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
debt,-1.2,,,-1.7,,-1.5
eastern,True,True,True,False,False,False
ratio,-0.8,,,-0.708333,,-0.46875
