## Pandas and Numerical Data Analysis

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import the two main pandas classes directly so they can be used without the `pd.` prefix
from pandas import Series, DataFrame

In [6]:
# pandas brings its own data structures: Series and DataFrame.
# Build a Series from a plain list; no index is passed, so the members
# get an implicit one.
s = pd.Series(data=[4, 7, -5, 3])
# Only the final expression of a cell is echoed, so of the three
# inspections below just the index appears in the output.
s         # the Series itself
s.values  # the members as an array
s.index   # the implicit index, a RangeIndex

RangeIndex(start=0, stop=4, step=1)

In [11]:
# Rebind `s` to a fresh Series and look at its members as an array.
s = Series(data=[6, 5, 4, 3, 2])
s.values

array([6, 5, 4, 3, 2], dtype=int64)

In [18]:
# A Series can carry caller-supplied labels instead of the implicit index.
s2 = Series(data=[4, 7, -5, 3], index=list('dbac'))
s2.index   # the labels
s2.values  # the members
s2         # labels and members together
s2['a']    # lookup by label -> -5 (the only expression echoed by the cell)

-5

In [22]:
# Boolean-mask selection: keep only the members greater than zero.
s2.loc[s2 > 0]

d    4
b    7
c    3
dtype: int64

In [24]:
# Arithmetic is applied element-wise to every member, with no explicit
# loop; the labels are carried over to the result.
s2.pow(3)

d     64
b    343
a   -125
c     27
dtype: int64

In [27]:
# Membership tests on a Series look at its index labels, not its values
# (`'f' in s2` is the shorthand for the explicit form used here).
'b' in s2.index  # 'b' is one of the labels; result not echoed (not the last expression)
'f' in s2.index  # 'f' is not a label -> False

False

In [29]:
# A plain Python dict converts directly into a Series:
# the keys become the index labels and the values become the members.
s_dict = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
s3 = Series(data=s_dict)
s3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [35]:
# Build a second Series from the same dict, this time with an explicit
# list of labels. 'Utah' is left out of the list, and 'California' has no
# entry in s_dict, so its member comes out as NaN.
states = ['California', 'Ohio', 'Oregon', 'Texas']
s4 = Series(data=s_dict, index=states)
s4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [41]:
# NaN marks a missing number. A Series keeps one consistent data type for
# all of its members, so the missing entry is itself stored as a numeric
# (floating-point) value rather than as some separate "empty" object.
type(s4.loc['California'])  # the NaN member is a float

numpy.float64

In [46]:
# Locate missing members with boolean masks.
# Only the last mask is echoed by the cell.
s4.isnull()   # True where a member is missing ("null")
s4.isna()     # same check under the "not available" spelling
s4.notnull()  # the inverse: True where a member is present

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [49]:
# Adding two Series matches members by label; a label that is missing
# (or NaN) on either side gives NaN in the result.
s4.add(s3)

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [51]:
# Both the Series and its index can be given a name; each shows up in the
# repr. Note that these assignments modify s3 in place.
s3.name = 'Efficiency'
s3.index.name = 'State'
s3

State
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
Name: Efficiency, dtype: int64