In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

## Series
A Series is a one-dimensional array-like object containing an array of data (of any NumPy data type) and an associated array of data labels, called its <i>index</i>.

In [3]:
obj = Series([4, 7, -5, 3])

In [4]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

The string representation of a Series displayed interactively shows the index on the left and the values on the right. Since we didn't specify an index for the data, a default one consisting of the integers 0 through N-1 (where N is the length of the data) is created.

In [5]:
# Array representation
obj.values

array([ 4,  7, -5,  3])

In [6]:
# Index object
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])

In [8]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [9]:
# Compared with a regular NumPy array, we can use values in the index when selecting single values or a set of values
obj2['a']

-5

In [10]:
obj2['d'] = 6

In [11]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    6
dtype: int64

In [12]:
# NumPy operations such as filtering with a boolean array, scalar multiplication, or applying math functions
# will preserve the index-value link
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [13]:
obj2[obj2 > 0]

d    6
b    7
c    3
dtype: int64

In [14]:
obj2 * 2

d    12
b    14
a   -10
c     6
dtype: int64

In [15]:
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [16]:
# Another way to think about Series is as a fixed-length, ordered dict, as it is a mapping of index values to data values.
# It can be substituted into many functions that expect a dict
'b' in obj2

True

In [17]:
'e' in obj2

False

In [18]:
# If your data is already held in a Python dict, you can create a Series from it by passing the dict
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)

In [19]:
obj3 = Series(sdata)

In [20]:
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [21]:
# When only a dict is passed, the resulting Series has the dict's keys in sorted order as its index
# (see obj3 above; NOTE(review): recent pandas releases keep the dict's insertion order instead — confirm for your version).
# Passing an explicit index selects and orders the labels; a label with no entry in the dict ('California') gets NaN.
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)

In [22]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [23]:
# NaN (not a number) is used in pandas to mark missing or NA values.
# The isnull and notnull functions in pandas should be used to detect missing data.
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [24]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [25]:
# A critical Series feature is that it automatically aligns differently indexed data in arithmetic operations
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [26]:
# Both the Series object itself and its index have a name attribute, which integrates with other key areas of pandas functionality
obj4.name = 'population'

In [27]:
obj4.index.name = 'state'

In [28]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [29]:
# A Series's index can be altered in place by assignment
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']

In [30]:
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

## DataFrame
A DataFrame represents a tabular, spreadsheet-like data structure containing an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.). The DataFrame has both a row and a column index; it can be thought of as a dict of Series (one per column, all sharing the same index).

In [31]:
# One way to build a DataFrame: pass a dict that maps column names to equal-length lists
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)
frame = DataFrame(data)

In [32]:
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [33]:
# Specify a sequence of columns
DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [34]:
# As with Series, a requested column that is not present in data ('debt' here) appears filled with NA values
frame2 = DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five'],
)

In [35]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [36]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [37]:
# A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [38]:
frame2.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [39]:
# Rows can also be retrieved by position or name with a couple of methods; the loc indexer used here selects a row by its label
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [40]:
# Columns can be modified by assignment
frame2['debt'] = 16.5

In [41]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [42]:
frame2['debt'] = np.arange(5.)

In [43]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0


In [44]:
# When assigning a list or array to a column, the value's length must match the length of the DataFrame.
# A Series, by contrast, is matched to the DataFrame by index label, and rows with no matching label get NaN.
# NOTE(review): 'five0' is not a label of frame2 (possibly a typo for 'five'), so only 'two' and 'four' receive values.
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five0'])

In [45]:
frame2['debt'] = val

In [46]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,


In [47]:
# Assigning a column that doesn't exist will create a new column.
# The del keyword will delete columns as with a dict
frame2['eastern'] = frame2.state == 'Ohio'

In [48]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,,False


In [49]:
del frame2['eastern']

In [50]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [51]:
# Nested dicts are another common input format: outer keys name the columns, inner keys the rows
pop = dict(
    Nevada={2001: 2.4, 2002: 2.9},
    Ohio={2000: 1.5, 2001: 1.7, 2002: 3.6},
)

In [52]:
frame3 = DataFrame(pop)

In [53]:
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [54]:
# Transpose the Result
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [55]:
# The keys in the inner dicts are unioned and sorted to form the index in the result.
# This is not true if an explicit index is specified
DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [56]:
# Dicts of Series are treated in much the same way; each value here is a slice of one of frame3's columns
pdata = {
    'Ohio': frame3['Ohio'][:-1],
    'Nevada': frame3['Nevada'][:2]
}

In [57]:
DataFrame(pdata)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


In [58]:
# Name the row index and the column index; both names show up when frame3 is displayed
frame3.index.name = 'year'
frame3.columns.name = 'state'

In [59]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [60]:
# Like Series, the values attribute returns the data contained in the DataFrame as a 2D ndarray
frame3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

In [61]:
# If the DataFrame's columns have different dtypes,
# the dtype of the values array will be chosen to accommodate all of the columns
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, nan]], dtype=object)

## Index Objects
pandas's Index objects are responsible for holding the axis labels and other metadata (like the axis name or names). Any array or other sequence of labels used when constructing a Series or DataFrame is internally converted to an Index.

In [62]:
obj = Series(range(3), index=['a', 'b', 'c'])

In [63]:
index = obj.index

In [64]:
index

Index(['a', 'b', 'c'], dtype='object')

In [65]:
index[1:]

Index(['b', 'c'], dtype='object')

In [66]:
# Index objects are immutable and thus can't be modified by the user.
# The assignment below is expected to fail. The TypeError is caught and printed
# so the failure is still shown without stopping a "Restart & Run All".
try:
    index[1] = 'd'
except TypeError as exc:
    print('TypeError:', exc)

TypeError: Index does not support mutable operations

In [67]:
# Immutability is important so that index objects can be safely shared among data structures

In [68]:
index = pd.Index(np.arange(3))

In [69]:
obj2 = Series([1.5, -2.5, 0], index=index)

In [70]:
obj2.index is index

True

In [71]:
# In addition to being array-like, an Index also functions as a fixed-size set

In [72]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [73]:
'Ohio' in frame3.columns

True

In [74]:
2003 in frame3.index

False