In [1]:
from pandas import Series, DataFrame, Index
import numpy as np

## Data Structures: Series, DataFrames and objects

### Series
    A one-dim array-like object containing an array of data and associated array of labels.

In [2]:
Series([1,2,3]) # no index specified

0    1
1    2
2    3
dtype: int64

In [3]:
obj2 = Series([1,2,3], index=["a","b","c"])
obj2

a    1
b    2
c    3
dtype: int64

###### The values and indexes are stored in .index and .values attributes

In [4]:
print "Indexes: ", obj2.index, "Values: ", obj2.values

Indexes:  Index([u'a', u'b', u'c'], dtype='object') Values:  [1 2 3]


##### Accessing elements

In [5]:
print obj2['a']
obj2['a'] = 10

print obj2[['a', 'b']]

1
a    10
b     2
dtype: int64


##### Numpy array operations preserve the index

In [6]:
print obj2[obj2>0]

a    10
b     2
c     3
dtype: int64


In [7]:
print obj2*2

a    20
b     4
c     6
dtype: int64


##### It can be also used in functions that expect a dict
You can think about it as a fixed-length ordered dict

In [8]:
'b' in obj2

True

In [9]:
obj3 = Series({'a': 2, 'b': 3, 'c': 4})
print obj3

a    2
b    3
c    4
dtype: int64


The problem with passing a dict is that the order of the indexes may be different.
So you can pass a list of indexes to specify the order.

In [10]:
obj4 = Series({'a': 2, 'b': 3, 'c': 4}, index=['c', 'b', 'a', 'x'])
print obj4

c    4.0
b    3.0
a    2.0
x    NaN
dtype: float64


'x' is not in the data dict so it is created as NaN. You can find missing data with .isnull function. There is also .notnull to find no null objects in the serie.

In [11]:
# The order can be changed after it's created. 
# but you can NOT add an inexistent value as we did with 'x'
obj5 = Series({'a': 2, 'b': 3, 'c': 4})
obj5.index = ['c', 'b', 'a']
print obj4

c    4.0
b    3.0
a    2.0
x    NaN
dtype: float64


In [12]:
obj4.isnull()

c    False
b    False
a    False
x     True
dtype: bool

In [13]:
obj4.notnull()

c     True
b     True
a     True
x    False
dtype: bool

##### Arithmetic operations

In [14]:
obj4 + obj4

c    8.0
b    6.0
a    4.0
x    NaN
dtype: float64

#### The .name property

In [15]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
c    4.0
b    3.0
a    2.0
x    NaN
Name: population, dtype: float64

### DataFrame
        A spreadsheet-like data structure containing an ordered collection of columns. It has rows and column indexes.
        It can be thought as a dict of series.

In [16]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)

In [17]:
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


you can pass the column order and the index if you want to use a different one

In [18]:
frame2 = DataFrame(data, columns=['year', 'pop', 'state', 'debt'], 
                   index=['one','two','three','four','five',])
frame2

Unnamed: 0,year,pop,state,debt
one,2000,1.5,Ohio,
two,2001,1.7,Ohio,
three,2002,3.6,Ohio,
four,2001,2.4,Nevada,
five,2002,2.9,Nevada,


##### Retrieving **columns** as Series

In [19]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [20]:
frame2.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

##### Retrieving **rows** as series.
Use .ix

In [21]:
frame2.ix['three']

year     2002
pop       3.6
state    Ohio
debt      NaN
Name: three, dtype: object

##### Assigning values

In [22]:
frame2.debt = 16.5
frame2

Unnamed: 0,year,pop,state,debt
one,2000,1.5,Ohio,16.5
two,2001,1.7,Ohio,16.5
three,2002,3.6,Ohio,16.5
four,2001,2.4,Nevada,16.5
five,2002,2.9,Nevada,16.5


In [23]:
frame2['debt'] = np.arange(5) # the length must match the length of the DataFrame
frame2

Unnamed: 0,year,pop,state,debt
one,2000,1.5,Ohio,0
two,2001,1.7,Ohio,1
three,2002,3.6,Ohio,2
four,2001,2.4,Nevada,3
five,2002,2.9,Nevada,4


Assigning a column that doesn't exist will create a new column.

In [24]:
frame2['eastern'] = frame2.state == 'Ohio'

In [25]:
frame2

Unnamed: 0,year,pop,state,debt,eastern
one,2000,1.5,Ohio,0,True
two,2001,1.7,Ohio,1,True
three,2002,3.6,Ohio,2,True
four,2001,2.4,Nevada,3,False
five,2002,2.9,Nevada,4,False


But it won't work if you do assign it as a property

In [26]:
frame2.eastern2 = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,pop,state,debt,eastern
one,2000,1.5,Ohio,0,True
two,2001,1.7,Ohio,1,True
three,2002,3.6,Ohio,2,True
four,2001,2.4,Nevada,3,False
five,2002,2.9,Nevada,4,False


In [27]:
frame2.eastern2

one       True
two       True
three     True
four     False
five     False
Name: state, dtype: bool

In [28]:
del frame2.eastern2
del frame2['eastern']
frame2.columns

Index([u'year', u'pop', u'state', u'debt'], dtype='object')

The columns are views on the underlying data, not a copy. Any, in-place modifications to the Series will be reflected in the DataFrame.

##### Creating DataFrames as dict of dicts

In [29]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [30]:
# transpose
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


##### .name properties

In [31]:
frame3.index.name = 'year'
frame3.columns.name= 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


##### .values attribute

In [32]:
frame3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

### Index objects

    Any array or other sequence used when constructing a Series or DataFrame is internally converted to an Index

In [33]:
obj = Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index

Index([u'a', u'b', u'c'], dtype='object')

Indexes are immutable. They can't be modified by the user.

In [34]:
index[1] = 'd'

TypeError: Index does not support mutable operations

#### Main index objects
**Index:** The most general Index object, representing axis labels in a NumPy array of Python objects. 

**Int64Index:** Specialized Index for integer values.

**MultiIndex:** “Hierarchical” index object representing multiple levels of indexing on a single axis. Can be thought of as similar to an array of tuples.

**DatetimeIndex:** Stores nanosecond timestamps (represented using NumPy’s datetime64 dtype).

**PeriodIndex:** Specialized Index for Period data (timespans).
  

#### An index also functions as a fixed-size set.

In [36]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [37]:
frame3.index

Int64Index([2000, 2001, 2002], dtype='int64', name=u'year')

In [41]:
2002 in frame3.index

True

## Essential functionalify

##### Reindexing .reindex
Calling reindex rearranges the data accorinng to the new index introducing missing values.

In [44]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])

In [46]:
obj.reindex(['a','b','c','d','e',], fill_value=0) # 0 for non-existent values

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [47]:
obj.reindex(['a','b','c','d','e',], fill_value=0) # 0 for non-existent values

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

For ordered data like time series, it may be desirable to do some interpolation or filling of values when reindexing. 
The method option allows us to do this, using a method such as ffill which forward fills the values

In [51]:
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

#### Droping indexes

In [53]:
obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj.drop('b')

a    0.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [54]:
obj.drop(['a', 'c'])

b    1.0
d    3.0
e    4.0
dtype: float64

On DataFrames

In [59]:
data = DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [57]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [56]:
data.drop(['two', 'four'], axis=1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


#### Indexing, selection, and filtering

In [62]:
data = DataFrame(np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'])

In [63]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [64]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [71]:
data[data['one'] > 5]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [72]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


#### For rows

In [78]:
data.ix ['Colorado', 'two']

5

In [79]:
data.ix['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

### Operations between DataFrame and Series

In [82]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [85]:
arr - arr[0] ## the arrs has different sizes, broadcasting is being used

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

In [88]:
## This is similar to operations between frames and series

frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [89]:
series = frame.ix[0]
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [90]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


By default, arithmetic between DataFrame and Series matches the index of the Series on the DataFrame's columns, broadcasting down the rows