# 10 Minutes to pandas

In [4]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import matplotlib.pyplot as plt

## Object Creation

Create a **Series** by passing a list of values, letting pandas create a default integer index: 

In [5]:
# Build a Series from a plain list; np.nan marks a missing entry and no
# explicit index is supplied.
s = pd.Series(data=[1, 2, 3, np.nan, 6, 8])

In [6]:
# Call form of print: produces the same text on Python 2 and Python 3
# (the bare `print s` statement is a SyntaxError on Python 3).
print(s)

0    1.0
1    2.0
2    3.0
3    NaN
4    6.0
5    8.0
dtype: float64


 Create a **DataFrame** by passing a numpy array, with a datetime index and labeled columns:

In [7]:
# Six consecutive timestamps beginning on 2013-01-01.
dates = pd.date_range(start='20130101', periods=6)

In [8]:
# Call form of print works on both Python 2 and Python 3
# (the bare `print dates` statement is a SyntaxError on Python 3).
print(dates)  # show the DatetimeIndex held in `dates`

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


In [9]:
# 6x4 block of random numbers, with `dates` as the row index and the
# letters A-D as column labels.
df = pd.DataFrame(
    data=np.random.rand(6, 4),
    index=dates,
    columns=list('ABCD'),
)

In [10]:
# Bare expression: the notebook shows df as this cell's output.
df

Unnamed: 0,A,B,C,D
2013-01-01,0.000795,0.076557,0.672962,0.496449
2013-01-02,0.873698,0.626829,0.622865,0.691792
2013-01-03,0.27969,0.151588,0.781833,0.931227
2013-01-04,0.023688,0.518364,0.800006,0.290771
2013-01-05,0.057256,0.878302,0.253035,0.845052
2013-01-06,0.571096,0.195835,0.82522,0.615333


Create a DataFrame by passing a dict of objects that can be converted to series-like objects:

In [14]:
# Each dict entry becomes one column; the scalar entries ('A', 'B', 'F')
# are repeated on every row alongside the four-element array-likes.
df2 = pd.DataFrame(
    data={
        'A': 1.,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)

In [12]:
# Bare expression: the notebook shows df2 as this cell's output.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [21]:
df2.dtypes # dtype of each column; the columns of df2 have different dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data

In [24]:
# Call form of print works on both Python 2 and Python 3
# (the bare `print df` statement is a SyntaxError on Python 3).
print(df)

                   A         B         C         D
2013-01-01  0.000795  0.076557  0.672962  0.496449
2013-01-02  0.873698  0.626829  0.622865  0.691792
2013-01-03  0.279690  0.151588  0.781833  0.931227
2013-01-04  0.023688  0.518364  0.800006  0.290771
2013-01-05  0.057256  0.878302  0.253035  0.845052
2013-01-06  0.571096  0.195835  0.825220  0.615333


In [25]:
df.head() # first rows of the DataFrame (five when no count is given)

Unnamed: 0,A,B,C,D
2013-01-01,0.000795,0.076557,0.672962,0.496449
2013-01-02,0.873698,0.626829,0.622865,0.691792
2013-01-03,0.27969,0.151588,0.781833,0.931227
2013-01-04,0.023688,0.518364,0.800006,0.290771
2013-01-05,0.057256,0.878302,0.253035,0.845052


In [27]:
df.tail(3) # last three rows of the DataFrame

Unnamed: 0,A,B,C,D
2013-01-04,0.023688,0.518364,0.800006,0.290771
2013-01-05,0.057256,0.878302,0.253035,0.845052
2013-01-06,0.571096,0.195835,0.82522,0.615333


Display the index, columns and the underlying numpy data

In [28]:
df.index # row labels: the DatetimeIndex that df was built with

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [29]:
df.columns # column labels

Index([u'A', u'B', u'C', u'D'], dtype='object')

In [30]:
df.values # the data as a 2-D NumPy array, without the row/column labels

array([[  7.94523207e-04,   7.65566649e-02,   6.72962227e-01,
          4.96448830e-01],
       [  8.73698026e-01,   6.26829326e-01,   6.22864815e-01,
          6.91792229e-01],
       [  2.79690455e-01,   1.51588436e-01,   7.81832789e-01,
          9.31227069e-01],
       [  2.36880528e-02,   5.18364408e-01,   8.00006144e-01,
          2.90771328e-01],
       [  5.72555360e-02,   8.78301682e-01,   2.53034764e-01,
          8.45051892e-01],
       [  5.71095682e-01,   1.95835019e-01,   8.25219509e-01,
          6.15332907e-01]])

You can also get a quick statistical summary of your data:

In [31]:
df.describe() # count, mean, std, min, quartiles and max for each column

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.301037,0.407913,0.65932,0.645104
std,0.354352,0.316817,0.213966,0.233479
min,0.000795,0.076557,0.253035,0.290771
25%,0.03208,0.16265,0.635389,0.52617
50%,0.168473,0.3571,0.727398,0.653563
75%,0.498244,0.599713,0.795463,0.806737
max,0.873698,0.878302,0.82522,0.931227


In [34]:
df.T # transpose: the dates become the column labels; df itself is not modified

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.000795,0.873698,0.27969,0.023688,0.057256,0.571096
B,0.076557,0.626829,0.151588,0.518364,0.878302,0.195835
C,0.672962,0.622865,0.781833,0.800006,0.253035,0.82522
D,0.496449,0.691792,0.931227,0.290771,0.845052,0.615333


In [33]:
# Call form of print works on both Python 2 and Python 3
# (the bare `print df` statement is a SyntaxError on Python 3).
print(df)

                   A         B         C         D
2013-01-01  0.000795  0.076557  0.672962  0.496449
2013-01-02  0.873698  0.626829  0.622865  0.691792
2013-01-03  0.279690  0.151588  0.781833  0.931227
2013-01-04  0.023688  0.518364  0.800006  0.290771
2013-01-05  0.057256  0.878302  0.253035  0.845052
2013-01-06  0.571096  0.195835  0.825220  0.615333


In [36]:
df.sort_index(axis=1, ascending=False) # sort by column labels (axis=1), descending: D, C, B, A

Unnamed: 0,D,C,B,A
2013-01-01,0.496449,0.672962,0.076557,0.000795
2013-01-02,0.691792,0.622865,0.626829,0.873698
2013-01-03,0.931227,0.781833,0.151588,0.27969
2013-01-04,0.290771,0.800006,0.518364,0.023688
2013-01-05,0.845052,0.253035,0.878302,0.057256
2013-01-06,0.615333,0.82522,0.195835,0.571096


In [37]:
df.sort_values(by='B') # reorder the rows by the values in column B, smallest first

Unnamed: 0,A,B,C,D
2013-01-01,0.000795,0.076557,0.672962,0.496449
2013-01-03,0.27969,0.151588,0.781833,0.931227
2013-01-06,0.571096,0.195835,0.82522,0.615333
2013-01-04,0.023688,0.518364,0.800006,0.290771
2013-01-02,0.873698,0.626829,0.622865,0.691792
2013-01-05,0.057256,0.878302,0.253035,0.845052


## Selection

Standard Python/NumPy expressions for selecting and setting data come in handy for interactive work, but for production code it is advised to use the optimized pandas data access methods: .at, .iat, .loc, .iloc and .ix (note that .ix is deprecated in newer pandas releases in favour of .loc and .iloc). You can find more about these in the indexing documentation.

### Getting

In [38]:
df['A'] # select a single column; the result is a Series named A

2013-01-01    0.000795
2013-01-02    0.873698
2013-01-03    0.279690
2013-01-04    0.023688
2013-01-05    0.057256
2013-01-06    0.571096
Freq: D, Name: A, dtype: float64

In [39]:
df.A # attribute-style access, equivalent to df['A']

2013-01-01    0.000795
2013-01-02    0.873698
2013-01-03    0.279690
2013-01-04    0.023688
2013-01-05    0.057256
2013-01-06    0.571096
Freq: D, Name: A, dtype: float64

We can also do NumPy-style slicing operations:

In [40]:
df[0:3] # rows at positions 0, 1 and 2 (the end position 3 is excluded)

Unnamed: 0,A,B,C,D
2013-01-01,0.000795,0.076557,0.672962,0.496449
2013-01-02,0.873698,0.626829,0.622865,0.691792
2013-01-03,0.27969,0.151588,0.781833,0.931227


We can also use the index to slice the dataframe

In [41]:
df['20130102':'20130104'] # slice rows by index label; both endpoint dates are included

Unnamed: 0,A,B,C,D
2013-01-02,0.873698,0.626829,0.622865,0.691792
2013-01-03,0.27969,0.151588,0.781833,0.931227
2013-01-04,0.023688,0.518364,0.800006,0.290771


### Selecting by label

In [42]:
# Call form of print works on both Python 2 and Python 3
# (the bare `print df` statement is a SyntaxError on Python 3).
print(df)

                   A         B         C         D
2013-01-01  0.000795  0.076557  0.672962  0.496449
2013-01-02  0.873698  0.626829  0.622865  0.691792
2013-01-03  0.279690  0.151588  0.781833  0.931227
2013-01-04  0.023688  0.518364  0.800006  0.290771
2013-01-05  0.057256  0.878302  0.253035  0.845052
2013-01-06  0.571096  0.195835  0.825220  0.615333


In [43]:
df.loc[dates[0]] # the row labelled with the first date, returned as a Series

A    0.000795
B    0.076557
C    0.672962
D    0.496449
Name: 2013-01-01 00:00:00, dtype: float64

Selecting on a multi-axis by label

In [44]:
df.loc[:,['A','B']] # all rows, restricted to columns A and B

Unnamed: 0,A,B
2013-01-01,0.000795,0.076557
2013-01-02,0.873698,0.626829
2013-01-03,0.27969,0.151588
2013-01-04,0.023688,0.518364
2013-01-05,0.057256,0.878302
2013-01-06,0.571096,0.195835


We can do the same using the date index for slicing. Notice that both endpoints are included.

In [45]:
# Label slice over the dates (both endpoints included), columns A and B only.
df.loc['20130102':'20130105',['A','B']]

Unnamed: 0,A,B
2013-01-02,0.873698,0.626829
2013-01-03,0.27969,0.151588
2013-01-04,0.023688,0.518364
2013-01-05,0.057256,0.878302


If only a single index label is selected, the number of dimensions of the result is reduced. For example:

In [46]:
df.loc['20130103',['A','B']] # a single row label: the result is a Series rather than a DataFrame

A    0.279690
B    0.151588
Name: 2013-01-03 00:00:00, dtype: float64

In [47]:
# A single row label and a single column label: the result is a scalar.
df.loc['20130103','A']

0.27969045520844227

An even faster way to get a single scalar value is to use the `.at` accessor.

In [50]:
# Scalar lookup by row label and column label.
df.at[dates[0],'A']

0.00079452320726991488

### Selection by Position

Select via the position of the passed integers

In [52]:
# Call form of print works on both Python 2 and Python 3
# (the bare `print df` statement is a SyntaxError on Python 3).
print(df)

                   A         B         C         D
2013-01-01  0.000795  0.076557  0.672962  0.496449
2013-01-02  0.873698  0.626829  0.622865  0.691792
2013-01-03  0.279690  0.151588  0.781833  0.931227
2013-01-04  0.023688  0.518364  0.800006  0.290771
2013-01-05  0.057256  0.878302  0.253035  0.845052
2013-01-06  0.571096  0.195835  0.825220  0.615333


In [53]:
df.iloc[3] # the row at integer position 3 (the fourth row, 2013-01-04)

A    0.023688
B    0.518364
C    0.800006
D    0.290771
Name: 2013-01-04 00:00:00, dtype: float64

In [59]:
# Slice by integer position: rows 3-4 and columns 0-1 (end positions are excluded).
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,0.023688,0.518364
2013-01-05,0.057256,0.878302


You can also pass lists of integer positions, NumPy-style:

In [58]:
# Rows at positions 1, 2 and 4; columns at positions 0 and 2 (A and C).
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,0.873698,0.622865
2013-01-03,0.27969,0.781833
2013-01-05,0.057256,0.253035


In [61]:
df.iloc[1:3,:] # rows at positions 1 and 2, all columns (NumPy-style slicing)

Unnamed: 0,A,B,C,D
2013-01-02,0.873698,0.626829,0.622865,0.691792
2013-01-03,0.27969,0.151588,0.781833,0.931227


In [63]:
df.iloc[:,1:3] # all rows, columns at positions 1 and 2 (B and C)

Unnamed: 0,B,C
2013-01-01,0.076557,0.672962
2013-01-02,0.626829,0.622865
2013-01-03,0.151588,0.781833
2013-01-04,0.518364,0.800006
2013-01-05,0.878302,0.253035
2013-01-06,0.195835,0.82522


Get single scalar values too.

In [64]:
# Scalar at row position 1, column position 1.
df.iloc[1,1]

0.6268293258176223

### Boolean Indexing