# 10 Minutes to pandas

In [21]:
import pandas as pd

In [22]:
import numpy as np

In [23]:
import matplotlib.pyplot as plt

## Object Creation

Create a **Series** by passing a list of values, letting pandas create a default integer index: 

In [24]:
# Six-element Series; the NaN stands in for a missing value and the index is the default 0..5.
s = pd.Series(data=[1, 2, 3, np.nan, 6, 8])

In [25]:
print(s)  # text dump of the Series (note the NaN entry and the float64 dtype)

0     1
1     2
2     3
3   NaN
4     6
5     8
dtype: float64


 Create a **DataFrame** by passing a numpy array, with a datetime index and labeled columns:

In [26]:
# Six consecutive daily timestamps beginning 2013-01-01; used below as a row index.
dates = pd.date_range(start='20130101', periods=6)

In [27]:
print(dates)  # show the six-day DatetimeIndex built in the previous cell

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


In [28]:
# 6x4 frame of random samples: one row per entry of `dates`, columns labeled 'A' to 'D'.
# No seed is set in the visible cells, so the numbers differ from run to run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

In [29]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.191978,-0.823399,-0.305615,0.941054
2013-01-02,-0.052918,0.488237,1.962247,0.602895
2013-01-03,1.170447,-0.01714,0.473393,-0.298883
2013-01-04,1.35414,-0.804008,-0.269903,0.608692
2013-01-05,-0.423381,0.454897,-0.709064,-0.675499
2013-01-06,0.930289,-0.014755,-1.062659,-0.05519


Create a DataFrame by passing a dict of objects that can each be converted to a series-like object:

In [30]:
# Each dict value becomes one column; scalars ('A', 'B', 'F') are repeated for all
# four rows, while 'C', 'D' and 'E' supply four values each with an explicit dtype.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)

In [31]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-02,1,3,test,foo
1,1,2013-01-02,1,3,train,foo
2,1,2013-01-02,1,3,test,foo
3,1,2013-01-02,1,3,train,foo


In [32]:
df2.dtypes # get the types of data for each column

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data

In [33]:
print(df)  # plain-text view of the whole frame

                   A         B         C         D
2013-01-01 -0.191978 -0.823399 -0.305615  0.941054
2013-01-02 -0.052918  0.488237  1.962247  0.602895
2013-01-03  1.170447 -0.017140  0.473393 -0.298883
2013-01-04  1.354140 -0.804008 -0.269903  0.608692
2013-01-05 -0.423381  0.454897 -0.709064 -0.675499
2013-01-06  0.930289 -0.014755 -1.062659 -0.055190


In [34]:
df.head() # top rows of the dataframe

Unnamed: 0,A,B,C,D
2013-01-01,-0.191978,-0.823399,-0.305615,0.941054
2013-01-02,-0.052918,0.488237,1.962247,0.602895
2013-01-03,1.170447,-0.01714,0.473393,-0.298883
2013-01-04,1.35414,-0.804008,-0.269903,0.608692
2013-01-05,-0.423381,0.454897,-0.709064,-0.675499


In [35]:
df.tail(3) # last three rows of the dataframe

Unnamed: 0,A,B,C,D
2013-01-04,1.35414,-0.804008,-0.269903,0.608692
2013-01-05,-0.423381,0.454897,-0.709064,-0.675499
2013-01-06,0.930289,-0.014755,-1.062659,-0.05519


Display the index, columns and the underlying numpy data

In [36]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [37]:
df.columns

Index([u'A', u'B', u'C', u'D'], dtype='object')

In [38]:
df.values

array([[-0.19197788, -0.82339851, -0.30561477,  0.94105443],
       [-0.05291761,  0.4882373 ,  1.9622469 ,  0.60289541],
       [ 1.17044692, -0.01713985,  0.47339316, -0.29888284],
       [ 1.35414003, -0.80400824, -0.26990334,  0.60869202],
       [-0.42338132,  0.45489678, -0.70906446, -0.67549948],
       [ 0.93028921, -0.01475461, -1.06265915, -0.05518991]])

You can also get a quick statistical summary of your data:

In [39]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.464433,-0.119361,0.014733,0.187178
std,0.773796,0.580474,1.083649,0.625785
min,-0.423381,-0.823399,-1.062659,-0.675499
25%,-0.157213,-0.607291,-0.608202,-0.23796
50%,0.438686,-0.015947,-0.287759,0.273853
75%,1.110407,0.337484,0.287569,0.607243
max,1.35414,0.488237,1.962247,0.941054


In [40]:
df.T # transpose of the data (doesn't alter dataframe)

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.191978,-0.052918,1.170447,1.35414,-0.423381,0.930289
B,-0.823399,0.488237,-0.01714,-0.804008,0.454897,-0.014755
C,-0.305615,1.962247,0.473393,-0.269903,-0.709064,-1.062659
D,0.941054,0.602895,-0.298883,0.608692,-0.675499,-0.05519


In [41]:
print(df)  # df itself is unchanged by the transpose shown in the previous cell

                   A         B         C         D
2013-01-01 -0.191978 -0.823399 -0.305615  0.941054
2013-01-02 -0.052918  0.488237  1.962247  0.602895
2013-01-03  1.170447 -0.017140  0.473393 -0.298883
2013-01-04  1.354140 -0.804008 -0.269903  0.608692
2013-01-05 -0.423381  0.454897 -0.709064 -0.675499
2013-01-06  0.930289 -0.014755 -1.062659 -0.055190


In [42]:
df.sort_index(axis=1, ascending=False) # sort the columns by label, in descending order (D, C, B, A)

Unnamed: 0,D,C,B,A
2013-01-01,0.941054,-0.305615,-0.823399,-0.191978
2013-01-02,0.602895,1.962247,0.488237,-0.052918
2013-01-03,-0.298883,0.473393,-0.01714,1.170447
2013-01-04,0.608692,-0.269903,-0.804008,1.35414
2013-01-05,-0.675499,-0.709064,0.454897,-0.423381
2013-01-06,-0.05519,-1.062659,-0.014755,0.930289


In [43]:
df.sort_values(by='B') #sort values by col B

Unnamed: 0,A,B,C,D
2013-01-01,-0.191978,-0.823399,-0.305615,0.941054
2013-01-04,1.35414,-0.804008,-0.269903,0.608692
2013-01-03,1.170447,-0.01714,0.473393,-0.298883
2013-01-06,0.930289,-0.014755,-1.062659,-0.05519
2013-01-05,-0.423381,0.454897,-0.709064,-0.675499
2013-01-02,-0.052918,0.488237,1.962247,0.602895


## Selection

Standard Python/NumPy expressions for selecting and setting data come in handy for interactive work, but for production code it is advised to use the optimized pandas data access methods `.at`, `.iat`, `.loc` and `.iloc` (the older `.ix` indexer is deprecated and has been removed from recent pandas releases). You can find more about these in the indexing documentation.

### Getting

In [44]:
df['A'] # get a single column of the dataframe

2013-01-01   -0.191978
2013-01-02   -0.052918
2013-01-03    1.170447
2013-01-04    1.354140
2013-01-05   -0.423381
2013-01-06    0.930289
Freq: D, Name: A, dtype: float64

In [45]:
df.A # equivalent to df['A']

2013-01-01   -0.191978
2013-01-02   -0.052918
2013-01-03    1.170447
2013-01-04    1.354140
2013-01-05   -0.423381
2013-01-06    0.930289
Freq: D, Name: A, dtype: float64

We can also slice rows with NumPy-style `[start:stop]` slicing:

In [46]:
df[0:3] # rows at positions 0, 1 and 2 (the stop position 3 is excluded)

Unnamed: 0,A,B,C,D
2013-01-01,-0.191978,-0.823399,-0.305615,0.941054
2013-01-02,-0.052918,0.488237,1.962247,0.602895
2013-01-03,1.170447,-0.01714,0.473393,-0.298883


We can also use the index to slice the dataframe

In [47]:
df['20130102':'20130104'] # label slice on the date index; both end dates are included

Unnamed: 0,A,B,C,D
2013-01-02,-0.052918,0.488237,1.962247,0.602895
2013-01-03,1.170447,-0.01714,0.473393,-0.298883
2013-01-04,1.35414,-0.804008,-0.269903,0.608692


### Selecting by label

In [48]:
print(df)  # reference view of the frame used in the label-based examples

                   A         B         C         D
2013-01-01 -0.191978 -0.823399 -0.305615  0.941054
2013-01-02 -0.052918  0.488237  1.962247  0.602895
2013-01-03  1.170447 -0.017140  0.473393 -0.298883
2013-01-04  1.354140 -0.804008 -0.269903  0.608692
2013-01-05 -0.423381  0.454897 -0.709064 -0.675499
2013-01-06  0.930289 -0.014755 -1.062659 -0.055190


In [49]:
df.loc[dates[0]] # gets a cross-section using a label

A   -0.191978
B   -0.823399
C   -0.305615
D    0.941054
Name: 2013-01-01 00:00:00, dtype: float64

Selecting on a multi-axis by label

In [50]:
df.loc[:,['A','B']] # get all rows for cols A and B

Unnamed: 0,A,B
2013-01-01,-0.191978,-0.823399
2013-01-02,-0.052918,0.488237
2013-01-03,1.170447,-0.01714
2013-01-04,1.35414,-0.804008
2013-01-05,-0.423381,0.454897
2013-01-06,0.930289,-0.014755


The same works with a slice of date labels for the rows. Notice that both endpoints are included.

In [51]:
df.loc['20130102':'20130105',['A','B']]

Unnamed: 0,A,B
2013-01-02,-0.052918,0.488237
2013-01-03,1.170447,-0.01714
2013-01-04,1.35414,-0.804008
2013-01-05,-0.423381,0.454897


If a single label is selected along an axis, the returned object has one fewer dimension. For example:

In [52]:
df.loc['20130103',['A','B']] # Dimensional reduction

A    1.170447
B   -0.017140
Name: 2013-01-03 00:00:00, dtype: float64

In [53]:
df.loc['20130103','A']

1.1704469211400672

An even faster way to get a single scalar value is to use the `at` method.

In [54]:
df.at[dates[0],'A']

-0.19197788117028716

### Selection by Position

Select via the position of the passed integers

In [55]:
print(df)  # reference view of the frame used in the position-based examples

                   A         B         C         D
2013-01-01 -0.191978 -0.823399 -0.305615  0.941054
2013-01-02 -0.052918  0.488237  1.962247  0.602895
2013-01-03  1.170447 -0.017140  0.473393 -0.298883
2013-01-04  1.354140 -0.804008 -0.269903  0.608692
2013-01-05 -0.423381  0.454897 -0.709064 -0.675499
2013-01-06  0.930289 -0.014755 -1.062659 -0.055190


In [56]:
df.iloc[3] # gets the row at integer position 3 (the fourth row, 2013-01-04)

A    1.354140
B   -0.804008
C   -0.269903
D    0.608692
Name: 2013-01-04 00:00:00, dtype: float64

In [57]:
df.iloc[3:5,0:2] # slice by integer position: rows 3 and 4, columns 0 and 1
# (the stop value of each slice is excluded)

Unnamed: 0,A,B
2013-01-04,1.35414,-0.804008
2013-01-05,-0.423381,0.454897


You can also pass lists of integer positions, in the style of NumPy:

In [58]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.052918,1.962247
2013-01-03,1.170447,0.473393
2013-01-05,-0.423381,-0.709064


In [59]:
df.iloc[1:3,:] #similar row slicing indexing to numpy

Unnamed: 0,A,B,C,D
2013-01-02,-0.052918,0.488237,1.962247,0.602895
2013-01-03,1.170447,-0.01714,0.473393,-0.298883


In [60]:
df.iloc[:,1:3] #similar col slicing indexing to numpy

Unnamed: 0,B,C
2013-01-01,-0.823399,-0.305615
2013-01-02,0.488237,1.962247
2013-01-03,-0.01714,0.473393
2013-01-04,-0.804008,-0.269903
2013-01-05,0.454897,-0.709064
2013-01-06,-0.014755,-1.062659


Get single scalar values too.

In [61]:
df.iloc[1,1]

0.48823730474455074

### Boolean Indexing

In [62]:
print(df)  # reference view of the frame used in the boolean-indexing examples

                   A         B         C         D
2013-01-01 -0.191978 -0.823399 -0.305615  0.941054
2013-01-02 -0.052918  0.488237  1.962247  0.602895
2013-01-03  1.170447 -0.017140  0.473393 -0.298883
2013-01-04  1.354140 -0.804008 -0.269903  0.608692
2013-01-05 -0.423381  0.454897 -0.709064 -0.675499
2013-01-06  0.930289 -0.014755 -1.062659 -0.055190


In [64]:
df[df.A > 0] # Select the rows where the A col is positive

Unnamed: 0,A,B,C,D
2013-01-03,1.170447,-0.01714,0.473393,-0.298883
2013-01-04,1.35414,-0.804008,-0.269903,0.608692
2013-01-06,0.930289,-0.014755,-1.062659,-0.05519


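Selecting values from the whole DataFrame by a boolean condition is called a *where* operation; entries that fail the condition become NaN: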

In [65]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,,0.941054
2013-01-02,,0.488237,1.962247,0.602895
2013-01-03,1.170447,,0.473393,
2013-01-04,1.35414,,,0.608692
2013-01-05,,0.454897,,
2013-01-06,0.930289,,,


In [66]:
df_2 = df.copy()

In [67]:
df_2['E'] = ['one','one','two', 'three', 'four', 'three']

In [68]:
print(df_2)  # the copy now carries the extra string column 'E'

                   A         B         C         D      E
2013-01-01 -0.191978 -0.823399 -0.305615  0.941054    one
2013-01-02 -0.052918  0.488237  1.962247  0.602895    one
2013-01-03  1.170447 -0.017140  0.473393 -0.298883    two
2013-01-04  1.354140 -0.804008 -0.269903  0.608692  three
2013-01-05 -0.423381  0.454897 -0.709064 -0.675499   four
2013-01-06  0.930289 -0.014755 -1.062659 -0.055190  three


In [70]:
df_2[df_2['E'].isin(['two', 'four'])] # use isin to get subset of the data meeting criteria

Unnamed: 0,A,B,C,D,E
2013-01-03,1.170447,-0.01714,0.473393,-0.298883,two
2013-01-05,-0.423381,0.454897,-0.709064,-0.675499,four


## Setting

Setting a new column automatically aligns the data by the indexes

In [83]:
# Values 1..6 indexed by six consecutive days starting 2013-01-02.
s1 = pd.Series(
    data=list(range(1, 7)),
    index=pd.date_range(start='20130102', periods=6),
)

In [88]:
print(s1)  # note: s1 has one date that is not in df (2013-01-07) and lacks df's first date (2013-01-01)

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64


In [85]:
df['F'] = s1

In [89]:
print(df)  # alignment is by index: rows of df whose date is absent from s1 get NaN in column F

                   A         B         C         D   F
2013-01-01 -0.191978 -0.823399 -0.305615  0.941054 NaN
2013-01-02 -0.052918  0.488237  1.962247  0.602895   1
2013-01-03  1.170447 -0.017140  0.473393 -0.298883   2
2013-01-04  1.354140 -0.804008 -0.269903  0.608692   3
2013-01-05 -0.423381  0.454897 -0.709064 -0.675499   4
2013-01-06  0.930289 -0.014755 -1.062659 -0.055190   5


Setting values by label

In [90]:
df.at[dates[0],'A'] = 0

Setting values by position

In [91]:
df.iat[0,1] = 0 # 0th row and 1st column

In [92]:
print(df)  # the first row now has 0 in columns A and B

                   A         B         C         D   F
2013-01-01  0.000000  0.000000 -0.305615  0.941054 NaN
2013-01-02 -0.052918  0.488237  1.962247  0.602895   1
2013-01-03  1.170447 -0.017140  0.473393 -0.298883   2
2013-01-04  1.354140 -0.804008 -0.269903  0.608692   3
2013-01-05 -0.423381  0.454897 -0.709064 -0.675499   4
2013-01-06  0.930289 -0.014755 -1.062659 -0.055190   5


Setting values with a numpy array

In [93]:
# Build an array holding one 5 per row of df and assign it to every row of column D.
df.loc[:,'D'] = np.array([5]*len(df))

In [94]:
print(df)  # column D now holds the value 5 in every row

                   A         B         C  D   F
2013-01-01  0.000000  0.000000 -0.305615  5 NaN
2013-01-02 -0.052918  0.488237  1.962247  5   1
2013-01-03  1.170447 -0.017140  0.473393  5   2
2013-01-04  1.354140 -0.804008 -0.269903  5   3
2013-01-05 -0.423381  0.454897 -0.709064  5   4
2013-01-06  0.930289 -0.014755 -1.062659  5   5


In [95]:
df2 = df.copy() # work on a copy so df keeps its values; note this rebinds the df2 created in the Object Creation section

In [98]:
df2[df2 > 0] = -df2 # where-style assignment: every positive entry is replaced by its negation

In [97]:
print(df2)  # no positive values remain; zeros, negatives and NaN are untouched

                   A         B         C  D   F
2013-01-01  0.000000  0.000000 -0.305615 -5 NaN
2013-01-02 -0.052918 -0.488237 -1.962247 -5  -1
2013-01-03 -1.170447 -0.017140 -0.473393 -5  -2
2013-01-04 -1.354140 -0.804008 -0.269903 -5  -3
2013-01-05 -0.423381 -0.454897 -0.709064 -5  -4
2013-01-06 -0.930289 -0.014755 -1.062659 -5  -5
