In [37]:
import pandas as pd
import numpy as np

## pandas Tutorial

### Object Creation

Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns:

In [39]:
# Row labels for the demo frame: six consecutive calendar days starting 2013-01-01.
dates = pd.date_range(start='20130101', periods=6, freq='D')
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [40]:
# Demo frame: 6 rows (one per entry in `dates`) x 4 columns of standard-normal draws.
# A seeded Generator keeps the values identical across "Restart Kernel -> Run All";
# the previous unseeded call produced different numbers on every run.
# NOTE: outputs saved before this change came from an unseeded run and will differ
# until the notebook is re-executed.
rng = np.random.default_rng(seed=0)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.194805,-0.101566,0.458155,-0.205729
2013-01-02,1.172319,0.364864,0.597333,0.061895
2013-01-03,-1.170474,0.160079,-0.072855,1.036738
2013-01-04,0.323912,-0.092184,-0.132386,0.425272
2013-01-05,0.935816,-0.766005,-0.242956,-0.129967
2013-01-06,0.132909,0.823678,1.033614,1.402489


Creating a DataFrame by passing a dict of objects that can be converted into a series-like structure:

In [41]:
# Build a frame from a dict whose values have mixed types. The scalar entries
# ('A', 'B', 'F') are repeated on each of the four rows given by the array-like entries.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


### View Data

View top and bottom rows of the frame.

In [42]:
# head() shows at most the first five rows by default; df2 has only four, so the whole frame is displayed.
df2.head()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [43]:
# tail() shows the last five rows by default, so the first of df's six rows is omitted.
df.tail()

Unnamed: 0,A,B,C,D
2013-01-02,1.172319,0.364864,0.597333,0.061895
2013-01-03,-1.170474,0.160079,-0.072855,1.036738
2013-01-04,0.323912,-0.092184,-0.132386,0.425272
2013-01-05,0.935816,-0.766005,-0.242956,-0.129967
2013-01-06,0.132909,0.823678,1.033614,1.402489


View index and columns

In [44]:
# Row labels of df: the DatetimeIndex it was constructed with.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [45]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

Show a quick statistical summary of your data:

In [46]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.199946,0.064811,0.273484,0.431783
std,0.841517,0.53246,0.503621,0.658344
min,-1.170474,-0.766005,-0.242956,-0.205729
25%,-0.112877,-0.099221,-0.117504,-0.082001
50%,0.22841,0.033948,0.19265,0.243583
75%,0.78284,0.313668,0.562539,0.883871
max,1.172319,0.823678,1.033614,1.402489


Transpose your data

In [47]:
# Swap rows and columns: the dates become column labels and A-D become row labels.
df.transpose()

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.194805,1.172319,-1.170474,0.323912,0.935816,0.132909
B,-0.101566,0.364864,0.160079,-0.092184,-0.766005,0.823678
C,0.458155,0.597333,-0.072855,-0.132386,-0.242956,1.033614
D,-0.205729,0.061895,1.036738,0.425272,-0.129967,1.402489


Sorting by an axis

In [48]:
# Arrange the column labels in descending order (D, C, B, A); the rows stay as they are.
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.205729,0.458155,-0.101566,-0.194805
2013-01-02,0.061895,0.597333,0.364864,1.172319
2013-01-03,1.036738,-0.072855,0.160079,-1.170474
2013-01-04,0.425272,-0.132386,-0.092184,0.323912
2013-01-05,-0.129967,-0.242956,-0.766005,0.935816
2013-01-06,1.402489,1.033614,0.823678,0.132909


Sorting by values

In [49]:
# Reorder the rows so that the values in column C increase from top to bottom.
df.sort_values('C', ascending=True)

Unnamed: 0,A,B,C,D
2013-01-05,0.935816,-0.766005,-0.242956,-0.129967
2013-01-04,0.323912,-0.092184,-0.132386,0.425272
2013-01-03,-1.170474,0.160079,-0.072855,1.036738
2013-01-01,-0.194805,-0.101566,0.458155,-0.205729
2013-01-02,1.172319,0.364864,0.597333,0.061895
2013-01-06,0.132909,0.823678,1.033614,1.402489


### Selection

In [50]:
# Selecting a single column with [] returns it as a Series.
df['A']

2013-01-01   -0.194805
2013-01-02    1.172319
2013-01-03   -1.170474
2013-01-04    0.323912
2013-01-05    0.935816
2013-01-06    0.132909
Freq: D, Name: A, dtype: float64

Selecting via [], which slices the rows.

In [51]:
# A positional slice inside [] picks rows: here the first three.
df[:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.194805,-0.101566,0.458155,-0.205729
2013-01-02,1.172319,0.364864,0.597333,0.061895
2013-01-03,-1.170474,0.160079,-0.072855,1.036738


In [52]:
# Slicing [] with date strings selects rows by label; both endpoints are included.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,1.172319,0.364864,0.597333,0.061895
2013-01-03,-1.170474,0.160079,-0.072855,1.036738
2013-01-04,0.323912,-0.092184,-0.132386,0.425272


### Selection by label

Selecting on a multi-axis by label:

In [53]:
# All rows (bare ':'), restricted to the columns labelled 'A' and 'B'.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.194805,-0.101566
2013-01-02,1.172319,0.364864
2013-01-03,-1.170474,0.160079
2013-01-04,0.323912,-0.092184
2013-01-05,0.935816,-0.766005
2013-01-06,0.132909,0.823678


In [54]:
# Label slice on the rows (both endpoints included) combined with a list of column labels.
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,1.172319,0.364864
2013-01-03,-1.170474,0.160079
2013-01-04,0.323912,-0.092184


For getting a scalar value:

In [55]:
# A single row label plus a single column label returns one scalar value.
df.loc[dates[0], 'A']

-0.1948054526800816

### Selection by position

In [56]:
# A single integer selects the row at that position (the fourth row) and returns it as a Series.
df.iloc[3]

A    0.323912
B   -0.092184
C   -0.132386
D    0.425272
Name: 2013-01-04 00:00:00, dtype: float64

By integer slices, acting similar to numpy/python:

In [57]:
# Rows at positions 3-4 and columns at positions 0-1; the stop of each slice is excluded.
df.iloc[3:5, :2]

Unnamed: 0,A,B
2013-01-04,0.323912,-0.092184
2013-01-05,0.935816,-0.766005


In [58]:
# Lists of integer positions select specific rows (1, 2, 4) and columns (0, 2).
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,1.172319,0.597333
2013-01-03,-1.170474,-0.072855
2013-01-05,0.935816,-0.242956


For slicing rows explicitly:

In [59]:
# Rows at positions 1 and 2; the bare ':' keeps every column.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,1.172319,0.364864,0.597333,0.061895
2013-01-03,-1.170474,0.160079,-0.072855,1.036738


For slicing columns explicitly:

In [60]:
 df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,-0.101566,0.458155
2013-01-02,0.364864,0.597333
2013-01-03,0.160079,-0.072855
2013-01-04,-0.092184,-0.132386
2013-01-05,-0.766005,-0.242956
2013-01-06,0.823678,1.033614


### Boolean indexing

Using a single column’s values to select data.

In [61]:
# Keep only the rows where column A is strictly positive.
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2013-01-02,1.172319,0.364864,0.597333,0.061895
2013-01-04,0.323912,-0.092184,-0.132386,0.425272
2013-01-05,0.935816,-0.766005,-0.242956,-0.129967
2013-01-06,0.132909,0.823678,1.033614,1.402489


Using the isin() method for filtering:

In [62]:
# Derive a new frame from df with an extra string column E; df itself is left untouched.
df2 = df.assign(E=['one', 'one', 'two', 'three', 'four', 'three'])
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.194805,-0.101566,0.458155,-0.205729,one
2013-01-02,1.172319,0.364864,0.597333,0.061895,one
2013-01-03,-1.170474,0.160079,-0.072855,1.036738,two
2013-01-04,0.323912,-0.092184,-0.132386,0.425272,three
2013-01-05,0.935816,-0.766005,-0.242956,-0.129967,four
2013-01-06,0.132909,0.823678,1.033614,1.402489,three


In [63]:
# isin() builds a boolean mask that is True where E is 'two' or 'four'; the mask then filters the rows.
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-1.170474,0.160079,-0.072855,1.036738,two
2013-01-05,0.935816,-0.766005,-0.242956,-0.129967,four


### Setting

Setting values by label:

In [64]:
# Overwrite one cell in place, addressed by row label and column label.
df.at[dates[0], 'A'] = 0

Setting values by position:

In [65]:
# Overwrite one cell in place, addressed by integer position (first row, second column).
df.iat[0, 1] = 0

Setting by assigning with a NumPy array:

In [66]:
# Replace every value in column D with 5, using an array that has one entry per row of df.
df.loc[:, 'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,0.458155,5
2013-01-02,1.172319,0.364864,0.597333,5
2013-01-03,-1.170474,0.160079,-0.072855,5
2013-01-04,0.323912,-0.092184,-0.132386,5
2013-01-05,0.935816,-0.766005,-0.242956,5
2013-01-06,0.132909,0.823678,1.033614,5


A where operation with setting.

In [36]:
# Negate every positive entry of a copy of df; df itself is not modified.
# NOTE(review): the saved output and execution count of this cell look stale (they do
# not match df as shown by the preceding cell) — re-run the notebook top to bottom.
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.665441,-5
2013-01-02,-0.130102,-0.300801,-0.260372,-5
2013-01-03,-0.697951,-0.553108,-0.234588,-5
2013-01-04,-1.123217,-0.506176,-0.326812,-5
2013-01-05,-1.231492,-0.087117,-0.626522,-5
2013-01-06,-1.237526,-2.344292,-0.399688,-5
