In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Wrap four floats in a Series; with no index given, pandas labels them 0..3.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
)
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [3]:
# Swap the default integer labels for the string labels 'a'..'d'; values are untouched.
data.index = list('abcd')

In [4]:
# Display the Series again so the string index labels are visible.
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [5]:
# Look up a single value by its index label, dictionary-style.
data['b']

0.5

In [6]:
# State name -> population figure, kept as a plain dict.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}

In [7]:
# A dict converts to a Series whose index labels are the dict's keys.
population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [8]:
# Fetch one state's value by its index label.
population['California']

38332521

In [9]:
# Slice by label: unlike positional slicing, the end label ('Illinois') is included.
population['California':'Illinois']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [10]:
# Column name -> list of that column's values.
d = {
    'col1': [1, 2],
    'col2': [3, 4],
}

In [11]:
# Each dict key becomes a column; rows receive the default integer index.
df = pd.DataFrame(d)
df

Unnamed: 0,col1,col2
0,1,3
1,2,4


In [12]:
# Seed NumPy's global RNG before the first random draw so that this cell, and
# any later np.random call in the same Restart & Run All, yields the same
# numbers every time. The seed value 0 is arbitrary.
np.random.seed(0)

# 5x5 frame of random integers drawn from [0, 10), with labelled columns.
df2 = pd.DataFrame(
    np.random.randint(low=0, high=10, size=(5, 5)),
    columns=['a', 'b', 'c', 'd', 'e'],
)
df2

Unnamed: 0,a,b,c,d,e
0,5,8,2,1,9
1,3,6,9,7,4
2,8,1,6,6,7
3,3,2,8,3,6
4,7,0,3,9,6


In [13]:
# State name -> area figure, keyed by the same five states as population_dict.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}

In [14]:
# Turn the dict into a Series indexed by state name.
area = pd.Series(data=area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [15]:
# Combine the two Series as columns of one frame; rows share the state-name index.
states = pd.DataFrame(
    {
        'population': population,
        'area': area,
    }
)
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [16]:
# Six consecutive daily timestamps starting at 2013-01-01.
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [17]:
# 6x4 frame of standard-normal samples: one row per date, columns A through D.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.023166,1.191846,0.087222,-0.642594
2013-01-02,-1.126814,-0.692703,-1.068516,-0.175954
2013-01-03,0.215869,1.062277,-0.275198,0.211816
2013-01-04,-1.57996,-0.315162,-0.053091,0.835802
2013-01-05,-0.262442,0.132687,-1.215122,-1.990174
2013-01-06,1.470501,-0.630076,1.19037,0.878303


In [18]:
# Preview the first rows of the frame (head() defaults to five rows).
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-1.023166,1.191846,0.087222,-0.642594
2013-01-02,-1.126814,-0.692703,-1.068516,-0.175954
2013-01-03,0.215869,1.062277,-0.275198,0.211816
2013-01-04,-1.57996,-0.315162,-0.053091,0.835802
2013-01-05,-0.262442,0.132687,-1.215122,-1.990174


In [19]:
# Preview the last three rows of the frame.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-1.57996,-0.315162,-0.053091,0.835802
2013-01-05,-0.262442,0.132687,-1.215122,-1.990174
2013-01-06,1.470501,-0.630076,1.19037,0.878303


In [20]:
# Row labels of the frame (here the DatetimeIndex it was built with).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [21]:
# Column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [22]:
# The frame's data as a plain NumPy array, without index or column labels.
df.values

array([[-1.02316563,  1.19184638,  0.08722244, -0.6425945 ],
       [-1.12681436, -0.69270341, -1.06851616, -0.17595437],
       [ 0.21586869,  1.06227745, -0.2751983 ,  0.21181633],
       [-1.57995976, -0.3151623 , -0.0530914 ,  0.83580226],
       [-0.26244212,  0.13268695, -1.21512235, -1.99017391],
       [ 1.47050143, -0.630076  ,  1.19037009,  0.87830319]])

In [23]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.384335,0.124812,-0.222389,-0.147134
std,1.11395,0.83057,0.874603,1.07607
min,-1.57996,-0.692703,-1.215122,-1.990174
25%,-1.100902,-0.551348,-0.870187,-0.525934
50%,-0.642804,-0.091238,-0.164145,0.017931
75%,0.096291,0.82988,0.052144,0.679806
max,1.470501,1.191846,1.19037,0.878303


In [24]:
# Reorder the columns by label in descending order (D, C, B, A); rows keep their order.
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.642594,0.087222,1.191846,-1.023166
2013-01-02,-0.175954,-1.068516,-0.692703,-1.126814
2013-01-03,0.211816,-0.275198,1.062277,0.215869
2013-01-04,0.835802,-0.053091,-0.315162,-1.57996
2013-01-05,-1.990174,-1.215122,0.132687,-0.262442
2013-01-06,0.878303,1.19037,-0.630076,1.470501


In [25]:
# Reorder the rows by ascending value of column B; returns a new frame.
df.sort_values('B')

Unnamed: 0,A,B,C,D
2013-01-02,-1.126814,-0.692703,-1.068516,-0.175954
2013-01-06,1.470501,-0.630076,1.19037,0.878303
2013-01-04,-1.57996,-0.315162,-0.053091,0.835802
2013-01-05,-0.262442,0.132687,-1.215122,-1.990174
2013-01-03,0.215869,1.062277,-0.275198,0.211816
2013-01-01,-1.023166,1.191846,0.087222,-0.642594


In [26]:
# Select the row labelled with the first date; the result is a Series indexed by column name.
df.loc[dates[0]]

A   -1.023166
B    1.191846
C    0.087222
D   -0.642594
Name: 2013-01-01 00:00:00, dtype: float64

In [27]:
# Select every row, but only columns A and B, by label.
df.loc[:,['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-1.023166,1.191846
2013-01-02,-1.126814,-0.692703
2013-01-03,0.215869,1.062277
2013-01-04,-1.57996,-0.315162
2013-01-05,-0.262442,0.132687
2013-01-06,1.470501,-0.630076


In [28]:
# Label-based row slice: both end dates (2013-01-02 and 2013-01-04) are included.
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-1.126814,-0.692703
2013-01-03,0.215869,1.062277
2013-01-04,-1.57996,-0.315162


In [29]:
# Select the row at integer position 3 (the fourth row), regardless of its label.
df.iloc[3]

A   -1.579960
B   -0.315162
C   -0.053091
D    0.835802
Name: 2013-01-04 00:00:00, dtype: float64

In [30]:
# Positional slice: rows 3-4 and columns 0-1; the end positions are excluded.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-1.57996,-0.315162
2013-01-05,-0.262442,0.132687


In [31]:
# Keep only the positive entries; cells that fail the condition become NaN.
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,,1.191846,0.087222,
2013-01-02,,,,
2013-01-03,0.215869,1.062277,,0.211816
2013-01-04,,,,0.835802
2013-01-05,,0.132687,,
2013-01-06,1.470501,,1.19037,0.878303


In [32]:
# Work on a copy so that adding a column below does not modify df.
df2 = df.copy()

In [33]:
# Add a string column E, one label per row in row order.
df2['E'] = [
    'one',
    'one',
    'two',
    'three',
    'four',
    'three',
]
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.023166,1.191846,0.087222,-0.642594,one
2013-01-02,-1.126814,-0.692703,-1.068516,-0.175954,one
2013-01-03,0.215869,1.062277,-0.275198,0.211816,two
2013-01-04,-1.57996,-0.315162,-0.053091,0.835802,three
2013-01-05,-0.262442,0.132687,-1.215122,-1.990174,four
2013-01-06,1.470501,-0.630076,1.19037,0.878303,three


In [34]:
# Keep only the rows whose E value is one of the listed labels.
df2.loc[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.215869,1.062277,-0.275198,0.211816,two
2013-01-05,-0.262442,0.132687,-1.215122,-1.990174,four


In [35]:
# Integers 1..6 indexed by six consecutive days starting 2013-01-02.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20130102', periods=6),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [36]:
# Assigning a Series as a column aligns it on the index: dates in df that are
# missing from s1 get NaN, and s1 entries for dates not in df are dropped.
df['F']=s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.023166,1.191846,0.087222,-0.642594,
2013-01-02,-1.126814,-0.692703,-1.068516,-0.175954,1.0
2013-01-03,0.215869,1.062277,-0.275198,0.211816,2.0
2013-01-04,-1.57996,-0.315162,-0.053091,0.835802,3.0
2013-01-05,-0.262442,0.132687,-1.215122,-1.990174,4.0
2013-01-06,1.470501,-0.630076,1.19037,0.878303,5.0


In [37]:
# Set a single cell, addressed by row label and column label.
df.at[dates[0], 'A']=0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,1.191846,0.087222,-0.642594,
2013-01-02,-1.126814,-0.692703,-1.068516,-0.175954,1.0
2013-01-03,0.215869,1.062277,-0.275198,0.211816,2.0
2013-01-04,-1.57996,-0.315162,-0.053091,0.835802,3.0
2013-01-05,-0.262442,0.132687,-1.215122,-1.990174,4.0
2013-01-06,1.470501,-0.630076,1.19037,0.878303,5.0


In [38]:
# Set a single cell, addressed by integer position (row 0, column 1).
df.iat[0,1]=0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.087222,-0.642594,
2013-01-02,-1.126814,-0.692703,-1.068516,-0.175954,1.0
2013-01-03,0.215869,1.062277,-0.275198,0.211816,2.0
2013-01-04,-1.57996,-0.315162,-0.053091,0.835802,3.0
2013-01-05,-0.262442,0.132687,-1.215122,-1.990174,4.0
2013-01-06,1.470501,-0.630076,1.19037,0.878303,5.0


In [39]:
# Overwrite the whole D column with an array of 5s, one per row.
df.loc[:, 'D']=np.array([5]*len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.087222,5,
2013-01-02,-1.126814,-0.692703,-1.068516,5,1.0
2013-01-03,0.215869,1.062277,-0.275198,5,2.0
2013-01-04,-1.57996,-0.315162,-0.053091,5,3.0
2013-01-05,-0.262442,0.132687,-1.215122,5,4.0
2013-01-06,1.470501,-0.630076,1.19037,5,5.0


In [40]:
# Mean of each column; the result is a Series indexed by column name.
df.mean()

A   -0.213808
B   -0.073830
C   -0.222389
D    5.000000
F    3.000000
dtype: float64

In [41]:
# Mean of each row across the columns; missing values are skipped, not counted.
df.mean(axis=1)

2013-01-01    1.271806
2013-01-02    0.622393
2013-01-03    1.600590
2013-01-04    1.210357
2013-01-05    1.531024
2013-01-06    2.406159
Freq: D, dtype: float64

In [42]:
# Apply np.cumsum to each column, giving a running total down the rows.
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.087222,5,
2013-01-02,-1.126814,-0.692703,-0.981294,10,1.0
2013-01-03,-0.910946,0.369574,-1.256492,15,3.0
2013-01-04,-2.490905,0.054412,-1.309583,20,6.0
2013-01-05,-2.753348,0.187099,-2.524706,25,10.0
2013-01-06,-1.282846,-0.442977,-1.334336,30,15.0


In [43]:
# Per-column spread: the function receives each column and returns max minus min.
df.apply(lambda x: x.max()-x.min())

A    3.050461
B    1.754981
C    2.405492
D    0.000000
F    4.000000
dtype: float64

In [44]:
# Count how many times each distinct value occurs in column A.
df['A'].value_counts()

 0.215869    1
-1.579960    1
 1.470501    1
-0.262442    1
-1.126814    1
 0.000000    1
Name: A, dtype: int64

In [45]:
# Left-hand table for the merge example: a join key plus one value column.
left = pd.DataFrame(
    {
        'key': ['foo', 'bar'],
        'lval': [1, 2],
    }
)

In [46]:
# Right-hand table for the merge example: the same keys with a different value column.
right = pd.DataFrame(
    {
        'key': ['foo', 'bar'],
        'rval': [4, 5],
    }
)

In [47]:
# Display the left-hand table before merging.
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [48]:
# Display the right-hand table before merging.
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [49]:
# Join the two tables on their shared 'key' column (inner join by default).
left.merge(right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


In [50]:
# 8x4 frame of standard-normal samples with the default integer index.
df = pd.DataFrame(
    np.random.randn(8, 4),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
0,0.491524,-0.731737,0.547082,-1.086422
1,-0.212095,0.255316,-0.113716,-0.307371
2,1.737126,1.572913,-0.961664,-2.11472
3,0.151417,-0.262103,-0.586894,-1.020835
4,-0.048012,0.637592,-0.052785,-0.38023
5,1.981229,1.402361,0.48816,-1.297031
6,0.109432,1.482135,-1.80693,1.532583
7,1.22393,0.661127,1.971204,-0.86419


In [51]:
# Take the row at integer position 3 as a Series.
s = df.iloc[3]

In [52]:
# Add the row `s` to the end of df and renumber the index 0..n.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# Series is turned into a one-row frame (to_frame().T) and joined with
# pd.concat, which works on both old and new pandas. df itself is not modified.
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,0.491524,-0.731737,0.547082,-1.086422
1,-0.212095,0.255316,-0.113716,-0.307371
2,1.737126,1.572913,-0.961664,-2.11472
3,0.151417,-0.262103,-0.586894,-1.020835
4,-0.048012,0.637592,-0.052785,-0.38023
5,1.981229,1.402361,0.48816,-1.297031
6,0.109432,1.482135,-1.80693,1.532583
7,1.22393,0.661127,1.971204,-0.86419
8,0.151417,-0.262103,-0.586894,-1.020835


In [53]:
# Frame for the group-by examples: two string key columns (A, B) and two
# columns of standard-normal samples (C, D).
df = pd.DataFrame(
    {
        'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'bar'],
        'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
        'C': np.random.randn(8),
        'D': np.random.randn(8),
    }
)

In [54]:
# Display the frame that the group-by cells below operate on.
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.703943,-1.231377
1,bar,one,-0.048658,-0.458262
2,foo,two,-0.550848,0.823262
3,bar,three,-1.267292,-0.149166
4,foo,two,-0.15014,0.907801
5,bar,two,-2.08215,0.936584
6,foo,one,0.214885,0.327745
7,bar,three,0.250993,-0.363159


In [55]:
# groupby on its own returns a DataFrameGroupBy object, not a result table;
# an aggregation such as sum() must be called on it to get values.
df.groupby('A')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001DDC23BF668>

In [56]:
# Sum the numeric columns (C, D) within each group of column A.
# numeric_only=True keeps the string column B out of the result. Since
# pandas 2.0, sum() no longer drops non-numeric columns silently and would
# otherwise concatenate B's strings per group.
df.groupby('A').sum(numeric_only=True)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-3.147107,-0.034003
foo,-1.190046,0.827432


In [57]:
# Group by the (A, B) pair and sum the remaining columns; the result has a
# two-level row index made of the A and B values.
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.048658,-0.458262
bar,three,-1.016299,-0.512325
bar,two,-2.08215,0.936584
foo,one,-0.489058,-0.903631
foo,two,-0.700988,1.731064
