# This notebook gives a basic introduction to Pandas

In [1]:
import numpy as np
import pandas as pd

## pandas.Series

In [2]:
s = pd.Series([12, -4, 7, 9])
s

0    12
1    -4
2     7
3     9
dtype: int64

In [3]:
s = pd.Series([12, -4, 7, 9], index=['a', 'b', 'c', 'd'])
s

a    12
b    -4
c     7
d     9
dtype: int64

In [4]:
s.values

array([12, -4,  7,  9])

In [5]:
s.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [6]:
s[2]

7

In [7]:
s['b']

-4

In [8]:
s[0:2]

a    12
b    -4
dtype: int64

In [9]:
s[['b', 'c']]

b   -4
c    7
dtype: int64

In [10]:
s[1] = 0
s

a    12
b     0
c     7
d     9
dtype: int64

In [11]:
s['b'] = 1
s

a    12
b     1
c     7
d     9
dtype: int64

In [12]:
arr = np.array([1, 2, 3, 4])
s3 = pd.Series(arr)
s3

0    1
1    2
2    3
3    4
dtype: int64

In [13]:
s4 = pd.Series(s)
s4

a    12
b     1
c     7
d     9
dtype: int64

In [14]:
s3

0    1
1    2
2    3
3    4
dtype: int64

In [15]:
arr[2] = -2
s3

0    1
1    2
2   -2
3    4
dtype: int64

In [16]:
s[s > 8]

a    12
d     9
dtype: int64

In [17]:
s / 2

a    6.0
b    0.5
c    3.5
d    4.5
dtype: float64

In [18]:
np.log(s)

a    2.484907
b    0.000000
c    1.945910
d    2.197225
dtype: float64

In [19]:
serd = pd.Series([1, 0, 2, 1, 2, 3], index=['white', 'white', 'blue', 'green', 'green', 'yellow'])
serd

white     1
white     0
blue      2
green     1
green     2
yellow    3
dtype: int64

In [20]:
serd.unique()

array([1, 0, 2, 3])

In [21]:
serd.value_counts()

2    2
1    2
3    1
0    1
dtype: int64

In [22]:
serd.isin([0, 3])

white     False
white      True
blue      False
green     False
green     False
yellow     True
dtype: bool

In [23]:
serd[serd.isin([0, 3])]

white     0
yellow    3
dtype: int64

In [24]:
s2 = pd.Series([5, -3, np.NaN, 14])
s2

0     5.0
1    -3.0
2     NaN
3    14.0
dtype: float64

In [25]:
s2.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [26]:
s2.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [27]:
s2[s2.notnull()]

0     5.0
1    -3.0
3    14.0
dtype: float64

In [28]:
s2[s2.isnull()]

2   NaN
dtype: float64

In [29]:
mydict = {'red': 2000, "blue": 1000, "yellow": 500, "orange": 1000}
myseries = pd.Series(mydict)
myseries

blue      1000
orange    1000
red       2000
yellow     500
dtype: int64

In [30]:
colors = ['red', 'yellow', 'orange', 'blue', 'green']
myseries = pd.Series(mydict, index=colors)
myseries

red       2000.0
yellow     500.0
orange    1000.0
blue      1000.0
green        NaN
dtype: float64

In [31]:
mydict2 = {"red": 400, "yellow": 1000, "black": 700}
myseries2 = pd.Series(mydict2)
myseries + myseries2

black        NaN
blue         NaN
green        NaN
orange       NaN
red       2400.0
yellow    1500.0
dtype: float64

##  Pandas.DataFrame

In [32]:
data = {'colors': ['blue', 'green', 'yellow', 'red', 'white'],
        'object': ['ball', 'pen', 'pencil', 'paper', 'mug'],
        'price': [1.2, 1.0, 0.6, 0.9, 1.7]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,colors,object,price
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [33]:
frame2 = pd.DataFrame(data, columns=['object', 'price'])
frame2

Unnamed: 0,object,price
0,ball,1.2
1,pen,1.0
2,pencil,0.6
3,paper,0.9
4,mug,1.7


In [34]:
frame2 = pd.DataFrame(data, index=['one', 'two', 'three', 'four', 'five'])
frame2

Unnamed: 0,colors,object,price
one,blue,ball,1.2
two,green,pen,1.0
three,yellow,pencil,0.6
four,red,paper,0.9
five,white,mug,1.7


In [35]:
frame3 = pd.DataFrame(np.arange(16).reshape((4, 4)),
                     index=['red', 'blue', 'yellow', 'white'],
                     columns=['ball', 'pen', 'pencil', 'paper'])
frame3

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [36]:
frame.columns

Index(['colors', 'object', 'price'], dtype='object')

In [37]:
frame.index

RangeIndex(start=0, stop=5, step=1)

In [38]:
frame.values

array([['blue', 'ball', 1.2],
       ['green', 'pen', 1.0],
       ['yellow', 'pencil', 0.6],
       ['red', 'paper', 0.9],
       ['white', 'mug', 1.7]], dtype=object)

In [39]:
frame['price']

0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64

In [40]:
frame.price

0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64

In [41]:
frame.ix[2]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


colors    yellow
object    pencil
price        0.6
Name: 2, dtype: object

In [42]:
frame.ix[[2, 4]]

Unnamed: 0,colors,object,price
2,yellow,pencil,0.6
4,white,mug,1.7


In [43]:
frame[0:1]

Unnamed: 0,colors,object,price
0,blue,ball,1.2


In [44]:
frame[1:3]

Unnamed: 0,colors,object,price
1,green,pen,1.0
2,yellow,pencil,0.6


In [45]:
frame['object'][3]

'paper'

In [46]:
frame.index.name = 'id'
frame.columns.name = 'item'
frame

item,colors,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [47]:
frame['new'] = 12
frame

item,colors,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,12
1,green,pen,1.0,12
2,yellow,pencil,0.6,12
3,red,paper,0.9,12
4,white,mug,1.7,12


In [48]:
frame['new'] = [3.0, 1.3, 2.2, 0.8, 1.1]
frame

item,colors,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,3.0
1,green,pen,1.0,1.3
2,yellow,pencil,0.6,2.2
3,red,paper,0.9,0.8
4,white,mug,1.7,1.1


In [49]:
ser = pd.Series(np.arange(5))
ser

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [50]:
frame['new'] = ser
frame

item,colors,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,0
1,green,pen,1.0,1
2,yellow,pencil,0.6,2
3,red,paper,0.9,3
4,white,mug,1.7,4


In [51]:
frame['price'][2] = 4.4
frame

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


item,colors,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,0
1,green,pen,1.0,1
2,yellow,pencil,4.4,2
3,red,paper,0.9,3
4,white,mug,1.7,4


In [52]:
frame.loc[2, 'price'] = 3.3
frame

item,colors,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,0
1,green,pen,1.0,1
2,yellow,pencil,3.3,2
3,red,paper,0.9,3
4,white,mug,1.7,4


In [53]:
frame.isin([1.0, 'pen'])

item,colors,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,False,False,False,False
1,False,True,True,True
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False


In [54]:
frame[frame.isin([1.0, 'pen'])]

item,colors,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,,,
1,,pen,1.0,1.0
2,,,,
3,,,,
4,,,,


In [55]:
del frame['new']
frame

item,colors,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,3.3
3,red,paper,0.9
4,white,mug,1.7


In [56]:
frame3[frame3 < 12]

Unnamed: 0,ball,pen,pencil,paper
red,0.0,1.0,2.0,3.0
blue,4.0,5.0,6.0,7.0
yellow,8.0,9.0,10.0,11.0
white,,,,


In [57]:
nesdict = {'red': {2012: 22, 2013: 33},
           'white': {2011: 13, 2012: 22, 2013: 18},
           'blue': {2011: 17, 2012: 27, 2013: 18}}
frame2 = pd.DataFrame(nesdict)
frame2

Unnamed: 0,blue,red,white
2011,17,,13
2012,27,22.0,22
2013,18,33.0,18


In [58]:
frame2.T

Unnamed: 0,2011,2012,2013
blue,17.0,27.0,18.0
red,,22.0,33.0
white,13.0,22.0,18.0


In [59]:
ser = pd.Series([5, 0, 3, 8, 4], index=['red', 'blue', 'yellow', 'white', 'green'])
ser.index

Index(['red', 'blue', 'yellow', 'white', 'green'], dtype='object')

In [60]:
ser.idxmin()

'blue'

In [61]:
ser.idxmax()

'white'

In [62]:
serd = pd.Series(range(6), index=['white', 'white', 'blue', 'green', 'green', 'yellow'])
serd

white     0
white     1
blue      2
green     3
green     4
yellow    5
dtype: int64

In [63]:
serd['white']

white    0
white    1
dtype: int64

In [64]:
serd.index.is_unique    

False

In [65]:
frame.index.is_unique

True

In [66]:
ser = pd.Series([2, 5, 7, 4], index=['one', 'two', 'three', 'four'])
ser

one      2
two      5
three    7
four     4
dtype: int64

In [67]:
ser.reindex(['three', 'four', 'five', 'one'])

three    7.0
four     4.0
five     NaN
one      2.0
dtype: float64

In [68]:
ser3 = pd.Series([1, 5, 6, 3], index=[0, 3, 5, 6])
ser3

0    1
3    5
5    6
6    3
dtype: int64

In [69]:
ser3.reindex(range(6), method='ffill')

0    1
1    1
2    1
3    5
4    5
5    6
dtype: int64

In [70]:
ser3.reindex(range(6), method='bfill')

0    1
1    5
2    5
3    5
4    6
5    6
dtype: int64

In [71]:
frame.reindex(range(5), method='ffill', columns=['colors', 'price', 'new', 'object'])

item,colors,price,new,object
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,1.2,blue,ball
1,green,1.0,green,pen
2,yellow,3.3,yellow,pencil
3,red,0.9,red,paper
4,white,1.7,white,mug


In [3]:
import pandas as pd
import numpy as np
ser = pd.Series(np.arange(4.), index=['red', 'blue', 'yellow', 'white'])
ser

red       0.0
blue      1.0
yellow    2.0
white     3.0
dtype: float64

In [4]:
ser.drop('yellow')

red      0.0
blue     1.0
white    3.0
dtype: float64

In [5]:
ser.drop(['blue', 'white'])

red       0.0
yellow    2.0
dtype: float64

In [8]:
frame = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['red', 'blue', 'yellow', 'white'],
                    columns=['ball', 'pen', 'pencil', 'paper'])
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [9]:
frame.drop(['blue', 'yellow'])

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
white,12,13,14,15


In [10]:
frame.drop(['pen', 'pencil'], axis=1)

Unnamed: 0,ball,paper
red,0,3
blue,4,7
yellow,8,11
white,12,15


In [11]:
s1 = pd.Series([3, 2, 5, 1], ['white', 'yellow', 'green', 'blue'])
s2 = pd.Series([1, 4, 7, 2, 1], ['white', 'yellow', 'black', 'blue', 'brown'])
s1 + s2

black     NaN
blue      3.0
brown     NaN
green     NaN
white     4.0
yellow    6.0
dtype: float64

In [13]:
frame1 = pd.DataFrame(np.arange(16).reshape((4,  4)),
                     index=['red', 'blue', 'yellow', 'white'],
                     columns=['ball', 'pen', 'pencil', 'paper'])

In [16]:
frame2 = pd.DataFrame(np.arange(12).reshape((4, 3)),
                      index=['blue', 'green', 'white', 'yellow'],
                      columns=['mug', 'pen', 'ball'])

In [17]:
frame1

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [18]:
frame2

Unnamed: 0,mug,pen,ball
blue,0,1,2
green,3,4,5
white,6,7,8
yellow,9,10,11


In [19]:
frame1 + frame2

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6.0,,,6.0,
green,,,,,
red,,,,,
white,20.0,,,20.0,
yellow,19.0,,,19.0,


In [20]:
frame.add(frame2)

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6.0,,,6.0,
green,,,,,
red,,,,,
white,20.0,,,20.0,
yellow,19.0,,,19.0,


In [22]:
frame = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['red', 'blue', 'yellow', 'white'],
                    columns=['ball', 'pen', 'pencil', 'paper'])
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [24]:
ser = pd.Series(np.arange(4), index=['ball', 'pen', 'pencil', 'paper'])

ser

ball      0
pen       1
pencil    2
paper     3
dtype: int32

In [25]:
frame - ser

Unnamed: 0,ball,pen,pencil,paper
red,0,0,0,0
blue,4,4,4,4
yellow,8,8,8,8
white,12,12,12,12


In [26]:
ser['mug'] = 9
ser

ball      0
pen       1
pencil    2
paper     3
mug       9
dtype: int64

In [28]:
frame - ser

Unnamed: 0,ball,mug,paper,pen,pencil
red,0,,0,0,0
blue,4,,4,4,4
yellow,8,,8,8,8
white,12,,12,12,12


In [29]:
frame = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['red', 'blue', 'yellow', 'white'],
                    columns=['ball', 'pen', 'pencil', 'paper'])
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [30]:
np.sqrt(frame)

Unnamed: 0,ball,pen,pencil,paper
red,0.0,1.0,1.414214,1.732051
blue,2.0,2.236068,2.44949,2.645751
yellow,2.828427,3.0,3.162278,3.316625
white,3.464102,3.605551,3.741657,3.872983


In [35]:
# f = lambda x: x.max() - x.min()
def f(x):
    return x.max() - x.min()
frame.apply(f)

ball      12
pen       12
pencil    12
paper     12
dtype: int64

In [32]:
frame.apply(f, axis=1)

red       3
blue      3
yellow    3
white     3
dtype: int64

In [36]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)

Unnamed: 0,ball,pen,pencil,paper
min,0,1,2,3
max,12,13,14,15


In [37]:
frame.sum()

ball      24
pen       28
pencil    32
paper     36
dtype: int64

In [38]:
frame.mean()

ball      6.0
pen       7.0
pencil    8.0
paper     9.0
dtype: float64

In [39]:
frame.describe()

Unnamed: 0,ball,pen,pencil,paper
count,4.0,4.0,4.0,4.0
mean,6.0,7.0,8.0,9.0
std,5.163978,5.163978,5.163978,5.163978
min,0.0,1.0,2.0,3.0
25%,3.0,4.0,5.0,6.0
50%,6.0,7.0,8.0,9.0
75%,9.0,10.0,11.0,12.0
max,12.0,13.0,14.0,15.0


In [40]:
ser = pd.Series([5, 0, 3, 8, 4], index=['red', 'blue', 'yellow', 'white', 'green'])
ser

red       5
blue      0
yellow    3
white     8
green     4
dtype: int64

In [41]:
ser.sort_index()

blue      0
green     4
red       5
white     8
yellow    3
dtype: int64

In [42]:
ser.sort_index(ascending=False)

yellow    3
white     8
red       5
green     4
blue      0
dtype: int64

In [43]:
frame = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['red', 'blue', 'yellow', 'white'],
                    columns=['ball', 'pen', 'pencil', 'paper'])
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [44]:
frame.sort_index()

Unnamed: 0,ball,pen,pencil,paper
blue,4,5,6,7
red,0,1,2,3
white,12,13,14,15
yellow,8,9,10,11


In [45]:
frame.sort_index(axis=1)

Unnamed: 0,ball,paper,pen,pencil
red,0,3,1,2
blue,4,7,5,6
yellow,8,11,9,10
white,12,15,13,14


In [47]:
frame.sort_index(by='pen')

  """Entry point for launching an IPython kernel.


Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [48]:
frame.sort_values(by='pen')

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [56]:
frame.sort_values(by=['pen', 'pencil'])

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [64]:
ser

red       5
blue      0
yellow    3
white     8
green     4
dtype: int64

In [57]:
ser.rank()

red       4.0
blue      1.0
yellow    2.0
white     5.0
green     3.0
dtype: float64

In [58]:
ser.rank(method='first')

red       4.0
blue      1.0
yellow    2.0
white     5.0
green     3.0
dtype: float64

In [59]:
ser.rank(ascending=False)

red       2.0
blue      5.0
yellow    4.0
white     1.0
green     3.0
dtype: float64

In [60]:
seq2 = pd.Series([3, 4, 3, 4, 5, 4, 3, 2],
                ['2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013'])

In [61]:
seq = pd.Series([1, 2, 3, 4, 4, 3, 2, 1],
                ['2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013'])

In [62]:
seq.corr(seq2)

0.7745966692414835

In [63]:
seq.cov(seq2)

0.8571428571428571