# Pandas Series

In [1]:
import pandas as pd
import numpy as np

In [2]:
obj = pd.Series([1,2,3,4,5])

In [3]:
obj

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [4]:
obj.values

array([1, 2, 3, 4, 5], dtype=int64)

In [5]:
obj.index

RangeIndex(start=0, stop=5, step=1)

In [6]:
obj2 = pd.Series([1,2,3,4,5],index=['d','e','a','b','c'])

In [7]:
obj2.values

array([1, 2, 3, 4, 5], dtype=int64)

In [8]:
obj2.index

Index(['d', 'e', 'a', 'b', 'c'], dtype='object')

In [9]:
obj2

d    1
e    2
a    3
b    4
c    5
dtype: int64

In [10]:
obj[obj>2]

2    3
3    4
4    5
dtype: int64

In [11]:
# Only the last expression of a cell is displayed: the ndarray built on the
# next line is discarded, and the Series itself is what gets shown.
np.array(obj)
obj

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [12]:
np.exp(obj2)

d      2.718282
e      7.389056
a     20.085537
b     54.598150
c    148.413159
dtype: float64

In [13]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
sdata

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [14]:
obj3 = pd.Series(sdata)

In [15]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [16]:
states = ['cali','Ohio','Texas','Oregon']

In [17]:
obj4 = pd.Series(sdata,index = states)

In [18]:
pd.isnull(obj4)

cali       True
Ohio      False
Texas     False
Oregon    False
dtype: bool

In [19]:
pd.notnull(obj4)

cali      False
Ohio       True
Texas      True
Oregon     True
dtype: bool

In [20]:
obj4.isnull()

cali       True
Ohio      False
Texas     False
Oregon    False
dtype: bool

In [21]:
obj3+obj4

Ohio       70000.0
Oregon     32000.0
Texas     142000.0
Utah           NaN
cali           NaN
dtype: float64

In [22]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [23]:
obj3.name = "population"
obj3.index.name = "states"

In [24]:
print(obj3.name)
print(obj3.index.name)

population
states


In [25]:
obj3.index

Index(['Ohio', 'Texas', 'Oregon', 'Utah'], dtype='object', name='states')

In [26]:
obj3.index = ["state1","state2","state3","state4"]

In [27]:
obj3

state1    35000
state2    71000
state3    16000
state4     5000
Name: population, dtype: int64

# Pandas DataFrame

In [28]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [29]:
data

{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [30]:
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [31]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [32]:
frame[["state","year"]]

Unnamed: 0,state,year
0,Ohio,2000
1,Ohio,2001
2,Ohio,2002
3,Nevada,2001
4,Nevada,2002
5,Nevada,2003


In [33]:
frame.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [34]:
frame.loc[1]

state    Ohio
year     2001
pop       1.7
Name: 1, dtype: object

In [35]:
frame2 = pd.DataFrame(data,columns=['year','state','pop','debt'])


In [36]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [37]:
# Assign the column through [] rather than attribute access. `frame2.debt = ...`
# only updates a column because 'debt' already exists; for a name that is not
# yet a column it would set a plain Python attribute instead of creating one.
frame2["debt"] = [1.6, 1.7, 1.8, 1.9, 2.0, 3.0]

In [38]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,1.6
1,2001,Ohio,1.7,1.7
2,2002,Ohio,3.6,1.8
3,2001,Nevada,2.4,1.9
4,2002,Nevada,2.9,2.0
5,2003,Nevada,3.2,3.0


In [39]:
frame2["Eastern"]= frame2["state"] == "Ohio"

In [40]:
frame2

Unnamed: 0,year,state,pop,debt,Eastern
0,2000,Ohio,1.5,1.6,True
1,2001,Ohio,1.7,1.7,True
2,2002,Ohio,3.6,1.8,True
3,2001,Nevada,2.4,1.9,False
4,2002,Nevada,2.9,2.0,False
5,2003,Nevada,3.2,3.0,False


In [41]:
del frame2["Eastern"]

In [42]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,1.6
1,2001,Ohio,1.7,1.7
2,2002,Ohio,3.6,1.8
3,2001,Nevada,2.4,1.9
4,2002,Nevada,2.9,2.0
5,2003,Nevada,3.2,3.0


In [43]:
frame2["new"] = [1,2,3,4,5,6]

In [44]:
del frame2["new"]

In [45]:
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,1.6
1,2001,Ohio,1.7,1.7
2,2002,Ohio,3.6,1.8
3,2001,Nevada,2.4,1.9
4,2002,Nevada,2.9,2.0
5,2003,Nevada,3.2,3.0


In [46]:
frame2.index.name = "Record"
frame2.columns.name = "columns"

In [47]:
frame2

columns,year,state,pop,debt
Record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2000,Ohio,1.5,1.6
1,2001,Ohio,1.7,1.7
2,2002,Ohio,3.6,1.8
3,2001,Nevada,2.4,1.9
4,2002,Nevada,2.9,2.0
5,2003,Nevada,3.2,3.0


In [48]:
frame2.values

array([[2000, 'Ohio', 1.5, 1.6],
       [2001, 'Ohio', 1.7, 1.7],
       [2002, 'Ohio', 3.6, 1.8],
       [2001, 'Nevada', 2.4, 1.9],
       [2002, 'Nevada', 2.9, 2.0],
       [2003, 'Nevada', 3.2, 3.0]], dtype=object)

In [49]:
frame2.index

RangeIndex(start=0, stop=6, step=1, name='Record')

In [50]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object', name='columns')

In [51]:
a = pd.Series([1,2,3,4,5],index=['a','d','e','b','c'])

### Reindexing

In [52]:
a

a    1
d    2
e    3
b    4
c    5
dtype: int64

In [53]:
b = a.reindex(['a','b','c','d','e','f'])

In [54]:
b

a    1.0
b    4.0
c    5.0
d    2.0
e    3.0
f    NaN
dtype: float64

In [55]:
frame2 = frame.reindex([3,4,5,0,1,2])
frame2

Unnamed: 0,state,year,pop
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6


In [56]:
frame2.iloc[[1,2,3]]

Unnamed: 0,state,year,pop
4,Nevada,2002,2.9
5,Nevada,2003,3.2
0,Ohio,2000,1.5


In [57]:
frame.loc[[1,2,3]]

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4


In [58]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [59]:
obj3 = pd.Series(['blue', 'purple', 'yellow'],index = [0,3,5])

In [60]:
obj3.reindex(range(6),method='ffill')

0      blue
1      blue
2      blue
3    purple
4    purple
5    yellow
dtype: object

In [61]:
# 4x4 frame holding the integers 0..15, rows labelled by state and columns
# labelled by ordinal words.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
)

In [62]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [63]:
data.loc[['Ohio'],['three']]

Unnamed: 0,three
Ohio,2


In [64]:
data.drop('one',axis = 1)

Unnamed: 0,two,three,four
Ohio,1,2,3
Colorado,5,6,7
Utah,9,10,11
New York,13,14,15


In [65]:
# Persist the column removal by rebinding `data` to the frame returned by drop()
# instead of mutating it with inplace=True; the resulting `data` is the same.
data = data.drop(columns='one')

In [66]:
data

Unnamed: 0,two,three,four
Ohio,1,2,3
Colorado,5,6,7
Utah,9,10,11
New York,13,14,15


### Indexing, Selection, and Filtering

In [67]:
a = pd.Series(np.arange(7),index=['a','b','c','d','e','f','a'])

In [68]:
a

a    0
b    1
c    2
d    3
e    4
f    5
a    6
dtype: int32

In [69]:
a['a']

a    0
a    6
dtype: int32

In [70]:
# Positional access spelled explicitly. Plain a[6] on a string-labelled index
# relies on the "integer key means position" fallback, which is deprecated in
# pandas 2.1+; .iloc returns the same element without that ambiguity.
a.iloc[6]

6

In [71]:
# Rows at positions 1 through 5 (the end of a positional slice is exclusive)
a.iloc[1:6]

b    1
c    2
d    3
e    4
f    5
dtype: int32

In [72]:
a[['a','b','c']]

a    0
a    6
b    1
c    2
dtype: int32

In [73]:
# Select by position with .iloc: a list of integers passed to plain [] on a
# string-labelled Series is treated as positions only through a fallback that
# is deprecated in pandas 2.1+.
a.iloc[[1, 2, 3]]

b    1
c    2
d    3
dtype: int32

In [74]:
a[a<2]

a    0
b    1
dtype: int32

In [75]:
a['b':'e'] = 5

In [76]:
a['a'] = 5

In [77]:
a

a    5
b    5
c    5
d    5
e    5
f    5
a    5
dtype: int32

In [78]:
# 5x5 frame of the integers 0..24 with generated row labels r1..r5 and
# column labels c1..c5.
b = pd.DataFrame(
    np.arange(25).reshape(5, 5),
    index=[f'r{i}' for i in range(1, 6)],
    columns=[f'c{j}' for j in range(1, 6)],
)

In [79]:
b

Unnamed: 0,c1,c2,c3,c4,c5
r1,0,1,2,3,4
r2,5,6,7,8,9
r3,10,11,12,13,14
r4,15,16,17,18,19
r5,20,21,22,23,24


In [80]:
b.loc['r1':'r3',['c1','c3']]

Unnamed: 0,c1,c3
r1,0,2
r2,5,7
r3,10,12


In [81]:
b.iloc[0:3,0:3]

Unnamed: 0,c1,c2,c3
r1,0,1,2
r2,5,6,7
r3,10,11,12


In [82]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

In [83]:
# Pick rows and columns in a single .loc call instead of chaining
# data.iloc[...][mask]: rows where 'three' exceeds 3, first three columns.
data.loc[data['three'] > 3, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [84]:
ser = pd.Series(np.arange(3.), index=['a','b','c'])

In [85]:
ser

a    0.0
b    1.0
c    2.0
dtype: float64

In [86]:
# Last element by position. ser[-1] only works here because the index is made of
# strings (the integer falls back to a position, deprecated in pandas 2.1+);
# .iloc states the intent and behaves the same for any index type.
ser.iloc[-1]

2.0

### Arithmetic and Data Alignment

In [87]:
# Two float Series whose labels only partly overlap ('a', 'c', 'e' are shared)
s1 = pd.Series(data=[7.3, -2.5, 3.4, 1.5], index=list('acde'))
s2 = pd.Series(data=[-2.1, 3.6, -1.5, 4, 3.1], index=list('acefg'))

In [88]:
print(s1,s2)

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64 a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64


In [89]:
s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [90]:
# Two float frames whose row labels and column labels only partly overlap
df1 = pd.DataFrame(
    np.arange(9, dtype=float).reshape(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)
df2 = pd.DataFrame(
    np.arange(12, dtype=float).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)


In [91]:
df1.add(df2 , fill_value=0 )

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


In [92]:
frame = pd.DataFrame(np.arange(12).reshape(4,3),columns=list('bde'),index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [93]:
# First row of `frame` as a Series.
# NOTE(review): `series` is rebound to frame['d'] two cells further down before
# this value is used anywhere, so this assignment currently has no effect.
series = frame.iloc[0]

In [94]:
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [95]:
series = frame['d']
series

Utah       1
Ohio       4
Texas      7
Oregon    10
Name: d, dtype: int32

In [96]:
# NOTE(review): `series` is the 'd' column, so its index holds the row labels.
# With axis=1 that index is aligned against the column labels; nothing matches,
# and the result is all-NaN over the union of both label sets. axis=0 (as used
# by the later frame.add call) is presumably what was intended — confirm.
frame.sub(series,axis = 1)

Unnamed: 0,Ohio,Oregon,Texas,Utah,b,d,e
Utah,,,,,,,
Ohio,,,,,,,
Texas,,,,,,,
Oregon,,,,,,,


In [97]:
def f(x):
    """Return the spread (maximum minus minimum) of the values in `x`."""
    return x.max() - x.min()

# axis=1 is the numeric spelling of axis="columns": f receives one row at a time
frame.apply(f, axis=1)

Utah      2
Ohio      2
Texas     2
Oregon    2
dtype: int64

In [98]:
frame.add(series,axis=0)

Unnamed: 0,b,d,e
Utah,1,2,3
Ohio,7,8,9
Texas,13,14,15
Oregon,19,20,21


In [99]:
frame = frame * -1.5

In [100]:
frame = np.abs(frame)
frame

Unnamed: 0,b,d,e
Utah,0.0,1.5,3.0
Ohio,4.5,6.0,7.5
Texas,9.0,10.5,12.0
Oregon,13.5,15.0,16.5


In [101]:
# np.sqrt is a ufunc and operates on a whole DataFrame in one vectorized call,
# returning a DataFrame with the same labels. The per-element applymap (slow,
# and deprecated in pandas 2.1+) is therefore unnecessary.
fun = np.sqrt
fun(frame)

Unnamed: 0,b,d,e
Utah,0.0,1.224745,1.732051
Ohio,2.12132,2.44949,2.738613
Texas,3.0,3.24037,3.464102
Oregon,3.674235,3.872983,4.062019


### Ranking and Sorting


In [102]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),index=['ashwin', 'ajith'],columns=['d', 'a', 'b', 'c'])

In [103]:
frame

Unnamed: 0,d,a,b,c
ashwin,0,1,2,3
ajith,4,5,6,7


In [104]:
frame.sort_index(axis=1,ascending=False)

Unnamed: 0,d,c,b,a
ashwin,0,3,2,1
ajith,4,7,6,5


In [105]:
data = {'a':[1,2,3,4,5],'b':[6,7,8,9,0]}

In [106]:
df = pd.DataFrame(data)

In [107]:
df

Unnamed: 0,a,b
0,1,6
1,2,7
2,3,8
3,4,9
4,5,0


In [108]:
df.sort_values(by = ['b','a'])

Unnamed: 0,a,b
4,5,0
0,1,6
1,2,7
2,3,8
3,4,9


In [109]:
se = pd.Series([1,2,3,6,6,88,88])
se

0     1
1     2
2     3
3     6
4     6
5    88
6    88
dtype: int64

In [110]:
se.rank(method='first')

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
6    7.0
dtype: float64

In [111]:
# 6x5 frame built from 30 random integers drawn from [0, 50).
# NOTE(review): no random seed is set anywhere above, so these values (and every
# output derived from frame3 below) change on each fresh run of the notebook.
frame3 = pd.DataFrame(np.random.randint(0,50,30).reshape(6,5))

In [112]:
frame3

Unnamed: 0,0,1,2,3,4
0,18,7,21,29,3
1,15,12,36,17,32
2,42,23,28,14,23
3,25,32,17,31,15
4,25,1,32,34,43
5,8,32,11,6,24


In [113]:
frame3.sort_index(ascending=False , axis= 1)

Unnamed: 0,4,3,2,1,0
0,3,29,21,7,18
1,32,17,36,12,15
2,23,14,28,23,42
3,15,31,17,32,25
4,43,34,32,1,25
5,24,6,11,32,8


In [114]:
frame3.rank(axis = 1)

Unnamed: 0,0,1,2,3,4
0,3.0,2.0,4.0,5.0,1.0
1,2.0,1.0,5.0,3.0,4.0
2,5.0,2.5,4.0,1.0,2.5
3,3.0,5.0,2.0,4.0,1.0
4,2.0,1.0,3.0,4.0,5.0
5,2.0,5.0,3.0,1.0,4.0


In [115]:
# Small float frame with missing values, specified column by column
df = pd.DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
)

In [116]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [117]:
df.mean()

one    3.083333
two   -2.900000
dtype: float64

In [118]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [119]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [120]:
df.idxmin()

one    d
two    b
dtype: object

In [121]:
# Fractional change from column 'one' to column 'two' within each row; the first
# column has no predecessor, so it is NaN everywhere.
# NOTE(review): the 0.0 shown for row 'a' (whose 'two' is NaN) presumably comes
# from missing values being forward-filled before the change is computed —
# confirm against the installed pandas version, as that default is deprecated
# in pandas 2.1+.
df.pct_change(axis = 1)

Unnamed: 0,one,two
a,,0.0
b,,-1.633803
c,,
d,,-2.733333


In [124]:
frame3

Unnamed: 0,0,1,2,3,4
0,18,7,21,29,3
1,15,12,36,17,32
2,42,23,28,14,23
3,25,32,17,31,15
4,25,1,32,34,43
5,8,32,11,6,24


In [126]:
frame3.corr()

Unnamed: 0,0,1,2,3,4
0,1.0,-0.024608,0.348572,0.207003,0.048268
1,-0.024608,1.0,-0.69719,-0.532581,-0.300802
2,0.348572,-0.69719,1.0,0.24265,0.567784
3,0.207003,-0.532581,0.24265,1.0,-0.04271
4,0.048268,-0.300802,0.567784,-0.04271,1.0


In [127]:
frame3.cov()

Unnamed: 0,0,1,2,3,4
0,135.766667,-3.766667,38.566667,26.833333,7.733333
1,-3.766667,172.566667,-86.966667,-77.833333,-54.333333
2,38.566667,-86.966667,90.166667,25.633333,74.133333
3,26.833333,-77.833333,25.633333,123.766667,-6.533333
4,7.733333,-54.333333,74.133333,-6.533333,189.066667


In [131]:
frame3.cov()

Unnamed: 0,0,1,2,3,4
0,135.766667,-3.766667,38.566667,26.833333,7.733333
1,-3.766667,172.566667,-86.966667,-77.833333,-54.333333
2,38.566667,-86.966667,90.166667,25.633333,74.133333
3,26.833333,-77.833333,25.633333,123.766667,-6.533333
4,7.733333,-54.333333,74.133333,-6.533333,189.066667


In [139]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a','d','o'])

In [140]:
unique_vals = pd.Series(['c', 'b', 'a'])

In [141]:
# For each value in to_match, look up its position within unique_vals;
# values that do not occur there ('d' and 'o') are reported as -1.
pd.Index(unique_vals).get_indexer(to_match)

array([ 0,  2,  1,  1,  0,  2, -1, -1], dtype=int64)