## Pandas

Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool,
built on top of the Python programming language.

1. Series


2. DataFrame


3. Missing Data


4. GroupBy


5. Merging, Joining, Concatenating


6. Operations


7. Data Input and Output

#### Pandas Series

A Series is similar to a NumPy array. The difference is that a Series can have axis labels, so its values can be looked up by label as well as by integer position.

In [1]:
import numpy as np
import pandas as pd

### Create Series using numpy arrays, list, or dictionary

In [2]:
# Shared sample inputs for the Series-construction cells below.
labels = list('abc')                  # axis labels: ['a', 'b', 'c']
my_list = [10, 20, 30]                # plain Python list of values
arr = np.array([100, 200, 300])       # NumPy array of values
d = dict(zip(labels, my_list))        # label -> value: {'a': 10, 'b': 20, 'c': 30}

##### Using lists

In [3]:
pd.Series(data=my_list)

0    10
1    20
2    30
dtype: int64

In [4]:
pd.Series(my_list,labels)

a    10
b    20
c    30
dtype: int64

##### Using numpy arrays

In [5]:
pd.Series(arr,labels)

a    100
b    200
c    300
dtype: int64

##### Using dictionary

In [6]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

##### Type of data: a Series can hold any Python object, even functions

In [7]:
pd.Series([sum,print,len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

##### Using index

In [8]:
# Two labelled Series whose indexes overlap on three names only:
# 'Ankita' exists just in ser1 and 'Viplav' just in ser2.
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index='Sushil Gaurav Bhagyalaxmi Ankita'.split(),
)
ser2 = pd.Series(
    data=[21, 22, 23, 24],
    index='Sushil Gaurav Bhagyalaxmi Viplav'.split(),
)

In [9]:
ser1

Sushil         1
Gaurav         2
Bhagyalaxmi    3
Ankita         4
dtype: int64

In [10]:
ser2

Sushil         21
Gaurav         22
Bhagyalaxmi    23
Viplav         24
dtype: int64

In [11]:
ser1+ser2

Ankita          NaN
Bhagyalaxmi    26.0
Gaurav         24.0
Sushil         22.0
Viplav          NaN
dtype: float64

In [12]:
ser1['Gaurav']

2

#### Dataframes

In [13]:
# NOTE(review): repository housekeeping, not part of the pandas walkthrough.
# `--all` stages every change in the working tree; consider running this from a terminal instead.
!git add --all

In [14]:
# NOTE(review): repository housekeeping, not part of the pandas walkthrough.
# Re-running the notebook creates a new commit each time there are staged changes.
!git commit -m "pandas"

[master d71d798] pandas
 1 file changed, 12 insertions(+), 10 deletions(-)


### Dataframes

#### The DataFrame is the workhorse of pandas and is directly inspired by the R programming language.

In [15]:
# Seed NumPy's global RNG so the random frame, and every output derived from
# it, is reproducible on Restart & Run All. The displayed values refresh to
# the seeded ones the next time the notebook is executed.
np.random.seed(101)

# 5 rows (A-E) x 4 columns (W-Z) of standard-normal samples.
df = pd.DataFrame(
    np.random.randn(5, 4),
    index='A B C D E'.split(),
    columns='W X Y Z'.split(),
)

In [17]:
df

Unnamed: 0,W,X,Y,Z
A,-0.476757,-0.241681,-0.727437,0.484378
B,1.143756,-0.025936,-0.628773,0.834933
C,0.49109,-0.398178,0.737334,0.092209
D,1.782505,1.620772,0.399676,-0.16566
E,1.414397,-0.380957,-0.221823,-0.987242


In [18]:
'A B C D E'.split()

['A', 'B', 'C', 'D', 'E']

##### Selection and Indexing

In [25]:
df['W'] # Recommended syntax

A   -0.476757
B    1.143756
C    0.491090
D    1.782505
E    1.414397
Name: W, dtype: float64

In [24]:
df.W # Attribute (dot) access - not recommended: a column named like a DataFrame method (e.g. 'sum') cannot be reached this way

A   -0.476757
B    1.143756
C    0.491090
D    1.782505
E    1.414397
Name: W, dtype: float64

In [21]:
df[['W','Z']]

Unnamed: 0,W,Z
A,-0.476757,0.484378
B,1.143756,0.834933
C,0.49109,0.092209
D,1.782505,-0.16566
E,1.414397,-0.987242


In [26]:
type(df['W'])

pandas.core.series.Series

In [38]:
df['sum']=df['W']+df['X']

In [29]:
df

Unnamed: 0,W,X,Y,Z,sum
A,-0.476757,-0.241681,-0.727437,0.484378,-0.718438
B,1.143756,-0.025936,-0.628773,0.834933,1.11782
C,0.49109,-0.398178,0.737334,0.092209,0.092912
D,1.782505,1.620772,0.399676,-0.16566,3.403277
E,1.414397,-0.380957,-0.221823,-0.987242,1.03344


In [30]:
df['sum']

A   -0.718438
B    1.117820
C    0.092912
D    3.403277
E    1.033440
Name: sum, dtype: float64

In [31]:
# Attribute access resolves to the DataFrame.sum method, not the 'sum' column;
# use df['sum'] to get the column.
df.sum

<bound method DataFrame.sum of           W         X         Y         Z       sum
A -0.476757 -0.241681 -0.727437  0.484378 -0.718438
B  1.143756 -0.025936 -0.628773  0.834933  1.117820
C  0.491090 -0.398178  0.737334  0.092209  0.092912
D  1.782505  1.620772  0.399676 -0.165660  3.403277
E  1.414397 -0.380957 -0.221823 -0.987242  1.033440>

#### Remove columns

In [33]:
# drop() returns a new frame without the column; df itself is left untouched.
df.drop(columns='sum')

Unnamed: 0,W,X,Y,Z
A,-0.476757,-0.241681,-0.727437,0.484378
B,1.143756,-0.025936,-0.628773,0.834933
C,0.49109,-0.398178,0.737334,0.092209
D,1.782505,1.620772,0.399676,-0.16566
E,1.414397,-0.380957,-0.221823,-0.987242


In [34]:
df

Unnamed: 0,W,X,Y,Z,sum
A,-0.476757,-0.241681,-0.727437,0.484378,-0.718438
B,1.143756,-0.025936,-0.628773,0.834933,1.11782
C,0.49109,-0.398178,0.737334,0.092209,0.092912
D,1.782505,1.620772,0.399676,-0.16566,3.403277
E,1.414397,-0.380957,-0.221823,-0.987242,1.03344


In [35]:
# inplace=True mutates df itself and returns None, so this cell displays nothing.
df.drop('sum',axis=1,inplace=True)

In [39]:
# The in-place drop above removed 'sum' from df. Re-create it explicitly so
# this cell's output and the drop examples that follow work on a fresh
# top-to-bottom run, instead of relying on an out-of-order re-execution
# of the earlier assignment cell.
df['sum'] = df['W'] + df['X']
df

Unnamed: 0,W,X,Y,Z,sum
A,-0.476757,-0.241681,-0.727437,0.484378,-0.718438
B,1.143756,-0.025936,-0.628773,0.834933,1.11782
C,0.49109,-0.398178,0.737334,0.092209,0.092912
D,1.782505,1.620772,0.399676,-0.16566,3.403277
E,1.414397,-0.380957,-0.221823,-0.987242,1.03344


In [40]:
df.drop('sum',axis=1)

Unnamed: 0,W,X,Y,Z
A,-0.476757,-0.241681,-0.727437,0.484378
B,1.143756,-0.025936,-0.628773,0.834933
C,0.49109,-0.398178,0.737334,0.092209
D,1.782505,1.620772,0.399676,-0.16566
E,1.414397,-0.380957,-0.221823,-0.987242


In [41]:
df

Unnamed: 0,W,X,Y,Z,sum
A,-0.476757,-0.241681,-0.727437,0.484378,-0.718438
B,1.143756,-0.025936,-0.628773,0.834933,1.11782
C,0.49109,-0.398178,0.737334,0.092209,0.092912
D,1.782505,1.620772,0.399676,-0.16566,3.403277
E,1.414397,-0.380957,-0.221823,-0.987242,1.03344


In [42]:
df = df.drop('sum',axis=1)
df

Unnamed: 0,W,X,Y,Z
A,-0.476757,-0.241681,-0.727437,0.484378
B,1.143756,-0.025936,-0.628773,0.834933
C,0.49109,-0.398178,0.737334,0.092209
D,1.782505,1.620772,0.399676,-0.16566
E,1.414397,-0.380957,-0.221823,-0.987242


#### Selecting rows

In [43]:
df.loc['A']

W   -0.476757
X   -0.241681
Y   -0.727437
Z    0.484378
Name: A, dtype: float64

In [46]:
df.iloc[0]

W   -0.476757
X   -0.241681
Y   -0.727437
Z    0.484378
Name: A, dtype: float64

In [47]:
df.loc[['A','B'],['W','Z']]

Unnamed: 0,W,Z
A,-0.476757,0.484378
B,1.143756,0.834933


##### Conditional selection

In [48]:
df

Unnamed: 0,W,X,Y,Z
A,-0.476757,-0.241681,-0.727437,0.484378
B,1.143756,-0.025936,-0.628773,0.834933
C,0.49109,-0.398178,0.737334,0.092209
D,1.782505,1.620772,0.399676,-0.16566
E,1.414397,-0.380957,-0.221823,-0.987242


In [49]:
df>0

Unnamed: 0,W,X,Y,Z
A,False,False,False,True
B,True,False,False,True
C,True,False,True,True
D,True,True,True,False
E,True,False,False,False


In [50]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,,,,0.484378
B,1.143756,,,0.834933
C,0.49109,,0.737334,0.092209
D,1.782505,1.620772,0.399676,
E,1.414397,,,


In [51]:
df[df>0]['Z']

A    0.484378
B    0.834933
C    0.092209
D         NaN
E         NaN
Name: Z, dtype: float64

In [52]:
countries = 'USA China India UK Scotland'.split()
countries

['USA', 'China', 'India', 'UK', 'Scotland']

In [53]:
df['countries']=countries

In [54]:
df

Unnamed: 0,W,X,Y,Z,countries
A,-0.476757,-0.241681,-0.727437,0.484378,USA
B,1.143756,-0.025936,-0.628773,0.834933,China
C,0.49109,-0.398178,0.737334,0.092209,India
D,1.782505,1.620772,0.399676,-0.16566,UK
E,1.414397,-0.380957,-0.221823,-0.987242,Scotland


In [55]:
df.set_index('countries')

Unnamed: 0_level_0,W,X,Y,Z
countries,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
USA,-0.476757,-0.241681,-0.727437,0.484378
China,1.143756,-0.025936,-0.628773,0.834933
India,0.49109,-0.398178,0.737334,0.092209
UK,1.782505,1.620772,0.399676,-0.16566
Scotland,1.414397,-0.380957,-0.221823,-0.987242


### Multi Index

In [56]:
# Outer level (country) and inner level (state code) of the hierarchical
# index; the two lists are paired element by element.
outside = ['India', 'India', 'US', 'US', 'China', 'China']
inside = ['HR', 'TN', 'CA', 'AZ', 'XI', 'WU']

In [57]:
outside

['India', 'India', 'US', 'US', 'China', 'China']

In [58]:
inside

['HR', 'TN', 'CA', 'AZ', 'XI', 'WU']

In [59]:
hidx = list(zip(outside,inside))
hidx

[('India', 'HR'),
 ('India', 'TN'),
 ('US', 'CA'),
 ('US', 'AZ'),
 ('China', 'XI'),
 ('China', 'WU')]

In [61]:
hidx = pd.MultiIndex.from_tuples(hidx)
hidx

MultiIndex([('India', 'HR'),
            ('India', 'TN'),
            (   'US', 'CA'),
            (   'US', 'AZ'),
            ('China', 'XI'),
            ('China', 'WU')],
           )

In [62]:
# Rebind df to a 6x2 frame of random normal samples, indexed by the
# two-level MultiIndex built above.
df = pd.DataFrame(
    data=np.random.randn(6, 2),
    index=hidx,
    columns=['Total', 'Recovered'],
)

In [63]:
df

Unnamed: 0,Unnamed: 1,Total,Recovered
India,HR,-1.253166,-1.194912
India,TN,-0.680913,0.772337
US,CA,1.200627,0.582862
US,AZ,-1.068673,-1.262494
China,XI,0.206313,-0.560809
China,WU,-0.326789,0.840547


In [64]:
df.loc['India']

Unnamed: 0,Total,Recovered
HR,-1.253166,-1.194912
TN,-0.680913,0.772337


In [65]:
df.loc['US'].loc['AZ']

Total       -1.068673
Recovered   -1.262494
Name: AZ, dtype: float64

In [66]:
df.index.names

FrozenList([None, None])

In [67]:
df.index.names = ['Countries','States']

In [68]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Total,Recovered
Countries,States,Unnamed: 2_level_1,Unnamed: 3_level_1
India,HR,-1.253166,-1.194912
India,TN,-0.680913,0.772337
US,CA,1.200627,0.582862
US,AZ,-1.068673,-1.262494
China,XI,0.206313,-0.560809
China,WU,-0.326789,0.840547
