# Pandas

**pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language.**

https://pandas.pydata.org/

* Introduction


* Series


* Dataframes


* Missing data handling 


* GroupBy


* Operations


* Data input / output


* Merging, joining, and concatenating

#### Series

Similar to a NumPy array, but it can hold data of different types.

A Series is indexed by labels, i.e. it has axis labels.

#### Import libraries

In [1]:
import numpy as np
import pandas as pd

#### Create series

In [2]:
# Sample inputs for the Series examples: index labels, a plain Python list,
# a NumPy array, and a label -> value mapping.
labels = list('abc')
myList = [n for n in range(1, 4)]
arr = np.arange(10, 40, 10)
d = dict(zip('abc', (11, 12, 14)))

#### Using list

In [3]:
# Build a Series from a plain list; with no index given, the values are
# labelled with default integers starting at 0.
pd.Series(myList)

0    1
1    2
2    3
dtype: int64

In [4]:
# Passing index= attaches the given labels to the values by position
# ('a' -> 1, 'b' -> 2, 'c' -> 3).
pd.Series(data=myList,index=labels)

a    1
b    2
c    3
dtype: int64

#### Using Dictionary

In [5]:
# Build a Series from a dict: the keys become the index labels and the
# values become the data.
pd.Series(d)

a    11
b    12
c    14
dtype: int64

##### Operations

In [6]:
# Two country -> value mappings whose keys only partly overlap:
# "China" appears only in d1 and "EU" only in d2.
d1 = dict(USA=12, China=65, India=56)
d2 = dict(USA=12, EU=65, India=56)

In [7]:
# Wrap each mapping in a Series so the two can be combined label-by-label.
ser1, ser2 = (pd.Series(mapping) for mapping in (d1, d2))

In [8]:
# Adding two Series aligns them on their index labels, not on position.
# Labels present in only one operand (China, EU) come out as NaN, and the
# result is a float64 Series.
ser1+ser2

China      NaN
EU         NaN
India    112.0
USA       24.0
dtype: float64

#### DataFrames

Main workhorse of pandas

A DataFrame is a bunch of Series objects that share the same index, put together.

In [9]:
# Seed a dedicated random generator so the values in df -- and every output
# derived from them -- are the same on each Restart & Run All.
SEED = 101
rng = np.random.default_rng(SEED)

# 5 x 4 frame of standard-normal draws: rows labelled A-E, columns W-Z.
# 'W X Y Z'.split() is just a compact way to write ['W', 'X', 'Y', 'Z'].
df = pd.DataFrame(data=rng.standard_normal((5, 4)),
                  index=['A', 'B', 'C', 'D', 'E'],
                  columns='W X Y Z'.split())

df

Unnamed: 0,W,X,Y,Z
A,-0.5691,-0.084155,-0.84651,-1.397297
B,1.708675,0.664588,-0.169281,0.006246
C,-0.686629,-1.482483,-0.945229,-2.318457
D,1.43232,-0.608744,-0.988636,-0.350024
E,-0.505227,1.398927,0.395654,2.398598


In [10]:
# str.split() with no arguments splits on whitespace, which gives the list
# of single-letter labels.
'A B C D E'.split()

['A', 'B', 'C', 'D', 'E']

#### Selection and Indexing

In [11]:
# Bracket indexing with a single column label returns that column as a
# Series (named 'W', indexed by the row labels).
df['W']

A   -0.569100
B    1.708675
C   -0.686629
D    1.432320
E   -0.505227
Name: W, dtype: float64

In [12]:
# Attribute-style (dot) column access -- NOT RECOMMENDED.
# It only works when the column name does not collide with an existing
# DataFrame attribute or method: with a column named 'sum', df.sum gives
# the DataFrame.sum method, not the column. Prefer df['W'].
df.W

A   -0.569100
B    1.708675
C   -0.686629
D    1.432320
E   -0.505227
Name: W, dtype: float64

In [13]:
# Passing a list of labels selects several columns at once and returns a
# DataFrame containing just those columns.
df[['W','Z']]

Unnamed: 0,W,Z
A,-0.5691,-1.397297
B,1.708675,0.006246
C,-0.686629,-2.318457
D,1.43232,-0.350024
E,-0.505227,2.398598


In [14]:
# Assigning to a label that does not exist yet creates the column; here it
# holds the element-wise sum of columns W and Z.
df['new'] = df['W']+df['Z']

In [15]:
# df now carries the derived 'new' column.
df

Unnamed: 0,W,X,Y,Z,new
A,-0.5691,-0.084155,-0.84651,-1.397297,-1.966398
B,1.708675,0.664588,-0.169281,0.006246,1.71492
C,-0.686629,-1.482483,-0.945229,-2.318457,-3.005085
D,1.43232,-0.608744,-0.988636,-0.350024,1.082296
E,-0.505227,1.398927,0.395654,2.398598,1.893371


In [16]:
# Add a second derived column (X + W) and display the frame.
# The column is named 'sum', which is also the name of the DataFrame.sum
# method -- this matters for attribute-style access (df.sum).
df['sum']=df['X']+df['W']
df

Unnamed: 0,W,X,Y,Z,new,sum
A,-0.5691,-0.084155,-0.84651,-1.397297,-1.966398,-0.653255
B,1.708675,0.664588,-0.169281,0.006246,1.71492,2.373262
C,-0.686629,-1.482483,-0.945229,-2.318457,-3.005085,-2.169111
D,1.43232,-0.608744,-0.988636,-0.350024,1.082296,0.823577
E,-0.505227,1.398927,0.395654,2.398598,1.893371,0.893701


In [17]:
# Bracket access reliably returns the 'sum' column as a Series.
df['sum']

A   -0.653255
B    2.373262
C   -2.169111
D    0.823577
E    0.893701
Name: sum, dtype: float64

In [18]:
# Attribute access does NOT return the 'sum' column here: the name resolves
# to the DataFrame.sum method, so the cell displays a bound method.
df.sum

<bound method DataFrame.sum of           W         X         Y         Z       new       sum
A -0.569100 -0.084155 -0.846510 -1.397297 -1.966398 -0.653255
B  1.708675  0.664588 -0.169281  0.006246  1.714920  2.373262
C -0.686629 -1.482483 -0.945229 -2.318457 -3.005085 -2.169111
D  1.432320 -0.608744 -0.988636 -0.350024  1.082296  0.823577
E -0.505227  1.398927  0.395654  2.398598  1.893371  0.893701>

In [19]:
# drop(..., axis=1) removes a column and returns a new DataFrame.
# The result is not assigned here, so df itself keeps the 'sum' column.
df.drop('sum',axis=1)

Unnamed: 0,W,X,Y,Z,new
A,-0.5691,-0.084155,-0.84651,-1.397297,-1.966398
B,1.708675,0.664588,-0.169281,0.006246,1.71492
C,-0.686629,-1.482483,-0.945229,-2.318457,-3.005085
D,1.43232,-0.608744,-0.988636,-0.350024,1.082296
E,-0.505227,1.398927,0.395654,2.398598,1.893371


In [20]:
# df still contains 'sum': the preceding drop() result was not assigned.
df

Unnamed: 0,W,X,Y,Z,new,sum
A,-0.5691,-0.084155,-0.84651,-1.397297,-1.966398,-0.653255
B,1.708675,0.664588,-0.169281,0.006246,1.71492,2.373262
C,-0.686629,-1.482483,-0.945229,-2.318457,-3.005085,-2.169111
D,1.43232,-0.608744,-0.988636,-0.350024,1.082296,0.823577
E,-0.505227,1.398927,0.395654,2.398598,1.893371,0.893701


In [21]:
# Persist the removal by rebinding df to the frame returned by drop().
# errors='ignore' keeps the cell safe to re-run: once 'sum' is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns='sum', errors='ignore')
df

Unnamed: 0,W,X,Y,Z,new
A,-0.5691,-0.084155,-0.84651,-1.397297,-1.966398
B,1.708675,0.664588,-0.169281,0.006246,1.71492
C,-0.686629,-1.482483,-0.945229,-2.318457,-3.005085
D,1.43232,-0.608744,-0.988636,-0.350024,1.082296
E,-0.505227,1.398927,0.395654,2.398598,1.893371


In [22]:
# inplace=True removes column 'new' from df itself, so no reassignment is
# needed; the frame displayed afterwards no longer has that column.
# NOTE(review): assigning the result of drop() is generally preferred over
# inplace=True, and re-running this cell presumably raises KeyError once
# 'new' is gone -- confirm.
df.drop('new',axis=1,inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,-0.5691,-0.084155,-0.84651,-1.397297
B,1.708675,0.664588,-0.169281,0.006246
C,-0.686629,-1.482483,-0.945229,-2.318457
D,1.43232,-0.608744,-0.988636,-0.350024
E,-0.505227,1.398927,0.395654,2.398598


In [23]:
# With no axis given, drop() removes a row by its index label.
# The result is not assigned, so df still contains row 'E' afterwards.
df.drop('E')

Unnamed: 0,W,X,Y,Z
A,-0.5691,-0.084155,-0.84651,-1.397297
B,1.708675,0.664588,-0.169281,0.006246
C,-0.686629,-1.482483,-0.945229,-2.318457
D,1.43232,-0.608744,-0.988636,-0.350024


#### Selection

In [24]:
# Starting frame for the selection examples (rows A-E, columns W-Z).
df

Unnamed: 0,W,X,Y,Z
A,-0.5691,-0.084155,-0.84651,-1.397297
B,1.708675,0.664588,-0.169281,0.006246
C,-0.686629,-1.482483,-0.945229,-2.318457
D,1.43232,-0.608744,-0.988636,-0.350024
E,-0.505227,1.398927,0.395654,2.398598


In [25]:
# .loc selects by index label; a single row comes back as a Series whose
# index is the column names.
df.loc['A']

W   -0.569100
X   -0.084155
Y   -0.846510
Z   -1.397297
Name: A, dtype: float64

In [26]:
# .iloc selects by integer position rather than by label; position 0 is the
# first row, i.e. row 'A'.
df.iloc[0]

W   -0.569100
X   -0.084155
Y   -0.846510
Z   -1.397297
Name: A, dtype: float64

###### Selecting subsets

In [27]:
# .loc[row_label, column_label] addresses a single cell; assigning to it
# overwrites that value in df (row 'B', column 'Y' becomes 0.0).
# This mutates df, so every later cell sees the changed value.
df.loc['B','Y']=0
df

Unnamed: 0,W,X,Y,Z
A,-0.5691,-0.084155,-0.84651,-1.397297
B,1.708675,0.664588,0.0,0.006246
C,-0.686629,-1.482483,-0.945229,-2.318457
D,1.43232,-0.608744,-0.988636,-0.350024
E,-0.505227,1.398927,0.395654,2.398598


In [28]:
# Lists of row labels and column labels select a sub-frame:
# rows A and B, columns W and Y.
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,-0.5691,-0.84651
B,1.708675,0.0


##### Conditional Selection

In [29]:
# Frame used for the conditional-selection examples.
df

Unnamed: 0,W,X,Y,Z
A,-0.5691,-0.084155,-0.84651,-1.397297
B,1.708675,0.664588,0.0,0.006246
C,-0.686629,-1.482483,-0.945229,-2.318457
D,1.43232,-0.608744,-0.988636,-0.350024
E,-0.505227,1.398927,0.395654,2.398598


In [30]:
# Boolean-mask filtering: keep only the rows where column W is positive.
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
B,1.708675,0.664588,0.0,0.006246
D,1.43232,-0.608744,-0.988636,-0.350024


In [31]:
# The comparison alone yields a boolean Series (one True/False per row);
# this is the mask used for filtering rows.
df['W']>0

A    False
B     True
C    False
D     True
E    False
Name: W, dtype: bool

In [32]:
# Filter the rows (W > 0) and pick column Z in one .loc call instead of
# chaining two [] lookups. The result is the same Series, and the
# .loc[row_mask, column] form is also the one that works for assignment.
df.loc[df['W'] > 0, 'Z']

B    0.006246
D   -0.350024
Name: Z, dtype: float64

##### index

In [33]:
# Frame before the index examples; its index is the labels A-E.
df

Unnamed: 0,W,X,Y,Z
A,-0.5691,-0.084155,-0.84651,-1.397297
B,1.708675,0.664588,0.0,0.006246
C,-0.686629,-1.482483,-0.945229,-2.318457
D,1.43232,-0.608744,-0.988636,-0.350024
E,-0.505227,1.398927,0.395654,2.398598


In [34]:
# reset_index() returns a new frame in which the old labels move into a
# column named 'index' and the rows get a default integer index.
# The result is not assigned, so df is left unchanged.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,-0.5691,-0.084155,-0.84651,-1.397297
1,B,1.708675,0.664588,0.0,0.006246
2,C,-0.686629,-1.482483,-0.945229,-2.318457
3,D,1.43232,-0.608744,-0.988636,-0.350024
4,E,-0.505227,1.398927,0.395654,2.398598


In [35]:
# df is unchanged: the reset_index() result above was not assigned.
df

Unnamed: 0,W,X,Y,Z
A,-0.5691,-0.084155,-0.84651,-1.397297
B,1.708675,0.664588,0.0,0.006246
C,-0.686629,-1.482483,-0.945229,-2.318457
D,1.43232,-0.608744,-0.988636,-0.350024
E,-0.505227,1.398927,0.395654,2.398598


In [36]:
# Disabled example: the in-place variant of reset_index().
# NOTE(review): presumably kept commented out so df retains its A-E labels
# -- confirm, or delete this cell.
#df.reset_index(inplace=True)

In [37]:
# df still has its original A-E label index.
df

Unnamed: 0,W,X,Y,Z
A,-0.5691,-0.084155,-0.84651,-1.397297
B,1.708675,0.664588,0.0,0.006246
C,-0.686629,-1.482483,-0.945229,-2.318457
D,1.43232,-0.608744,-0.988636,-0.350024
E,-0.505227,1.398927,0.395654,2.398598
