## Pandas

In [1]:
import pandas as pd

#### Series

In [2]:
series = pd.Series([12,11,14,15,10,16])
print(type(series))
series

<class 'pandas.core.series.Series'>


0    12
1    11
2    14
3    15
4    10
5    16
dtype: int64

In [3]:
print(series[3:])
print(series[::-1])
print(series[::2])
print(series[2:4])

3    15
4    10
5    16
dtype: int64
5    16
4    10
3    15
2    14
1    11
0    12
dtype: int64
0    12
2    14
4    10
dtype: int64
2    14
3    15
dtype: int64


In [4]:
#Labels(names) can be assigned to indexes as well

series = pd.Series([12,11,14], index=['a','b','c'])
print(series)
print(series['b'])

a    12
b    11
c    14
dtype: int64
11


#### DataFrames

In [5]:
df = pd.DataFrame([[5,4,6],[3,1,2],[8,4,2],[7,9,2],[3,9,4]], index=['x1','x2','x3','x4','x5'], columns=['col1','col2','col3'], dtype=int)
df

Unnamed: 0,col1,col2,col3
x1,5,4,6
x2,3,1,2
x3,8,4,2
x4,7,9,2
x5,3,9,4


In [6]:
print(len(df)) #no.of.rows
print(df.shape)
print(df.index)
print(df.columns)
print(df.values)

5
(5, 3)
Index(['x1', 'x2', 'x3', 'x4', 'x5'], dtype='object')
Index(['col1', 'col2', 'col3'], dtype='object')
[[5 4 6]
 [3 1 2]
 [8 4 2]
 [7 9 2]
 [3 9 4]]


In [7]:
print(type(df.values))

<class 'numpy.ndarray'>


In [8]:
import numpy as np
ar = np.array([[3,4,5],[1,2,6],[8,9,7]], dtype=int)
df = pd.DataFrame(ar, columns=['col1', 'col2', 'col3'])
df

Unnamed: 0,col1,col2,col3
0,3,4,5
1,1,2,6
2,8,9,7


In [9]:
myDict = {"Id": [101, 102, 103], "Name": ["Paul", "John", "Tina"]}
df = pd.DataFrame(data = myDict)
df

Unnamed: 0,Id,Name
0,101,Paul
1,102,John
2,103,Tina


#### Custom DataType

In [10]:
customer_dtype = [("Name", 'S10'), ("Age", int), ("isActive", bool)]
customer_values = [("Paul", 36, True), ("Tina", 32, True), ("John", 45, False), ("Ken", 23, True)]

customers_array = np.array(customer_values, dtype=customer_dtype)

customers_df = pd.DataFrame(data = customers_array)
customers_df

Unnamed: 0,Name,Age,isActive
0,b'Paul',36,True
1,b'Tina',32,True
2,b'John',45,False
3,b'Ken',23,True


### Reshaping Data - Change layout, sorting, reindexing, renaming

In [11]:
melted_df = pd.melt(df)
melted_df

Unnamed: 0,variable,value
0,Id,101
1,Id,102
2,Id,103
3,Name,Paul
4,Name,John
5,Name,Tina


In [12]:
df = pd.DataFrame([('A',1,True),('B',2,True),('C',2,False),('D',3,False),('E',1,True)], columns=['id','category','isActive'])
df

Unnamed: 0,id,category,isActive
0,A,1,True
1,B,2,True
2,C,2,False
3,D,3,False
4,E,1,True


In [13]:
df.pivot(columns='isActive', values='category', index='id')

isActive,False,True
id,Unnamed: 1_level_1,Unnamed: 2_level_1
A,,1.0
B,,2.0
C,2.0,
D,3.0,
E,,1.0


In [14]:
df1 = pd.DataFrame([(1,2),(2,3),(4,5),(5,6)], columns=['col1','col2'], index=['r1','r2','r3','r4'], dtype=np.float32)
df2 = pd.DataFrame([(6,7,9),(7,8,10)], columns=['col1','col2','col3'], index=['r5','r6'], dtype=np.float32)

In [15]:
df1

Unnamed: 0,col1,col2
r1,1.0,2.0
r2,2.0,3.0
r3,4.0,5.0
r4,5.0,6.0


In [16]:
df2

Unnamed: 0,col1,col2,col3
r5,6.0,7.0,9.0
r6,7.0,8.0,10.0


In [17]:
pd.concat([df1, df2])

Unnamed: 0,col1,col2,col3
r1,1.0,2.0,
r2,2.0,3.0,
r3,4.0,5.0,
r4,5.0,6.0,
r5,6.0,7.0,9.0
r6,7.0,8.0,10.0


In [18]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,col1,col2,col1.1,col2.1,col3
r1,1.0,2.0,,,
r2,2.0,3.0,,,
r3,4.0,5.0,,,
r4,5.0,6.0,,,
r5,,,6.0,7.0,9.0
r6,,,7.0,8.0,10.0


In [19]:
df = pd.DataFrame([[5,4,6],[3,1,4],[8,2,5]], index=['r1','r2','r3'], columns=['col1','col2','col3'], dtype=int)
df

Unnamed: 0,col1,col2,col3
r1,5,4,6
r2,3,1,4
r3,8,2,5


In [20]:
df.sort_values('col2')

Unnamed: 0,col1,col2,col3
r2,3,1,4
r3,8,2,5
r1,5,4,6


In [21]:
df.sort_values('col2', ascending=False)

Unnamed: 0,col1,col2,col3
r1,5,4,6
r3,8,2,5
r2,3,1,4


In [22]:
df.rename(columns = {'col1':'c1', 'col2':'c2'})

Unnamed: 0,c1,c2,col3
r1,5,4,6
r2,3,1,4
r3,8,2,5


In [23]:
df.drop(columns=['col3'])

Unnamed: 0,col1,col2
r1,5,4
r2,3,1
r3,8,2


In [24]:
df.sort_index()

Unnamed: 0,col1,col2,col3
r1,5,4,6
r2,3,1,4
r3,8,2,5


In [25]:
df.reset_index()

Unnamed: 0,index,col1,col2,col3
0,r1,5,4,6
1,r2,3,1,4
2,r3,8,2,5


### Subsetting and Slicing

In [216]:
import numpy as np
data = dict({'animal': ['cat','cat','snake','dog','dog','cat','snake','cat','dog','dog'],
          'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
          'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
          'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no' ]})
labels = ['a','b','c','d','e','f','g','h','i','j']

In [217]:
df = pd.DataFrame(data=data, index=labels)
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [47]:
df.head()

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no


In [48]:
df.head(2)

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes


In [49]:
df.tail()

Unnamed: 0,animal,age,visits,priority
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [50]:
df.tail(3)

Unnamed: 0,animal,age,visits,priority
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [51]:
df.dtypes

animal       object
age         float64
visits        int64
priority     object
dtype: object

In [52]:
df[df.animal == "snake"]

Unnamed: 0,animal,age,visits,priority
c,snake,0.5,2,no
g,snake,4.5,1,no


In [53]:
df[df.age >= 5]

Unnamed: 0,animal,age,visits,priority
e,dog,5.0,2,no
i,dog,7.0,2,no


In [57]:
#df.drop_duplicates() #Consider all the columns and remove duplicate records
df.drop_duplicates(subset=['animal','visits'])

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
g,snake,4.5,1,no
j,dog,3.0,1,no


In [67]:
df.sample(n=4) #randomly select 4 records from dataframe

Unnamed: 0,animal,age,visits,priority
g,snake,4.5,1,no
a,cat,2.5,1,yes
h,cat,,1,yes
d,dog,,3,yes


In [69]:
df.sample(frac=0.25) #randomly select 25% of the records from dataframe

Unnamed: 0,animal,age,visits,priority
j,dog,3.0,1,no
b,cat,3.0,3,yes


In [72]:
df.nlargest(n=3, columns='age')

Unnamed: 0,animal,age,visits,priority
i,dog,7.0,2,no
e,dog,5.0,2,no
g,snake,4.5,1,no


In [73]:
df.nsmallest(n=2, columns='age')

Unnamed: 0,animal,age,visits,priority
c,snake,0.5,2,no
f,cat,2.0,3,no


In [80]:
df['animal']
#df.animal

a      cat
b      cat
c    snake
d      dog
e      dog
f      cat
g    snake
h      cat
i      dog
j      dog
Name: animal, dtype: object

In [82]:
df[['animal','visits']]

Unnamed: 0,animal,visits
a,cat,1
b,cat,3
c,snake,2
d,dog,3
e,dog,2
f,cat,3
g,snake,1
h,cat,1
i,dog,2
j,dog,1


#### Check null records for specified column

In [128]:
df[['age']].isnull()

Unnamed: 0,age
a,False
b,False
c,False
d,True
e,False
f,False
g,False
h,True
i,False
j,False


#### Count total number of null records

In [129]:
df[['age']].isnull().sum() #Counts total number of True's

age    2
dtype: int64

#### Fetch records which has null records for specific column

In [130]:
df[df.age.isnull()]

Unnamed: 0,animal,age,visits,priority
d,dog,,3,yes
h,cat,,1,yes


In [119]:
df[['age']].notnull().sum() #Counts total number of non null records in age column

age    8
dtype: int64

In [125]:
df[['visits']].isin([2,3]).sum()

visits    6
dtype: int64

In [127]:
df[df.visits.isin([2,3])]

Unnamed: 0,animal,age,visits,priority
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
i,dog,7.0,2,no


#### Select columns based on regular expression

In [87]:
df.filter(regex='^a').head(3)

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5


In [88]:
df.filter(regex='y$').tail(2)

Unnamed: 0,priority
i,no
j,no


#### Query the data from dataframe

In [92]:
df.query('age >= 4')

Unnamed: 0,animal,age,visits,priority
e,dog,5.0,2,no
g,snake,4.5,1,no
i,dog,7.0,2,no


In [99]:
df.query('age >= 4 and visits > 1')

Unnamed: 0,animal,age,visits,priority
e,dog,5.0,2,no
i,dog,7.0,2,no


In [103]:
df.query('animal.str.startswith("ca")', engine='python')

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
f,cat,2.0,3,no
h,cat,,1,yes


In [109]:
df.query('priority.str.isalpha()', engine='python').head(2)

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes


#### Access single value from dataframe - at, iat

In [138]:
df.at['a', 'animal']

'cat'

In [140]:
df.iat[0, 0]

'cat'

#### Access subsets (muliple values) from dataframe - loc, iloc

#### loc

In [153]:
df.loc[:,:]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [155]:
df.loc[:,['animal']]

Unnamed: 0,animal
a,cat
b,cat
c,snake
d,dog
e,dog
f,cat
g,snake
h,cat
i,dog
j,dog


In [151]:
df.loc[:,['animal','age']]

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5
d,dog,
e,dog,5.0
f,cat,2.0
g,snake,4.5
h,cat,
i,dog,7.0
j,dog,3.0


In [156]:
df.loc[:,'animal':'visits']

Unnamed: 0,animal,age,visits
a,cat,2.5,1
b,cat,3.0,3
c,snake,0.5,2
d,dog,,3
e,dog,5.0,2
f,cat,2.0,3
g,snake,4.5,1
h,cat,,1
i,dog,7.0,2
j,dog,3.0,1


In [147]:
df.loc[['a','d','g'],['animal','age','priority']]

Unnamed: 0,animal,age,priority
a,cat,2.5,yes
d,dog,,yes
g,snake,4.5,no


In [152]:
df.loc['c':'f',['animal','age','priority']]

Unnamed: 0,animal,age,priority
c,snake,0.5,no
d,dog,,yes
e,dog,5.0,no
f,cat,2.0,no


#### iloc

In [158]:
df.iloc[:,:]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [159]:
df.iloc[:,[0,1]]

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5
d,dog,
e,dog,5.0
f,cat,2.0
g,snake,4.5
h,cat,
i,dog,7.0
j,dog,3.0


In [161]:
df.iloc[:,0:3]

Unnamed: 0,animal,age,visits
a,cat,2.5,1
b,cat,3.0,3
c,snake,0.5,2
d,dog,,3
e,dog,5.0,2
f,cat,2.0,3
g,snake,4.5,1
h,cat,,1
i,dog,7.0,2
j,dog,3.0,1


In [162]:
df.iloc[[2,4,5],0:3]

Unnamed: 0,animal,age,visits
c,snake,0.5,2
e,dog,5.0,2
f,cat,2.0,3


In [163]:
df.iloc[3:6,0:3]

Unnamed: 0,animal,age,visits
d,dog,,3
e,dog,5.0,2
f,cat,2.0,3


### Summarize Data

In [170]:
len(df)

10

In [171]:
df.shape

(10, 4)

In [176]:
df.describe() #works on only numerical columns

Unnamed: 0,age,visits
count,8.0,10.0
mean,3.4375,1.9
std,2.007797,0.875595
min,0.5,1.0
25%,2.375,1.0
50%,3.0,2.0
75%,4.625,2.75
max,7.0,3.0


In [173]:
df['animal'].nunique()

3

In [174]:
df['animal'].value_counts()

animal
cat      4
dog      4
snake    2
Name: count, dtype: int64

In [190]:
print("Sum:", df['age'].sum())
print("Count:", df['age'].count())  #Count Non Null Values
print("Min:", df['age'].min())
print("Max:", df['age'].max())

print("Mean/Average:", df['age'].mean()) #Average
print("Median:", df['age'].median())
print("Variance:", df['age'].var())
print("Standard Deviation:", df['age'].std())

print("Quantiles:")
print(df['age'].quantile([0.25, 0.75, 0.9]))

Sum: 27.5
Count: 8
Min: 0.5
Max: 7.0
Mean/Average: 3.4375
Median: 3.0
Variance: 4.03125
Standard Deviation: 2.0077973005261263
Quantiles:
0.25    2.375
0.75    4.625
0.90    5.600
Name: age, dtype: float64


#### Apply - Map the values with a custom function

In [191]:
df['visits'].apply(lambda x : x*10)

a    10
b    30
c    20
d    30
e    20
f    30
g    10
h    10
i    20
j    10
Name: visits, dtype: int64

### Handling Missing Data

In [200]:
df.isnull()

Unnamed: 0,animal,age,visits,priority
a,False,False,False,False
b,False,False,False,False
c,False,False,False,False
d,False,True,False,False
e,False,False,False,False
f,False,False,False,False
g,False,False,False,False
h,False,True,False,False
i,False,False,False,False
j,False,False,False,False


In [201]:
df.notnull()

Unnamed: 0,animal,age,visits,priority
a,True,True,True,True
b,True,True,True,True
c,True,True,True,True
d,True,False,True,True
e,True,True,True,True
f,True,True,True,True
g,True,True,True,True
h,True,False,True,True
i,True,True,True,True
j,True,True,True,True


In [202]:
df.dropna()

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
i,dog,7.0,2,no
j,dog,3.0,1,no


In [204]:
df.fillna(0.0)

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,0.0,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,0.0,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


### Managing Dataframe Columns

#### Adding new column

In [218]:
df['x'] = df['age'] * df['visits']
df

Unnamed: 0,animal,age,visits,priority,x
a,cat,2.5,1,yes,2.5
b,cat,3.0,3,yes,9.0
c,snake,0.5,2,no,1.0
d,dog,,3,yes,
e,dog,5.0,2,no,10.0
f,cat,2.0,3,no,6.0
g,snake,4.5,1,no,4.5
h,cat,,1,yes,
i,dog,7.0,2,no,14.0
j,dog,3.0,1,no,3.0


#### Removing the columns

In [223]:
df.drop(columns=['priority','x'], inplace=True)

In [225]:
df

Unnamed: 0,animal,age,visits
a,cat,2.5,1
b,cat,3.0,3
c,snake,0.5,2
d,dog,,3
e,dog,5.0,2
f,cat,2.0,3
g,snake,4.5,1
h,cat,,1
i,dog,7.0,2
j,dog,3.0,1


In [226]:
df.fillna(0, inplace=True)

In [227]:
df

Unnamed: 0,animal,age,visits
a,cat,2.5,1
b,cat,3.0,3
c,snake,0.5,2
d,dog,0.0,3
e,dog,5.0,2
f,cat,2.0,3
g,snake,4.5,1
h,cat,0.0,1
i,dog,7.0,2
j,dog,3.0,1


### Group Data

In [234]:
grouped_df = df.groupby(by='animal')

In [235]:
grouped_df.size()

animal
cat      4
dog      4
snake    2
dtype: int64

In [236]:
grouped_df.sum()

Unnamed: 0_level_0,age,visits
animal,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,7.5,8
dog,15.0,8
snake,5.0,3


In [239]:
grouped_df.max()

Unnamed: 0_level_0,age,visits
animal,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,3.0,3
dog,7.0,3
snake,4.5,2


In [241]:
grouped_df.agg(['max','min'])

Unnamed: 0_level_0,age,age,visits,visits
Unnamed: 0_level_1,max,min,max,min
animal,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
cat,3.0,0.0,3,1
dog,7.0,0.0,3,1
snake,4.5,0.5,2,1


In [245]:
grouped_df.agg({'age':'sum', 'visits': ['max','min']})

Unnamed: 0_level_0,age,visits,visits
Unnamed: 0_level_1,sum,max,min
animal,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
cat,7.5,3,1
dog,15.0,3,1
snake,5.0,2,1


In [251]:
grouped_df.agg(lambda x: sum(x))

Unnamed: 0_level_0,age,visits
animal,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,7.5,8
dog,15.0,8
snake,5.0,3


### Combine Data Sets

In [252]:
df1 = pd.DataFrame([['A',1],['B',2],['C',3]], columns=['x1','x2'])
df1

Unnamed: 0,x1,x2
0,A,1
1,B,2
2,C,3


In [253]:
df2 = pd.DataFrame([['A','T'],['B','F'],['D','T']], columns=['x1','x3'])
df2

Unnamed: 0,x1,x3
0,A,T
1,B,F
2,D,T


#### Inner Join

In [254]:
pd.merge(df1, df2, how="inner", on='x1')

Unnamed: 0,x1,x2,x3
0,A,1,T
1,B,2,F


#### Left Join

In [255]:
pd.merge(df1, df2, how="left", on='x1')

Unnamed: 0,x1,x2,x3
0,A,1,T
1,B,2,F
2,C,3,


#### Right Join

In [256]:
pd.merge(df1, df2, how="right", on='x1')

Unnamed: 0,x1,x2,x3
0,A,1.0,T
1,B,2.0,F
2,D,,T


#### Full Outer Join

In [257]:
pd.merge(df1, df2, how="outer", on='x1')

Unnamed: 0,x1,x2,x3
0,A,1.0,T
1,B,2.0,F
2,C,3.0,
3,D,,T


### Sets

In [258]:
df1 = pd.DataFrame([['A',1],['B',2],['C',3]], columns=['x1','x2'])
df1

Unnamed: 0,x1,x2
0,A,1
1,B,2
2,C,3


In [259]:
df2 = pd.DataFrame([['B',2],['C',3],['D',4]], columns=['x1','x2'])
df2

Unnamed: 0,x1,x2
0,B,2
1,C,3
2,D,4


#### Set Intersection

In [260]:
pd.merge(df1, df2)

Unnamed: 0,x1,x2
0,B,2
1,C,3


#### Set Union

In [261]:
pd.merge(df1, df2, how='outer')

Unnamed: 0,x1,x2
0,A,1
1,B,2
2,C,3
3,D,4


#### Set Difference

In [263]:
pd.merge(df1, df2, how='outer', indicator=True)

Unnamed: 0,x1,x2,_merge
0,A,1,left_only
1,B,2,both
2,C,3,both
3,D,4,right_only


In [264]:
pd.merge(df1, df2, how='outer', indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])

Unnamed: 0,x1,x2
0,A,1
