# Pandas Python

## Pandas Series

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Passing ndarray to Pandas series
d1 = np.arange(100,110)
pd.Series(d1)

0    100
1    101
2    102
3    103
4    104
5    105
6    106
7    107
8    108
9    109
dtype: int64

In [3]:
#Customized Index values
d2 = np.array(['a','b','c','d','e'])
pd.Series(d2,index=[1,2,3,4,5])

1    a
2    b
3    c
4    d
5    e
dtype: object

In [4]:
#with Dictionary
d3 = {"a":1,"b":2,"c":3,"d":4,"e":5}
pd.Series(d3)

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [5]:
#Retrive elements
data = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
s = pd.Series(data)

In [6]:
s[:3]

0    a
1    b
2    c
dtype: object

In [7]:
s[-3:]

5    f
6    g
7    h
dtype: object

In [8]:
s[[0,2,4]]

0    a
2    c
4    e
dtype: object

## Pandas Dataframe

In [9]:
#creating a Dataframe
df = pd.DataFrame(
                  {"Name":["a","b","c","d"],
                  "Age" : [25,24,19,24]},
                   index =[1,2,3,4])
df

Unnamed: 0,Name,Age
1,a,25
2,b,24
3,c,19
4,d,24


In [10]:
df = pd.DataFrame([['Ab', 25], ['Bb', 24], ['Cd', 19], ['Dd', 24]], columns=['Name','Age'])

In [11]:
df

Unnamed: 0,Name,Age
0,Ab,25
1,Bb,24
2,Cd,19
3,Dd,24


## Data wrangling with pandas

In [12]:
ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings', 'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
            'Rank': [1, 2, 2, 3, 3, 4, 1, 1, 2, 4, 1, 2],
            'Year': [2014, 2015, 2014, 2015, 2014, 2015, 2016, 2017, 2016, 2014, 2015, 2017],
            'Points':[876, 789, 863, 673, 741, 812, 756, 788, 694, 701, 804, 690]}
df = pd.DataFrame(ipl_data)

In [13]:
df

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
1,Riders,2,2015,789
2,Devils,2,2014,863
3,Devils,3,2015,673
4,Kings,3,2014,741
5,kings,4,2015,812
6,Kings,1,2016,756
7,Kings,1,2017,788
8,Riders,2,2016,694
9,Royals,4,2014,701


In [14]:
df.dtypes

Team      object
Rank       int64
Year       int64
Points     int64
dtype: object

In [15]:
df.ndim

2

In [16]:
df.shape

(12, 4)

In [17]:
len(df)

12

In [18]:
df.size

48

In [19]:
df.values

array([['Riders', 1, 2014, 876],
       ['Riders', 2, 2015, 789],
       ['Devils', 2, 2014, 863],
       ['Devils', 3, 2015, 673],
       ['Kings', 3, 2014, 741],
       ['kings', 4, 2015, 812],
       ['Kings', 1, 2016, 756],
       ['Kings', 1, 2017, 788],
       ['Riders', 2, 2016, 694],
       ['Royals', 4, 2014, 701],
       ['Royals', 1, 2015, 804],
       ['Riders', 2, 2017, 690]], dtype=object)

In [20]:
df.head()

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
1,Riders,2,2015,789
2,Devils,2,2014,863
3,Devils,3,2015,673
4,Kings,3,2014,741


In [21]:
df.tail(2)

Unnamed: 0,Team,Rank,Year,Points
10,Royals,1,2015,804
11,Riders,2,2017,690


In [22]:
#summerize data
grades = [48, 99, 75, 80, 42, 80, 72, 68, 36, 78]

df = pd.DataFrame({'ID': ["x%d" % r for r in range(10)],
                   'Gender': ['F', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M'],
                   'ExamYear': ['2007', '2007', '2007', '2008', '2008', '2008', '2008', '2009', '2009', '2009'],
                   'Class': ['algebra', 'stats', 'bio', 'algebra', 'algebra', 'stats', 'stats', 'algebra', 'bio', 'bio'],
                   'Participated': ['yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes'],
                   'Passed': ['yes' if x > 50 else 'no' for x in grades],
                   'Employed': [True, True, True, False, False, False, False, True, True, False],
                   'Grade': grades})

In [23]:
df

Unnamed: 0,ID,Gender,ExamYear,Class,Participated,Passed,Employed,Grade
0,x0,F,2007,algebra,yes,no,True,48
1,x1,M,2007,stats,yes,yes,True,99
2,x2,F,2007,bio,yes,yes,True,75
3,x3,M,2008,algebra,yes,yes,False,80
4,x4,F,2008,algebra,no,no,False,42
5,x5,M,2008,stats,yes,yes,False,80
6,x6,F,2008,stats,yes,yes,False,72
7,x7,M,2009,algebra,yes,yes,True,68
8,x8,M,2009,bio,yes,no,True,36
9,x9,M,2009,bio,yes,yes,False,78


In [24]:
df['Grade'].value_counts()

80    2
78    1
75    1
42    1
36    1
72    1
68    1
99    1
48    1
Name: Grade, dtype: int64

In [25]:
df['ExamYear'].nunique()

3

In [26]:
df.describe()

Unnamed: 0,Grade
count,10.0
mean,67.8
std,19.758542
min,36.0
25%,53.0
50%,73.5
75%,79.5
max,99.0


In [27]:
#selectin the Data
df['Class']

0    algebra
1      stats
2        bio
3    algebra
4    algebra
5      stats
6      stats
7    algebra
8        bio
9        bio
Name: Class, dtype: object

In [28]:
df[['ID','Class','Grade']]

Unnamed: 0,ID,Class,Grade
0,x0,algebra,48
1,x1,stats,99
2,x2,bio,75
3,x3,algebra,80
4,x4,algebra,42
5,x5,stats,80
6,x6,stats,72
7,x7,algebra,68
8,x8,bio,36
9,x9,bio,78


In [29]:
df.Grade

0    48
1    99
2    75
3    80
4    42
5    80
6    72
7    68
8    36
9    78
Name: Grade, dtype: int64

In [30]:
#subset observation
df[df.Gender == 'M']

Unnamed: 0,ID,Gender,ExamYear,Class,Participated,Passed,Employed,Grade
1,x1,M,2007,stats,yes,yes,True,99
3,x3,M,2008,algebra,yes,yes,False,80
5,x5,M,2008,stats,yes,yes,False,80
7,x7,M,2009,algebra,yes,yes,True,68
8,x8,M,2009,bio,yes,no,True,36
9,x9,M,2009,bio,yes,yes,False,78


In [31]:
df[(df.Grade> 20) & (df.ExamYear == '2008') & (df.Participated == 'yes')]

Unnamed: 0,ID,Gender,ExamYear,Class,Participated,Passed,Employed,Grade
3,x3,M,2008,algebra,yes,yes,False,80
5,x5,M,2008,stats,yes,yes,False,80
6,x6,F,2008,stats,yes,yes,False,72


In [32]:
#index based slicing

In [33]:
grades = [48, 99, 75, 80, 42, 80, 72, 68]

df = pd.DataFrame({'ID': ["x%d" % r for r in range(8)],
                   'Gender': ['F', 'M', 'F', 'M', 'F', 'M', 'F', 'M'],
                   'ExamYear': ['2007', '2007', '2007', '2008', '2008', '2008', '2008', '2009'],
                   'Class': ['algebra', 'stats', 'bio', 'algebra', 'algebra', 'stats', 'stats', 'algebra'],
                   'Participated': ['yes', 'yes', 'yes', 'yes', 'no', 'yes', 'yes', 'yes'],
                   'Passed': ['yes' if x > 50 else 'no' for x in grades],
                   'Employed': [True, True, True, False, False, False, False, True],
                   'Grade': grades},
                 index = ["x%d" % r for r in range(8)])

In [34]:
df

Unnamed: 0,ID,Gender,ExamYear,Class,Participated,Passed,Employed,Grade
x0,x0,F,2007,algebra,yes,no,True,48
x1,x1,M,2007,stats,yes,yes,True,99
x2,x2,F,2007,bio,yes,yes,True,75
x3,x3,M,2008,algebra,yes,yes,False,80
x4,x4,F,2008,algebra,no,no,False,42
x5,x5,M,2008,stats,yes,yes,False,80
x6,x6,F,2008,stats,yes,yes,False,72
x7,x7,M,2009,algebra,yes,yes,True,68


In [35]:
df.loc[:,'Grade']

x0    48
x1    99
x2    75
x3    80
x4    42
x5    80
x6    72
x7    68
Name: Grade, dtype: int64

In [36]:
df.loc[:,['ID', 'Grade']]

Unnamed: 0,ID,Grade
x0,x0,48
x1,x1,99
x2,x2,75
x3,x3,80
x4,x4,42
x5,x5,80
x6,x6,72
x7,x7,68


In [37]:
df.loc['x2']

ID                x2
Gender             F
ExamYear        2007
Class            bio
Participated     yes
Passed           yes
Employed        True
Grade             75
Name: x2, dtype: object

In [38]:
df.loc[['x1', 'x3', 'x5'],['ID', 'Grade','ExamYear']]

Unnamed: 0,ID,Grade,ExamYear
x1,x1,99,2007
x3,x3,80,2008
x5,x5,80,2008


In [39]:
#integer based iloc
df.iloc[:4]

Unnamed: 0,ID,Gender,ExamYear,Class,Participated,Passed,Employed,Grade
x0,x0,F,2007,algebra,yes,no,True,48
x1,x1,M,2007,stats,yes,yes,True,99
x2,x2,F,2007,bio,yes,yes,True,75
x3,x3,M,2008,algebra,yes,yes,False,80


In [40]:
df

Unnamed: 0,ID,Gender,ExamYear,Class,Participated,Passed,Employed,Grade
x0,x0,F,2007,algebra,yes,no,True,48
x1,x1,M,2007,stats,yes,yes,True,99
x2,x2,F,2007,bio,yes,yes,True,75
x3,x3,M,2008,algebra,yes,yes,False,80
x4,x4,F,2008,algebra,no,no,False,42
x5,x5,M,2008,stats,yes,yes,False,80
x6,x6,F,2008,stats,yes,yes,False,72
x7,x7,M,2009,algebra,yes,yes,True,68


In [41]:
df.iloc[1:5, 2:7]

Unnamed: 0,ExamYear,Class,Participated,Passed,Employed
x1,2007,stats,yes,yes,True
x2,2007,bio,yes,yes,True
x3,2008,algebra,yes,yes,False
x4,2008,algebra,no,no,False


### Handling missing data

In [42]:
df = pd.DataFrame(
    {
        "Name": ['Ab', 'Bb', 'Cd', 'Dd', 'Ed', 'Fc'],
        "Age" : [25, 24, 19, 24, np.nan, 28],
        "Score" : [78, 84, 89, 74, 69, np.nan]},
    index = [1, 2, 3, 4, 5, 6])    

In [43]:
df

Unnamed: 0,Name,Age,Score
1,Ab,25.0,78.0
2,Bb,24.0,84.0
3,Cd,19.0,89.0
4,Dd,24.0,74.0
5,Ed,,69.0
6,Fc,28.0,


In [44]:
df.dropna() #drop row cloumn with row data

Unnamed: 0,Name,Age,Score
1,Ab,25.0,78.0
2,Bb,24.0,84.0
3,Cd,19.0,89.0
4,Dd,24.0,74.0


In [45]:
df['Age'].fillna(value = 15)

1    25.0
2    24.0
3    19.0
4    24.0
5    15.0
6    28.0
Name: Age, dtype: float64

In [46]:
df

Unnamed: 0,Name,Age,Score
1,Ab,25.0,78.0
2,Bb,24.0,84.0
3,Cd,19.0,89.0
4,Dd,24.0,74.0
5,Ed,,69.0
6,Fc,28.0,


In [47]:
df.Age.mean()

24.0

In [48]:
df2 = df['Age'].fillna(value = df.Age.mean())

In [49]:
df2

1    25.0
2    24.0
3    19.0
4    24.0
5    24.0
6    28.0
Name: Age, dtype: float64

In [50]:
df['Age'] = df['Age'].fillna(value = df.Age.mean())

In [51]:
df

Unnamed: 0,Name,Age,Score
1,Ab,25.0,78.0
2,Bb,24.0,84.0
3,Cd,19.0,89.0
4,Dd,24.0,74.0
5,Ed,24.0,69.0
6,Fc,28.0,


In [52]:
df['Score'] = df['Age'].fillna(value=df.Age.mean())

In [53]:
df

Unnamed: 0,Name,Age,Score
1,Ab,25.0,25.0
2,Bb,24.0,24.0
3,Cd,19.0,19.0
4,Dd,24.0,24.0
5,Ed,24.0,24.0
6,Fc,28.0,28.0


In [54]:
pd.isnull(df)

Unnamed: 0,Name,Age,Score
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,False


In [55]:
#wide long form
data = {'Name': ['John', 'Smith', 'Liz'], 
        'Weight': [150, 170, 110], 
        'BP': [120, 130, 100]}

w_df = pd.DataFrame(data)

In [56]:
w_df

Unnamed: 0,Name,Weight,BP
0,John,150,120
1,Smith,170,130
2,Liz,110,100


In [57]:
w_df.melt(id_vars='Name',var_name='key',value_name='value')

Unnamed: 0,Name,key,value
0,John,Weight,150
1,Smith,Weight,170
2,Liz,Weight,110
3,John,BP,120
4,Smith,BP,130
5,Liz,BP,100


In [58]:
data = {'patient': [1, 1, 1, 2, 2],
        'obs': [1, 2, 3, 1, 2], 
        'treatment': [0, 1, 0, 1, 0],
        'score': [6252, 24243, 2345, 2342, 23525]}

Long_df = pd.DataFrame(data, columns = ['patient', 'obs', 'treatment', 'score'])

In [59]:
Long_df

Unnamed: 0,patient,obs,treatment,score
0,1,1,0,6252
1,1,2,1,24243
2,1,3,0,2345
3,2,1,1,2342
4,2,2,0,23525


In [60]:
Long_df.pivot(index='patient', columns='obs', values='score')

obs,1,2,3
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,6252.0,24243.0,2345.0
2,2342.0,23525.0,


In [61]:
#making a new column

In [62]:
data = {'Name': ['John', 'Smith', 'Liz', 'Andy', 'Dri'], 
        'Weight': [150, 170, 110, 56, 75], 
        'BP': [120, 130, 100, 110, 125]}

df = pd.DataFrame(data)

In [63]:
df

Unnamed: 0,Name,Weight,BP
0,John,150,120
1,Smith,170,130
2,Liz,110,100
3,Andy,56,110
4,Dri,75,125


In [64]:
df = df.assign(BPw = lambda df: df.Weight / df.BP)

In [65]:
df

Unnamed: 0,Name,Weight,BP,BPw
0,John,150,120,1.25
1,Smith,170,130,1.307692
2,Liz,110,100,1.1
3,Andy,56,110,0.509091
4,Dri,75,125,0.6


In [66]:
df['BP2'] = df['Weight'] / df['BP']

In [67]:
df

Unnamed: 0,Name,Weight,BP,BPw,BP2
0,John,150,120,1.25,1.25
1,Smith,170,130,1.307692,1.307692
2,Liz,110,100,1.1,1.1
3,Andy,56,110,0.509091,0.509091
4,Dri,75,125,0.6,0.6


In [68]:
df['BP_hl'] = np.where(df['BP'] > 100 , 'high' , 'low')

In [69]:
df

Unnamed: 0,Name,Weight,BP,BPw,BP2,BP_hl
0,John,150,120,1.25,1.25,high
1,Smith,170,130,1.307692,1.307692,high
2,Liz,110,100,1.1,1.1,low
3,Andy,56,110,0.509091,0.509091,high
4,Dri,75,125,0.6,0.6,high


In [70]:
df.assign(st_weight = lambda df:df.Weight /df.Weight.sum())

Unnamed: 0,Name,Weight,BP,BPw,BP2,BP_hl,st_weight
0,John,150,120,1.25,1.25,high,0.26738
1,Smith,170,130,1.307692,1.307692,high,0.30303
2,Liz,110,100,1.1,1.1,low,0.196078
3,Andy,56,110,0.509091,0.509091,high,0.099822
4,Dri,75,125,0.6,0.6,high,0.13369


In [71]:
ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings', 'Kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
            'Year': [2014, 2015, 2014, 2015, 2014, 2015, 2016, 2017, 2016, 2014, 2015, 2017],
            'Points':[876, 789, 863, 673, 741, 812, 756, 788, 694, 701, 804, 690]}
df = pd.DataFrame(ipl_data)

In [72]:
df

Unnamed: 0,Team,Year,Points
0,Riders,2014,876
1,Riders,2015,789
2,Devils,2014,863
3,Devils,2015,673
4,Kings,2014,741
5,Kings,2015,812
6,Kings,2016,756
7,Kings,2017,788
8,Riders,2016,694
9,Royals,2014,701


In [73]:
df.groupby(['Team']).sum()

Unnamed: 0_level_0,Year,Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Devils,4029,1536
Kings,8062,3097
Riders,8062,3049
Royals,4029,1505


In [74]:
df['ratio'] = df.groupby(['Team'], group_keys=False).apply(lambda g: g.Points/(g.Points).sum())

In [75]:
df

Unnamed: 0,Team,Year,Points,ratio
0,Riders,2014,876,0.287307
1,Riders,2015,789,0.258773
2,Devils,2014,863,0.561849
3,Devils,2015,673,0.438151
4,Kings,2014,741,0.239264
5,Kings,2015,812,0.262189
6,Kings,2016,756,0.244107
7,Kings,2017,788,0.25444
8,Riders,2016,694,0.227616
9,Royals,2014,701,0.465781


In [76]:
#concatenation
df_one = pd.DataFrame(
    {'Name': ['A1', 'A2', 'A3', 'A4', 'A5'],
     'subject_id':['S01','S02','S03','S04','S05'],
     'Marks_scored':[78, 50, 77, 69, 78]},
    index=[1, 2, 3, 4, 5])

In [77]:
df_two = pd.DataFrame(
    {'Name': ['B1', 'B2', 'B3', 'B4', 'B5'],
     'subject_id':['S01','S02','S03','S04','S05'],
     'Marks_scored':[89, 85, 78, 87, 88]},
    index=[1, 2, 3, 4, 5])

In [78]:
pd.concat([df_one,df_two])

Unnamed: 0,Name,subject_id,Marks_scored
1,A1,S01,78
2,A2,S02,50
3,A3,S03,77
4,A4,S04,69
5,A5,S05,78
1,B1,S01,89
2,B2,S02,85
3,B3,S03,78
4,B4,S04,87
5,B5,S05,88


In [79]:
pd.concat([df_one, df_two], keys=['x','y'])

Unnamed: 0,Unnamed: 1,Name,subject_id,Marks_scored
x,1,A1,S01,78
x,2,A2,S02,50
x,3,A3,S03,77
x,4,A4,S04,69
x,5,A5,S05,78
y,1,B1,S01,89
y,2,B2,S02,85
y,3,B3,S03,78
y,4,B4,S04,87
y,5,B5,S05,88


In [80]:
pd.concat([df_one, df_two], keys=['x','y'], ignore_index=True)

Unnamed: 0,Name,subject_id,Marks_scored
0,A1,S01,78
1,A2,S02,50
2,A3,S03,77
3,A4,S04,69
4,A5,S05,78
5,B1,S01,89
6,B2,S02,85
7,B3,S03,78
8,B4,S04,87
9,B5,S05,88


In [81]:
#append columns of Dataframe
pd.concat([df_one, df_two], axis = 1)

Unnamed: 0,Name,subject_id,Marks_scored,Name.1,subject_id.1,Marks_scored.1
1,A1,S01,78,B1,S01,89
2,A2,S02,50,B2,S02,85
3,A3,S03,77,B3,S03,78
4,A4,S04,69,B4,S04,87
5,A5,S05,78,B5,S05,88


In [82]:
#Merging Dataframe

In [83]:
df_left = pd.DataFrame(
    {'Student': ['St01', 'St02', 'St03', 'St01'],
     'Subject': ['Mat', 'Phy', 'Phy', 'Phy'],
     'Assign1': [54, 63, 56, 78],
     'Assign2': [66, 65, 75, 85]})

df_right = pd.DataFrame(
    {'Student': ['St01', 'St02', 'St01', 'St02'],
     'Subject': ['Mat', 'Mat', 'Phy', 'Phy'],
     'Assign3': [72, 56, 85, 96],
     'Assign4': [78, 89, 56, 88]})

In [84]:
df_left

Unnamed: 0,Student,Subject,Assign1,Assign2
0,St01,Mat,54,66
1,St02,Phy,63,65
2,St03,Phy,56,75
3,St01,Phy,78,85


In [85]:
df_right

Unnamed: 0,Student,Subject,Assign3,Assign4
0,St01,Mat,72,78
1,St02,Mat,56,89
2,St01,Phy,85,56
3,St02,Phy,96,88


In [86]:
pd.merge(df_left,df_right , on=['Student','Subject'])

Unnamed: 0,Student,Subject,Assign1,Assign2,Assign3,Assign4
0,St01,Mat,54,66,72,78
1,St02,Phy,63,65,96,88
2,St01,Phy,78,85,85,56


In [87]:
df_right

Unnamed: 0,Student,Subject,Assign3,Assign4
0,St01,Mat,72,78
1,St02,Mat,56,89
2,St01,Phy,85,56
3,St02,Phy,96,88


In [88]:
#left join
pd.merge(df_left,df_right,on=['Student','Subject'] , how = 'left')

Unnamed: 0,Student,Subject,Assign1,Assign2,Assign3,Assign4
0,St01,Mat,54,66,72.0,78.0
1,St02,Phy,63,65,96.0,88.0
2,St03,Phy,56,75,,
3,St01,Phy,78,85,85.0,56.0


In [89]:
#right join
pd.merge(df_left, df_right, on=['Student', 'Subject'], how = 'right')

Unnamed: 0,Student,Subject,Assign1,Assign2,Assign3,Assign4
0,St01,Mat,54.0,66.0,72,78
1,St02,Phy,63.0,65.0,96,88
2,St01,Phy,78.0,85.0,85,56
3,St02,Mat,,,56,89


In [90]:
#inner join
pd.merge(df_left, df_right, on=['Student', 'Subject'], how = 'inner')

Unnamed: 0,Student,Subject,Assign1,Assign2,Assign3,Assign4
0,St01,Mat,54,66,72,78
1,St02,Phy,63,65,96,88
2,St01,Phy,78,85,85,56


In [91]:
#outer join
pd.merge(df_left, df_right, on=['Student', 'Subject'], how = 'outer')

Unnamed: 0,Student,Subject,Assign1,Assign2,Assign3,Assign4
0,St01,Mat,54.0,66.0,72.0,78.0
1,St02,Phy,63.0,65.0,96.0,88.0
2,St03,Phy,56.0,75.0,,
3,St01,Phy,78.0,85.0,85.0,56.0
4,St02,Mat,,,56.0,89.0
