# Pandas

* Manipulate & analyse ***tubular*** data
    - tubular data: excel, csv, table-like
* Data stuctures: ***DataFrame*** & ***Series***
    - DataFrame: 2-dimensional
    - Series: 1-dimesional
    - Series can be extracted from DataFrame, also can be added to DataFrame

---

##### Import Packages


In [28]:
import pandas as pd
import numpy as np  #analysis always applys some functions from numpy

#!pip install pandas   #if you are in console, not in jupyter
pd.__version__  #check versions as functions insides may be different 

'1.3.4'

# 1. Series
* 1-Dimensional(1D)-list structure
* with ***index*** but no column
* a column in a DataFrame

In [29]:
# Create a Series using a list
ser = pd.Series(['red', 'blue', 'green', 'yellow'])

# type(ser)
ser

#if no index is provided, will be normal index.

0       red
1      blue
2     green
3    yellow
dtype: object

In [30]:
# Create a Series using a list with pre-set index
ser = pd.Series(['red', 'blue', 'green', 'yellow'], index=['a', 'b', 'c', 'd'])

# type(ser)
ser

a       red
b      blue
c     green
d    yellow
dtype: object

In [31]:
ser.ndim 
#check the number of dimension in Series


1

In [32]:
ser.shape
#check the length of series
#the (4,' '),' 'will have number iif this is two dimensions

(4,)

In [33]:
ser.size # row * column,  4 * 1 in this example



4

In [34]:
ser = pd.Series(['red', 'blue', 'green', 'yellow'])

ser.index

#check the index of Series

RangeIndex(start=0, stop=4, step=1)

In [35]:
ser = pd.Series(['red', 'blue', 'green', 'yellow'], index=['a', 'b', 'c', 'd'])

ser.index

#check the index of Series

Index(['a', 'b', 'c', 'd'], dtype='object')

In [36]:
ser.describe()

#check the statistics

count       4
unique      4
top       red
freq        1
dtype: object

In [37]:
ser.value_counts() # value_counts


red       1
blue      1
green     1
yellow    1
dtype: int64

In [38]:
ser[3] # position


'yellow'

In [39]:
ser[1:] # slice

b      blue
c     green
d    yellow
dtype: object

# 2. DataFrame 

* 2D-table structure
* most frequently used stucture in pandas
* with ***columns*** & ***index***
        index ~ row no.
* index could be number or string
* Each Column = 1 Series

In [83]:
# creating dataframe (using dictionary)

df = pd.DataFrame({'name': ['Paul', 'John', 'Mary', 'Jane'],
                   'phone': ['92435678', '90909090', '09090909', '12435678'],
                   'gender': ['male', 'male', 'female', 'female'],
                   'age': [18, 22, 23, np.nan], #np.nan ~ None  ,  nan means not a number
                   'colA': ser},   #adding the series into DataFrame
                  index=['a', 'b', 'c', 'd'])

#dict = DataFrame
#keys = Columns name
#list = Columns value



df

Unnamed: 0,name,phone,gender,age,colA
a,Paul,92435678,male,18.0,red
b,John,90909090,male,22.0,blue
c,Mary,9090909,female,23.0,green
d,Jane,12435678,female,,yellow


In [41]:
type(df)


pandas.core.frame.DataFrame

In [42]:
df.ndim


2

In [43]:
df.shape


(4, 5)

In [44]:
df.size # row x col


20

In [45]:
df.columns


Index(['name', 'phone', 'gender', 'age', 'colA'], dtype='object')

In [46]:
df.index


Index(['a', 'b', 'c', 'd'], dtype='object')

In [47]:
df.set_index('name') # changing index to other column


Unnamed: 0_level_0,phone,gender,age,colA
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Paul,92435678,male,18.0,red
John,90909090,male,22.0,blue
Mary,9090909,female,23.0,green
Jane,12435678,female,,yellow


In [48]:
#To update df index:

#Method 1
#df.set_index('name', inplace=True) # changing index to other column

#Method 2
df = df.set_index('name')

In [49]:
df.set_index?
#if you don't know how to use set_index, add a '?' after

In [50]:
df.info()

#Check the number of non-null value(nan)

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Paul to Jane
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   phone   4 non-null      object 
 1   gender  4 non-null      object 
 2   age     3 non-null      float64
 3   colA    4 non-null      object 
dtypes: float64(1), object(3)
memory usage: 160.0+ bytes


In [51]:
df.describe()
#check all which can calculate count,mean,...


Unnamed: 0,age
count,3.0
mean,21.0
std,2.645751
min,18.0
25%,20.0
50%,22.0
75%,22.5
max,23.0


In [52]:
df.count()
#count non-null value(非空值)

phone     4
gender    4
age       3
colA      4
dtype: int64

In [53]:
df.shape[0]
#有幾多行

4

## <font color=red>Column</font>

to check the available function of the dataframe

type the name of a dataframe e.g.

df.         ->click tab

In [85]:
#call out the series (a single column), method 1
df.name # dot notation  
#if the name of the column = pre-set function,then cannot use this method

#'name' is the name of the column

a    Paul
b    John
c    Mary
d    Jane
Name: name, dtype: object

In [86]:
#call out the series (a single column), method 2 , better
df['name'] # index


a    Paul
b    John
c    Mary
d    Jane
Name: name, dtype: object

In [88]:
#type will be series
type(df['name']) # index


pandas.core.series.Series

In [87]:
df[['name','age']]  # select multiple column (DF), fancy index


Unnamed: 0,name,age
a,Paul,18.0
b,John,22.0
c,Mary,23.0
d,Jane,


In [89]:
type(df[['name','age']])  # because more than 1 series


pandas.core.frame.DataFrame

In [95]:
df['color'] = ser # add a column


In [60]:
#dot notation can be use as replace but not create a new column
#e.g.
df.color3 = ser

  df.color3 = ser


In [61]:
df.assign(color2=ser) # add a column using assign function



Unnamed: 0_level_0,phone,gender,age,colA,color,color2
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Paul,92435678,male,18.0,red,,
John,90909090,male,22.0,blue,,
Mary,9090909,female,23.0,green,,
Jane,12435678,female,,yellow,,


In [62]:
df

#just use assign will not save

Unnamed: 0_level_0,phone,gender,age,colA,color
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Paul,92435678,male,18.0,red,
John,90909090,male,22.0,blue,
Mary,9090909,female,23.0,green,
Jane,12435678,female,,yellow,


In [104]:
df = df.assign(color2=ser) # save the assign



In [64]:
df


Unnamed: 0_level_0,phone,gender,age,colA,color,color2
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Paul,92435678,male,18.0,red,,
John,90909090,male,22.0,blue,,
Mary,9090909,female,23.0,green,,
Jane,12435678,female,,yellow,,


In [66]:
#filter() - filter(column name /index name)
#df.filter(items=['name']) or df.filter(['name']) 
df.filter(['name']) # filter by columns for axis (dataframe) <-default

Paul
John
Mary
Jane


In [67]:
df.filter(['name','age'], axis='columns') # filter by columns (dataframe)



Unnamed: 0_level_0,age
name,Unnamed: 1_level_1
Paul,18.0
John,22.0
Mary,23.0
Jane,


In [105]:
df.filter(like='col')
#like= 'col'  means all column names contains 'col' this str

Unnamed: 0,colA,color,color2
a,red,red,red
b,blue,blue,blue
c,green,green,green
d,yellow,yellow,yellow


In [106]:
df.filter(regex='r$')  
#column name filter by regular expression(regex) end with 'r'

Unnamed: 0,gender,color
a,male,red
b,male,blue
c,female,green
d,female,yellow


In [107]:
df.filter(items=['a','b'],axis='index')

Unnamed: 0,name,phone,gender,age,colA,color,color2
a,Paul,92435678,male,20.0,red,red,red
b,John,90909090,male,24.0,blue,blue,blue


In [91]:
df.values
#type: numpy.ndarray

array([['Paul', '92435678', 'male', 18.0, 'red'],
       ['John', '90909090', 'male', 22.0, 'blue'],
       ['Mary', '09090909', 'female', 23.0, 'green'],
       ['Jane', '12435678', 'female', nan, 'yellow']], dtype=object)

In [96]:
df['color'][1] # series[pos]


'blue'

In [71]:
df.age.mean()


21.0

In [72]:
df.age.median()


22.0

In [73]:
df.age.max() # get max item


23.0

In [74]:
df.age.idxmax() # get index value of max item


'Mary'

In [100]:
df.age = df.age + 1 
# NaN will not add 1


In [101]:
df


Unnamed: 0,name,phone,gender,age,colA,color
a,Paul,92435678,male,20.0,red,red
b,John,90909090,male,24.0,blue,blue
c,Mary,9090909,female,25.0,green,green
d,Jane,12435678,female,,yellow,yellow


In [102]:
df.gender.value_counts()
# ~group by

male      2
female    2
Name: gender, dtype: int64

In [78]:
-(df.age)



name
Paul   -19.0
John   -23.0
Mary   -24.0
Jane     NaN
Name: age, dtype: float64

In [98]:
df.name.str.upper()

a    PAUL
b    JOHN
c    MARY
d    JANE
Name: name, dtype: object

# Update data

In [111]:
df.color2='unknown' # update can use dot notation but add new column cannot
df

Unnamed: 0,name,phone,gender,age,colA,color,color2
a,Paul,92435678,male,20.0,red,red,unknown
b,John,90909090,male,24.0,blue,blue,unknown
c,Mary,9090909,female,25.0,green,green,unknown
d,Jane,12435678,female,,yellow,yellow,unknown


In [113]:
df['color2']='-'
df

Unnamed: 0,name,phone,gender,age,colA,color,color2
a,Paul,92435678,male,20.0,red,red,-
b,John,90909090,male,24.0,blue,blue,-
c,Mary,9090909,female,25.0,green,green,-
d,Jane,12435678,female,,yellow,yellow,-


In [115]:
df['color2']=df['color']
df
#can use column to update a column

Unnamed: 0,name,phone,gender,age,colA,color,color2
a,Paul,92435678,male,20.0,red,red,red
b,John,90909090,male,24.0,blue,blue,blue
c,Mary,9090909,female,25.0,green,green,green
d,Jane,12435678,female,,yellow,yellow,yellow


In [118]:
df['color2']=df['color']+' color' #can append string
df

Unnamed: 0,name,phone,gender,age,colA,color,color2
a,Paul,92435678,male,20.0,red,red,red color
b,John,90909090,male,24.0,blue,blue,blue color
c,Mary,9090909,female,25.0,green,green,green color
d,Jane,12435678,female,,yellow,yellow,yellow color


In [119]:
df['colB']=df['colA'] * 2 #Update each item of colB
df


Unnamed: 0,name,phone,gender,age,colA,color,color2,colB
a,Paul,92435678,male,20.0,red,red,red color,redred
b,John,90909090,male,24.0,blue,blue,blue color,blueblue
c,Mary,9090909,female,25.0,green,green,green color,greengreen
d,Jane,12435678,female,,yellow,yellow,yellow color,yellowyellow


In [127]:
df.colB = df.colA *2
df

Unnamed: 0,name,phone,gender,age,colA,color,color2,colB
a,Paul,92435678,male,20.0,red,red,red color,redred
b,John,90909090,male,24.0,blue,blue,blue color,blueblue
c,Mary,9090909,female,25.0,green,green,green color,greengreen
d,Jane,12435678,female,,yellow,yellow,yellow color,yellowyellow


In [122]:
df.colB[1]
#this can get the value, but you cannot assign value to it
# as column b is a series

'blueblue'

In [125]:
df.colB[1] = 1
#cannot assign

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.colB[1] = 1


In [135]:
#no column name, no index name
tmpDf = df.copy()  #create a new copy of dataframe
tmpDf

Unnamed: 0,name,phone,gender,age,colA,color,color2,colB
a,Paul,92435678,male,20.0,red,red,red color,redred
b,John,90909090,male,24.0,blue,blue,blue color,blueblue
c,Mary,9090909,female,25.0,green,green,green color,greengreen
d,Jane,12435678,female,,yellow,yellow,yellow color,yellowyellow


In [136]:
#Use fancy index to get a view of Df
tmpDf[['phone','gender']] = 'X' 
#it will also update data of the original dataframe
tmpDf

Unnamed: 0,name,phone,gender,age,colA,color,color2,colB
a,Paul,X,X,20.0,red,red,red color,redred
b,John,X,X,24.0,blue,blue,blue color,blueblue
c,Mary,X,X,25.0,green,green,green color,greengreen
d,Jane,X,X,,yellow,yellow,yellow color,yellowyellow


In [137]:
# Use loc indexer (location/label based) to update data
# Syntax: df.loc[row(index),col]   #MUST USE LABEL
tmpDf.loc[:, ['colA','color2']] = 'red' # : means all rows
tmpDf

Unnamed: 0,name,phone,gender,age,colA,color,color2,colB
a,Paul,X,X,20.0,red,red,red,redred
b,John,X,X,24.0,red,blue,red,blueblue
c,Mary,X,X,25.0,red,green,red,greengreen
d,Jane,X,X,,red,yellow,red,yellowyellow


In [138]:
#Use iloc indexer (position based) to update data
# Syntax: df.iloc[row(index),col]  #MUST USE POSITION
tmpDf.iloc[:, 4]= 'blue'
tmpDf

Unnamed: 0,name,phone,gender,age,colA,color,color2,colB
a,Paul,X,X,20.0,blue,red,red,redred
b,John,X,X,24.0,blue,blue,red,blueblue
c,Mary,X,X,25.0,blue,green,red,greengreen
d,Jane,X,X,,blue,yellow,red,yellowyellow


In [None]:
# Why they can do it? they return series or dataframe

## Column.str functions

- https://padas.pydata.org/pandas-docs/stable/user_guide/text.hktml#string-methods


In [139]:
df.name.str.upper()


a    PAUL
b    JOHN
c    MARY
d    JANE
Name: name, dtype: object

In [140]:
df.gender.str.count('e')  #letter e occurance

a    1
b    1
c    2
d    2
Name: gender, dtype: int64

In [141]:
df.phone.str.count('^90')  #Start with 90

a    0
b    1
c    0
d    0
Name: phone, dtype: int64

In [142]:
df.gender.str.len()

a    4
b    4
c    6
d    6
Name: gender, dtype: int64

In [144]:
df.name.str.split('a') #return a series

a    [P, ul]
b     [John]
c    [M, ry]
d    [J, ne]
Name: name, dtype: object

In [145]:
df.name.str.split('a', expand=True) #expanded to a data frame

Unnamed: 0,0,1
a,P,ul
b,John,
c,M,ry
d,J,ne


In [146]:
df.name.str.contains('^P')

a     True
b    False
c    False
d    False
Name: name, dtype: bool

In [147]:
df.name.str.startswith('P')

a     True
b    False
c    False
d    False
Name: name, dtype: bool

In [148]:
df.name.str.endswith('e')

a    False
b    False
c    False
d     True
Name: name, dtype: bool

In [153]:
df.phone.str.replace('90','Ninty')

a                92435678
b    NintyNintyNintyNinty
c       0NintyNintyNinty9
d                12435678
Name: phone, dtype: object

In [151]:
# most are same as python's native string functions
'90909090'.replace('90','Ninty')


'NintyNintyNintyNinty'

In [152]:
'90909090'.count('90')

4

In [160]:
# not str method
import re
re.findall('^9','90909090')

['9']

In [157]:
#str.findall(re) - return a series
df.phone.str.findall('^9')


a    [9]
b    [9]
c     []
d     []
Name: phone, dtype: object

In [158]:
df.phone.str.findall('90')


a                  []
b    [90, 90, 90, 90]
c        [90, 90, 90]
d                  []
Name: phone, dtype: object

In [159]:
df.phone.str.findall('\d\d\d\d') # 4-digit group

a    [9243, 5678]
b    [9090, 9090]
c    [0909, 0909]
d    [1243, 5678]
Name: phone, dtype: object

## Replace data - replace()
- series
- data frame

### Series

In [162]:
df.name.replace('Paul','Mr. Chan',inplace=False)
#replace a string (exact match)


a    Mr. Chan
b        John
c        Mary
d        Jane
Name: name, dtype: object

In [166]:
df.gender.replace({'male':'M','female':'F'})
#replace wih value in dictionary

a    M
b    M
c    F
d    F
Name: gender, dtype: object

In [167]:
df.name.replace(['Paul','John'], '****')

a    ****
b    ****
c    Mary
d    Jane
Name: name, dtype: object

In [168]:
df.name.replace('a','@', regex=True) 
#regular expression -series example

a    P@ul
b    John
c    M@ry
d    J@ne
Name: name, dtype: object

In [169]:
df.name.str.replace('a','@') 
#str.replace equivalent

a    P@ul
b    John
c    M@ry
d    J@ne
Name: name, dtype: object

In [170]:
df.name.replace('^Pa','Mr. P@', regex=True) #regular expression

a    Mr. P@ul
b        John
c        Mary
d        Jane
Name: name, dtype: object

In [172]:
df.name.replace(regex='^Pa',value='Mr. P@') 
#regular expression = to replace

a    Mr. P@ul
b        John
c        Mary
d        Jane
Name: name, dtype: object

In [173]:
df.name.replace(regex={'^Pa':'Mr. P@', 'ry':'rY'}) 
#regular expression = dict with regex & value

a    Mr. P@ul
b        John
c        MarY
d        Jane
Name: name, dtype: object

In [174]:
df,phone.replace(92435678, 99998888) 
#Not working , df.phone.dtype is Object

NameError: name 'phone' is not defined

In [176]:
df.phone.astype(int).replace(92435678, 99998888) 

a    99998888
b    90909090
c     9090909
d    12435678
Name: phone, dtype: int32

In [177]:
df.age.replace(np.nan, df.age.mean())
#can also use fillna(0)

a    20.0
b    24.0
c    25.0
d    23.0
Name: age, dtype: float64

### DataFrame - replace()

In [178]:
df.replace('a','@', regex=True) #regular expression

Unnamed: 0,name,phone,gender,age,colA,color,color2,colB
a,P@ul,92435678,m@le,20.0,red,red,red color,redred
b,John,90909090,m@le,24.0,blue,blue,blue color,blueblue
c,M@ry,9090909,fem@le,25.0,green,green,green color,greengreen
d,J@ne,12435678,fem@le,,yellow,yellow,yellow color,yellowyellow


In [179]:
df.replace(['Paul','John','male'], '****') 
#list example (for all columns)

Unnamed: 0,name,phone,gender,age,colA,color,color2,colB
a,****,92435678,****,20.0,red,red,red color,redred
b,****,90909090,****,24.0,blue,blue,blue color,blueblue
c,Mary,9090909,female,25.0,green,green,green color,greengreen
d,Jane,12435678,female,,yellow,yellow,yellow color,yellowyellow


In [180]:
#key of dictionary is now column name!
df.replace({'name':'Paul','gender':'male'},value='xxxx')
#replace name=X or gender=Y with a value

Unnamed: 0,name,phone,gender,age,colA,color,color2,colB
a,xxxx,92435678,xxxx,20.0,red,red,red color,redred
b,John,90909090,xxxx,24.0,blue,blue,blue color,blueblue
c,Mary,9090909,female,25.0,green,green,green color,greengreen
d,Jane,12435678,female,,yellow,yellow,yellow color,yellowyellow


In [184]:
df.replace({'name':'Paul','gender':'male'},value='xxxx', regex = True)
# Now X and Y are regular expression, 
#can see 'male' inside  'female' is replaced


Unnamed: 0,name,phone,gender,age,colA,color,color2,colB
a,xxxx,92435678,xxxx,20.0,red,red,red color,redred
b,John,90909090,xxxx,24.0,blue,blue,blue color,blueblue
c,Mary,9090909,fexxxx,25.0,green,green,green color,greengreen
d,Jane,12435678,fexxxx,,yellow,yellow,yellow color,yellowyellow


In [183]:
df.replace({'a':'@','l':'!'},regex=True)
#if value is None / omitted, dict-key is to_replace,
#

Unnamed: 0,name,phone,gender,age,colA,color,color2,colB
a,P@u!,92435678,m@!e,20.0,red,red,red co!or,redred
b,John,90909090,m@!e,24.0,b!ue,b!ue,b!ue co!or,b!ueb!ue
c,M@ry,9090909,fem@!e,25.0,green,green,green co!or,greengreen
d,J@ne,12435678,fem@!e,,ye!!ow,ye!!ow,ye!!ow co!or,ye!!owye!!ow


#### Use of Map function to update Column data

In [150]:
df.name.map(lambda x: 'person-' + x)


a    person-Paul
b    person-John
c    person-Mary
d    person-Jane
Name: name, dtype: object

In [81]:
def changeName(name):
     name = 'person-' + name + '-001'
     return name


In [None]:
df.name = df.name.map(changeName)

###### Drop a row / column

In [None]:
df.drop('d')  # drop a row with a index value


In [None]:
df.drop(['a','d']) # drop multiple columns


In [None]:
df.drop('colA', axis='columns') # drop a column with a column name


In [None]:
df.drop(['phone','colA'], axis=1)


## <font color=red>Rows</font>

In [None]:
df[0] # WARNING: Position is not working!!! (it's for col name)


In [None]:
df['a'] # WARNING: Index value is not working!!! (please use loc indexer)



In [None]:
# slice operation (rows)
df[:] # all rows



In [None]:
df[:2] # row 0 to 1 


In [None]:
df[::-1] # reverse order


In [None]:
# Filter()
# Axis 0: rows / index
# Axis 1: columns
df.filter('a', axis='index')
# df.filter('a',axis=0)
# df.filter(['a','b'],axis=0)


In [None]:
df.filter('a',axis=0)


In [None]:
df.filter(['a','b'],axis=0)


### <font color=blue>**Indexer**</font>

+ loc (label)
+ iloc (position)

#### **df.loc indexer (label)**

In [None]:
# 1. LOC indexer: 
# SYNTAX: df.loc[row, col]

# df.loc['a'] # get 1 row (series)
# df.loc[['a','b']] # get multiple rows
# df.loc[:] # slice
# df.loc['a':'c'] # slice (including c)
# df.loc[['b','b', 'a','a']] # fancy index

# df.loc[:, 'name']
# df.loc[:, ['name', 'age']] # all rows, with name & age column

# BE CAREFUL!!!
# if index is not numbers, you cannot use df.loc[0]
# df.loc[0] # KeyError: no such index value


#### **df.iloc indexer (position)**

In [None]:
# 1. ILOC indexer: 
# SYNTAX: df.iloc[row, col]

# df.iloc[0] # first row
# df.iloc[:] # slice: all rows
# df.iloc[0:1] # 0 to 1 (not include 1)

# df.iloc[:, :] # all rows, all columns
# df.iloc[:, 0] # all rows, first column
# df.iloc[:, [-1]] # all rows, last column
# df.iloc[:, 1]

# BE CAREFUL!!!
# Since it is based on POSITION, you cannot use col labels
# df.iloc[:, 'age'] # not working
# df.iloc[:, ['age']]  # not working


### <font color=Green>**Boolean Mask**</font>

In [None]:
mask = df.age > 20
# ~mask # negate it

# df[mask] # retrieve the results
df.loc[mask] # retrieve the results

# BE CAREFUL!!!
# df.iloc[mask] # iloc indexer cannot use boolean mask!!!

### <font color=Green>**Query()**</font>

In [None]:
# df.query('age == 22')
# df.query('age > 20')
# df.query('name=="Paul" & age>10')

# Using Python str functions
# df.query('name.str.contains("J")', engine='python')
# df.query('name.str.upper()=="PAUL"', engine='python')

# BE CAREFUL!!!
# df.query('name.str.upper().contains("P")', engine='python') # NOT WORKING
# df.query('name.str.upper().str.contains("P")', engine='python') # WORKING


### Missing Values 

In [None]:
# handing NA values
# which element is null

# df.loc[df.age.isna()]

# df[df.age.isnull()] 
# df[df.age.isna()] 
# df[~df.age.isnull()]  # is not NA
# df[df.age.notnull()]  
# df[df.age.notna()]  

df.age.fillna(df.age.mean(), inplace=True) # fill the values (inplace=True if you want to save it)


## <font color=blue>Groupby</font>

##### * divide some data input groups (e.g. gender)

In [None]:
df

In [None]:
# df.groupby('gender').mean()
# df.groupby('gender').age.mean()
# df.groupby('gender')['age'].mean()

In [None]:
# group by multiple columns
df.groupby(['gender', 'name']).mean()

In [None]:
df.groupby(['gender', 'name']).mean().unstack()

In [None]:
# pivot table function
# df.pivot_table('age', index='gender', columns='name')
df.pivot_table('age', index='gender', columns='name', aggfunc='mean') # aggfunc other than mean


## <font color=blue>Sorting</font>

In [None]:
# df.sort_values(by='age', ascending=False)
df.sort_values(by=['gender', 'age'], ascending=[False, True])