`isnull` (alias `isna`) detects missing values (`NaN`) in a dataset; it only flags them. To remove or fill them, use `dropna` or `fillna`.

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# Demo frame with missing values, written row by row so the literal
# mirrors the rendered table below.
df = pd.DataFrame(
    [
        [1, np.nan, np.nan],
        [np.nan, 2, 2],
        [7, 3, np.nan],
    ],
    columns=['Col A', 'Col B', 'Col C'],
)
df

Unnamed: 0,Col A,Col B,Col C
0,1.0,,
1,,2.0,2.0
2,7.0,3.0,


In [5]:
pd.isnull(df)

Unnamed: 0,Col A,Col B,Col C
0,False,True,True
1,True,False,False
2,False,False,True


In [8]:
pd.notnull(df).sum()

Col A    2
Col B    2
Col C    1
dtype: int64

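To select the non-null values, index with the boolean mask,
for example `df[pd.notnull(df)]`. On a DataFrame this keeps the original shape, so the null cells still show up as NaN.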

In [12]:
df[pd.notnull(df)]

Unnamed: 0,Col A,Col B,Col C
0,1.0,,
1,,2.0,2.0
2,7.0,3.0,


In [None]:
df.isnull() # shorter way

Unnamed: 0,Col A,Col B,Col C
0,False,True,True
1,True,False,False
2,False,False,True


In [15]:
df.dropna()

Unnamed: 0,Col A,Col B,Col C


In [16]:
df[df.notnull()]

Unnamed: 0,Col A,Col B,Col C
0,1.0,,
1,,2.0,2.0
2,7.0,3.0,


In [17]:
df.dropna()

Unnamed: 0,Col A,Col B,Col C


For a DataFrame, `dropna` by default drops every row that contains at least one NA value, whereas for a Series it drops only the NA entries themselves.

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Col A   2 non-null      float64
 1   Col B   2 non-null      float64
 2   Col C   1 non-null      float64
dtypes: float64(3)
memory usage: 200.0 bytes


In [20]:
df.dropna(axis = 1)  # drop every column that contains at least one null value (here all three, so only the index is left)


0
1
2


In [21]:
df

Unnamed: 0,Col A,Col B,Col C
0,1.0,,
1,,2.0,2.0
2,7.0,3.0,


In [23]:
df.fillna({'Col A':5, 'Col B': 0, 'Col C':6})

Unnamed: 0,Col A,Col B,Col C
0,1.0,0.0,6.0
1,5.0,2.0,2.0
2,7.0,3.0,6.0


In [24]:
# Forward-fill down each column: a NaN takes the last valid value above it.
# Leading NaNs have nothing to copy from and stay NaN.
# fillna(method=...) is deprecated since pandas 2.1; ffill() is the replacement.
df.ffill(axis=0)

Unnamed: 0,Col A,Col B,Col C
0,1.0,,
1,1.0,2.0,2.0
2,7.0,3.0,2.0


In [25]:
# Forward-fill across each row, left to right: a NaN takes the last valid
# value to its left. A NaN in the first column stays NaN.
# fillna(method=...) is deprecated since pandas 2.1; ffill() is the replacement.
df.ffill(axis=1)

Unnamed: 0,Col A,Col B,Col C
0,1.0,1.0,1.0
1,,2.0,2.0
2,7.0,3.0,3.0


In [26]:
# Demo frame with invalid entries: 'D' and '?' are not valid 'Sex' codes.
# One (Sex, Age) record per row.
df = pd.DataFrame(
    [
        ('M', 21),
        ('F', 25),
        ('F', 34),
        ('D', 70),
        ('?', 41),
    ],
    columns=['Sex', 'Age'],
)
df

Unnamed: 0,Sex,Age
0,M,21
1,F,25
2,F,34
3,D,70
4,?,41


In [27]:
df['Sex'].value_counts()

F    2
M    1
D    1
?    1
Name: Sex, dtype: int64

In [28]:
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [29]:
# Each key is replaced by its value. 'N' does not occur in this column, so
# only 'D' -> 'F' has an effect and '?' is left untouched.
df['Sex'].replace({'D':'F', 'N':'M'})

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [30]:
# Nested form: the outer keys name the column, each inner dict maps
# old value -> new value within that column only.
# The result is a new frame; it is not assigned, so df itself is unchanged.
df.replace({
    'Sex': {
        'D':'F',
        'N':'M'
    },
    'Age': {
        70:27,
    }
})

Unnamed: 0,Sex,Age
0,M,21
1,F,25
2,F,34
3,F,27
4,?,41


In [35]:
# Divide every age above 40 by 10 (70 -> 7.0, 41 -> 4.1).
# Cast to float first: writing fractional values into an int64 column relies
# on implicit upcasting, which is deprecated since pandas 2.1.
df['Age'] = df['Age'].astype(float)
df.loc[df['Age'] > 40, 'Age'] /= 10
df

Unnamed: 0,Sex,Age
0,M,21.0
1,F,25.0
2,F,34.0
3,D,7.0
4,?,4.1


In [36]:
# Ambassador name -> country represented. The keys become the index and
# the values become the data, in the order written here.
ambassadors = pd.Series({
    'Gérard Araud': 'France',
    'Kim Darroch': 'United Kingdom',
    'Peter Westmacott': 'United Kingdom',
    'Armando Varricchio': 'Italy',
    'Peter Wittig': 'Germany',
    'Peter Ammon': 'Germany',
    # NOTE(review): this label ends with a space — looks unintentional, confirm.
    'Klaus Scharioth ': 'Germany',
})
ambassadors

Gérard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

The `duplicated` method scans from top to bottom.
It treats the first occurrence of a value as the original and flags every later occurrence as a duplicate; pass `keep='last'` to treat the last occurrence as the original instead.


In [37]:
ambassadors.duplicated()

Gérard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [38]:
ambassadors.duplicated(keep='last')

Gérard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [39]:
ambassadors.duplicated(keep=False)

Gérard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [40]:
ambassadors.drop_duplicates()

Gérard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [41]:
ambassadors

Gérard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [42]:
ambassadors.drop_duplicates(keep='last')

Gérard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

`keep=False` marks every occurrence of a repeated value as a duplicate, so `drop_duplicates(keep=False)` removes all of them.

In [43]:
ambassadors.drop_duplicates(keep=False)

Gérard Araud          France
Armando Varricchio     Italy
dtype: object

In [44]:
# One (Name, Pos) record per player entry. Rows 0 and 2 are exact copies;
# row 4 repeats the name with a different position.
players = pd.DataFrame(
    [
        ('Kobe Bryant', 'SG'),
        ('LeBron James', 'SF'),
        ('Kobe Bryant', 'SG'),
        ('Carmelo Anthony', 'SF'),
        ('Kobe Bryant', 'SF'),
    ],
    columns=['Name', 'Pos'],
)
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [46]:
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

For a DataFrame, `duplicated` flags a row only when it exactly matches an earlier row across all columns; to compare on specific columns only, pass them via `subset`.

In [48]:
players.duplicated(subset = ['Name'])

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [49]:
players.duplicated(subset = ['Name'], keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [50]:
players.duplicated(subset = ['Pos'])

0    False
1    False
2     True
3     True
4     True
dtype: bool

In [51]:
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [52]:
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [54]:
# Raw underscore-delimited records with deliberate noise (stray spaces and
# '?' characters) to be cleaned with the .str accessor.
df = pd.Series(
    [
        '1987_M_US _1',
        '1990?_M_UK_1',
        '1992_F_US_2',
        '1970?_M_   IT_1',
        '1985_F_I  T_2',
    ],
    name='Data',
).to_frame()
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [55]:
df['Data'].str.split('_')

0       [1987, M, US , 1]
1       [1990?, M, UK, 1]
2        [1992, F, US, 2]
3    [1970?, M,    IT, 1]
4      [1985, F, I  T, 2]
Name: Data, dtype: object