In [1]:
import pandas as pd 
import numpy as np

In [3]:
# Sample ticket-sales figures for three acts; np.nan marks a missing figure.
the_who_sales = [np.nan, 25000, np.nan, 42000]   # large arena/stadium sales
witches_sales = [np.nan, 1500, 2200, 2800]       # theater/club-level sales
goose_sales = [3200, 5000, 7200, 8800]           # large theaters and small arenas

# Column order follows insertion order of the dict.
data = {
    'The Who': the_who_sales,
    'All Them Witches': witches_sales,
    'Goose': goose_sales,
}
df = pd.DataFrame(data)
df

Unnamed: 0,The Who,All Them Witches,Goose
0,,,3200
1,25000.0,1500.0,5000
2,,2200.0,7200
3,42000.0,2800.0,8800


In [4]:
# isnull() on a column returns a boolean Series:
# True where the value is null, False where a value is present.
df['The Who'].isnull()

0     True
1    False
2     True
3    False
Name: The Who, dtype: bool

In [5]:
# The same check applied to the whole DataFrame: one True/False flag per cell.
df.isnull()

Unnamed: 0,The Who,All Them Witches,Goose
0,True,True,False
1,False,False,False
2,True,False,False
3,False,False,False


In [6]:
# isna() produces the same boolean Series for this column as isnull() does.
df['The Who'].isna()

0     True
1    False
2     True
3    False
Name: The Who, dtype: bool

In [7]:
# DataFrame-wide isna(): the result matches df.isnull().
df.isna()

Unnamed: 0,The Who,All Them Witches,Goose
0,True,True,False
1,False,False,False
2,True,False,False
3,False,False,False


In [None]:
# notna() is the inverse check: a null shows up as False, a non-null value shows up as True.
df.notna()

Unnamed: 0,The Who,All Them Witches,Goose
0,False,False,True
1,True,True,True
2,False,True,True
3,True,True,True


In [9]:
# Count of missing values per column:
# summing the boolean flags counts the True (null) entries in each column.
df.isnull().sum()

The Who             2
All Them Witches    1
Goose               0
dtype: int64

In [10]:
# Number of missing values in a single column.
df['The Who'].isnull().sum()

np.int64(2)

In [11]:
# info() prints a summary that includes the Non-Null Count and dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   The Who           2 non-null      float64
 1   All Them Witches  3 non-null      float64
 2   Goose             4 non-null      int64  
dtypes: float64(2), int64(1)
memory usage: 228.0 bytes


In [12]:
# size vs count:
#   size  - number of elements in the Series, null values included
#   count - number of non-null elements only
df['The Who'].size

4

In [13]:
# count() leaves out the two nulls in this column, so it returns 2 where size returns 4.
df['The Who'].count()

np.int64(2)

In [14]:
# Filter out null values: keep only the rows where 'All Them Witches' is NOT null.
df[df['All Them Witches'].notna()]

Unnamed: 0,The Who,All Them Witches,Goose
1,25000.0,1500.0,5000
2,,2200.0,7200
3,42000.0,2800.0,8800


In [15]:
# Filter on multiple null conditions: 'The Who' is null AND 'All Them Witches' is not null.
# Each condition is wrapped in parentheses and the two are combined element-wise with &.
df[(df['The Who'].isna()) & (df['All Them Witches'].notna())]

Unnamed: 0,The Who,All Them Witches,Goose
2,,2200.0,7200


In [16]:
# Combine conditions with | (OR): rows where 'The Who' is null or 'Goose' is below 5000.
df[(df['The Who'].isna()) | (df['Goose'] < 5000)]

Unnamed: 0,The Who,All Them Witches,Goose
0,,,3200
2,,2200.0,7200


In [17]:
# Sorting with null values: na_position='first' puts the null rows at the top
# of the result; the remaining rows are sorted in ascending order.
df.sort_values(by='The Who', na_position='first')

Unnamed: 0,The Who,All Them Witches,Goose
0,,,3200
2,,2200.0,7200
1,25000.0,1500.0,5000
3,42000.0,2800.0,8800


In [18]:
# Descending sort; na_position='last' keeps the null rows at the bottom of the result.
df.sort_values(by='The Who', ascending=False, na_position='last')

Unnamed: 0,The Who,All Them Witches,Goose
3,42000.0,2800.0,8800
1,25000.0,1500.0,5000
0,,,3200
2,,2200.0,7200


In [20]:
# Remove rows with null values: dropna() with no arguments drops every row
# that contains at least one null. The result is stored in a new variable.
df_remove_row = df.dropna()
df_remove_row

Unnamed: 0,The Who,All Them Witches,Goose
1,25000.0,1500.0,5000
3,42000.0,2800.0,8800


In [22]:
# Drop rows that have fewer than 2 non-null values.
# thresh is the minimum number of NON-null values a row must have to be kept;
# it is not a count of nulls. Row 0 has a single non-null value (Goose), so it is dropped,
# while row 2 has two non-null values and is kept.
df_remove_row_2 = df.dropna(thresh=2)
df_remove_row_2


Unnamed: 0,The Who,All Them Witches,Goose
1,25000.0,1500.0,5000
2,,2200.0,7200
3,42000.0,2800.0,8800


In [23]:
# Remove columns with null values: axis=1 drops every column that contains at least one null.
# Only 'Goose' has no nulls, so it is the only column left.
df_remove_column = df.dropna(axis=1)
df_remove_column

Unnamed: 0,Goose
0,3200
1,5000
2,7200
3,8800


In [25]:
# Fill null values with an integer: every null becomes 0.
# The filled columns still display as floats (0.0) in the output.
df_filled = df.fillna(0)
df_filled


Unnamed: 0,The Who,All Them Witches,Goose
0,0.0,0.0,3200
1,25000.0,1500.0,5000
2,0.0,2200.0,7200
3,42000.0,2800.0,8800


In [26]:
# Fill Null Values With Strings
# Line-ups for two venues; np.nan marks a slot with no band recorded.
st_augustine_acts = ['Billy Strings', np.nan, 'The Smile', 'Gojira', 'Mastodon']
hard_rock_acts = ['City and Colour', 'Toto', 'Mastodon', np.nan, 'Dream Theater']

venue_list = {
    'St Augustine Amp': st_augustine_acts,
    'Hard Rock Orlando': hard_rock_acts,
}
venue_df = pd.DataFrame(venue_list)
venue_df

Unnamed: 0,St Augustine Amp,Hard Rock Orlando
0,Billy Strings,City and Colour
1,,Toto
2,The Smile,Mastodon
3,Gojira,
4,Mastodon,Dream Theater


In [28]:
# Replace every null in venue_df with a placeholder string.
df_filled2 = venue_df.fillna('Unknown Band')
df_filled2

Unnamed: 0,St Augustine Amp,Hard Rock Orlando
0,Billy Strings,City and Colour
1,Unknown Band,Toto
2,The Smile,Mastodon
3,Gojira,Unknown Band
4,Mastodon,Dream Theater


In [None]:
# Forward Fill
# Forward fill populates a null value with the last non-null value above it in the same column.
# A null in the first row has nothing above it, so it stays null.
df_ffill = df.ffill()
df_ffill

Unnamed: 0,The Who,All Them Witches,Goose
0,,,3200
1,25000.0,1500.0,5000
2,25000.0,2200.0,7200
3,42000.0,2800.0,8800


In [30]:
# Backward Fill
# Backward fill populates a null value with the next non-null value below it in the same column.
df_bfill = df.bfill()
df_bfill

Unnamed: 0,The Who,All Them Witches,Goose
0,25000.0,1500.0,3200
1,25000.0,1500.0,5000
2,42000.0,2200.0,7200
3,42000.0,2800.0,8800


In [31]:
# GroupBy With Forward Fill
# Forward fill runs inside each team, so a gap is filled from the same team's
# previous row and a value is never carried over from another team.
teams = ['Yankees'] * 2 + ['Dodgers'] * 3
averages = [0.310, np.nan, 0.280, np.nan, 0.290]
df_baseball = pd.DataFrame({'team': teams, 'batting_avg': averages})

avg_by_team = df_baseball.groupby('team')['batting_avg']
df_baseball['batting_avg'] = avg_by_team.ffill()

df_baseball

Unnamed: 0,team,batting_avg
0,Yankees,0.31
1,Yankees,0.31
2,Dodgers,0.28
3,Dodgers,0.28
4,Dodgers,0.29


In [33]:
# Fill With Statistics
# Work on a copy so the original df keeps its nulls.
df_fill_na = df.copy()

# Compute each fill value once from the non-null entries of its column.
# (A bare expression in the middle of a cell is evaluated and discarded, never
# displayed, so the values are kept in named variables instead.)
the_who_mean = df_fill_na['The Who'].mean()
witches_median = df_fill_na['All Them Witches'].median()

# Nulls in 'The Who' get the column mean; nulls in 'All Them Witches' get the column median.
df_fill_na['The Who'] = df_fill_na['The Who'].fillna(the_who_mean)
df_fill_na['All Them Witches'] = df_fill_na['All Them Witches'].fillna(witches_median)

df_fill_na

Unnamed: 0,The Who,All Them Witches,Goose
0,33500.0,2200.0,3200
1,25000.0,1500.0,5000
2,33500.0,2200.0,7200
3,42000.0,2800.0,8800


In [35]:
# Interpolation
# Linear interpolation down each column (axis=0): a null between two known values is
# filled with an evenly spaced value (row 2 of 'The Who' becomes 33500).
# The nulls in row 0 have no earlier value to interpolate from, so they stay null.
df_interpolated = df.interpolate(method='linear', axis=0)
df_interpolated


Unnamed: 0,The Who,All Them Witches,Goose
0,,,3200
1,25000.0,1500.0,5000
2,33500.0,2200.0,7200
3,42000.0,2800.0,8800


In [37]:
# Interpolation with backward fill: interpolate first, then bfill() fills the
# leading nulls in row 0 that interpolation leaves behind.
df_interpolated_2 = df.interpolate(method='linear', axis=0).bfill()
df_interpolated_2

Unnamed: 0,The Who,All Them Witches,Goose
0,25000.0,1500.0,3200
1,25000.0,1500.0,5000
2,33500.0,2200.0,7200
3,42000.0,2800.0,8800


In [39]:
# Replace Function
# replace(np.nan, 0) swaps every NaN for 0.
df_replaced_1 = df.replace(np.nan, 0)
df_replaced_1


Unnamed: 0,The Who,All Them Witches,Goose
0,0.0,0.0,3200
1,25000.0,1500.0,5000
2,0.0,2200.0,7200
3,42000.0,2800.0,8800


In [40]:
# A dict maps several old values to new ones in one call: NaN -> 0 and 5000 -> 5500.
df_replaced_2 = df.replace({np.nan: 0, 5000: 5500})
df_replaced_2

Unnamed: 0,The Who,All Them Witches,Goose
0,0.0,0.0,3200
1,25000.0,1500.0,5500
2,0.0,2200.0,7200
3,42000.0,2800.0,8800


In [None]:
# Mask
# mask(cond, other) replaces the values where the condition is True with `other`;
# here every null cell becomes 0.
df_masked_1 = df.mask(df.isnull(), 0)
df_masked_1

Unnamed: 0,The Who,All Them Witches,Goose
0,0.0,0.0,3200
1,25000.0,1500.0,5000
2,0.0,2200.0,7200
3,42000.0,2800.0,8800


In [42]:
# mask() with a value-based condition: any cell below 0 would be replaced with NaN.
# None of the values in df are negative, so the result here matches df.
df_masked_2 = df.mask(df < 0, np.nan)
df_masked_2

Unnamed: 0,The Who,All Them Witches,Goose
0,,,3200
1,25000.0,1500.0,5000
2,,2200.0,7200
3,42000.0,2800.0,8800


In [43]:
# Where
# where(cond, other) keeps the values where the condition is True and replaces the rest
# with `other`; here non-null cells are kept and null cells become 0.
df_where_1 = df.where(df.notnull(), 0)
df_where_1

Unnamed: 0,The Who,All Them Witches,Goose
0,0.0,0.0,3200
1,25000.0,1500.0,5000
2,0.0,2200.0,7200
3,42000.0,2800.0,8800


In [None]:
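# Custom Logic With Fillna
# Example (left commented out; df_logic is not defined in the cells shown, e.g. df_logic = df.copy()):
# fill the nulls with the column median when that median is above 500, otherwise fill with 0.
# df_logic['The Who'] = df_logic['The Who'].fillna(
#     df_logic['The Who'].median() if df_logic['The Who'].median() > 500 else 0
# )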