# Handling missing values
- isnull(), dropna(), and fillna()

In [1]:
import numpy as np 
import pandas as pd

In [2]:
np.random.seed(seed=1)
df = np.random.choice([1, 2, 3], size=7)
df

array([2, 1, 1, 2, 2, 1, 1])

In [3]:
np.random.seed(seed=1)
df = pd.DataFrame(data= {"A": np.random.choice([1, 2, 3, np.nan], size= 3),
                         "B": np.random.choice([80, np.nan], size= 3),
                         "C": np.random.choice([100, 500], size= 3)

})
df

Unnamed: 0,A,B,C
0,2.0,80.0,500
1,,,500
2,1.0,,500


In [4]:
df_2 = pd.DataFrame(data= df.isnull().any(axis=0))
df_2

Unnamed: 0,0
A,True
B,True
C,False


In [5]:
df.isnull().sum()

A    1
B    2
C    0
dtype: int64

In [6]:
df.dropna()

Unnamed: 0,A,B,C
0,2.0,80.0,500


In [7]:
df

Unnamed: 0,A,B,C
0,2.0,80.0,500
1,,,500
2,1.0,,500


## .dropna()

In [8]:
np.random.seed(1)
df = pd.DataFrame(data= {"A": np.random.choice([45, 99, np.nan], size = 4),
                         "B": np.random.choice([25, np.nan], size= 4),
                         "C": np.random.choice([55, 100], size=4)
    })
df

Unnamed: 0,A,B,C
0,99.0,,55
1,45.0,,100
2,45.0,,55
3,99.0,25.0,100


In [9]:
df.isnull()

Unnamed: 0,A,B,C
0,False,True,False
1,False,True,False
2,False,True,False
3,False,False,False


In [10]:
df.isnull().sum()

A    0
B    3
C    0
dtype: int64

In [11]:
df

Unnamed: 0,A,B,C
0,99.0,,55
1,45.0,,100
2,45.0,,55
3,99.0,25.0,100


In [12]:
df.dropna().sum()

A     99.0
B     25.0
C    100.0
dtype: float64

In [13]:
df

Unnamed: 0,A,B,C
0,99.0,,55
1,45.0,,100
2,45.0,,55
3,99.0,25.0,100


In [14]:
df.dropna(axis=1)
df

Unnamed: 0,A,B,C
0,99.0,,55
1,45.0,,100
2,45.0,,55
3,99.0,25.0,100


In [15]:
df

Unnamed: 0,A,B,C
0,99.0,,55
1,45.0,,100
2,45.0,,55
3,99.0,25.0,100


In [16]:
df.dropna(thresh=3)

Unnamed: 0,A,B,C
3,99.0,25.0,100


In [17]:
df

Unnamed: 0,A,B,C
0,99.0,,55
1,45.0,,100
2,45.0,,55
3,99.0,25.0,100


In [18]:
df.dropna(subset=["B"])

Unnamed: 0,A,B,C
3,99.0,25.0,100


In [19]:
df

Unnamed: 0,A,B,C
0,99.0,,55
1,45.0,,100
2,45.0,,55
3,99.0,25.0,100


In [20]:
df.shape

(4, 3)

In [21]:
df.fillna(value="Constant_Value")

Unnamed: 0,A,B,C
0,99.0,Constant_Value,55
1,45.0,Constant_Value,100
2,45.0,Constant_Value,55
3,99.0,25.0,100


In [22]:
df.fillna(value="Constant_value").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       4 non-null      float64
 1   B       4 non-null      object 
 2   C       4 non-null      int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 228.0+ bytes


In [23]:
df.fillna(value=df.median())

Unnamed: 0,A,B,C
0,99.0,25.0,55
1,45.0,25.0,100
2,45.0,25.0,55
3,99.0,25.0,100


In [24]:
df["B"].fillna(value=df["A"].mean(), inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["B"].fillna(value=df["A"].mean(), inplace=True)


Unnamed: 0,A,B,C
0,99.0,72.0,55
1,45.0,72.0,100
2,45.0,72.0,55
3,99.0,25.0,100


In [25]:
df ["B"] = df.fillna(["C"].mean())
df

AttributeError: 'list' object has no attribute 'mean'

 - Task: Use the mean level to fill in the missing data for '`CPI`' and fill using `558480` for the '`GDP (£ m)`' variable. Do your exercise in 2 lines of code, using `inplace=True` when appropriate. Your third line should be `df_practice`, so you can check a snapshot of your DataFrame and see, particularly on rows with index 80-83, where both variables we filled in

In [None]:
df = pd.read_excel("Copy of UK MacroData.xlsx").iloc[76:88]
df

Unnamed: 0,Date,GDP (£ m),CPI,Bank Rate,Gross Fixed Capital Formation (Investments)
76,2019 Q1,555141,1.8,0.75,102105.0
77,2019 Q2,556932,2.0,0.75,101275.0
78,2019 Q3,560987,1.8,0.75,102100.0
79,2019 Q4,560861,1.4,0.75,100943.0
80,2020 Q1,545597,1.7,0.175,97191.0
81,2020 Q2,434718,0.8,0.1,78563.0
82,2020 Q3,507643,0.8,0.1,90756.0
83,2020 Q4,514531,0.8,0.1,96147.0
84,2021 Q1,509261,0.9,0.1,93777.0
85,2021 Q2,546579,2.1,0.1,98872.0


In [None]:
df.isnull().sum()

Date                                           0
GDP (£ m)                                      0
CPI                                            0
Bank Rate                                      0
Gross Fixed Capital Formation (Investments)    0
dtype: int64

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/Code-Institute-Solutions/CSV-datasets/refs/heads/main/UKMacroData.csv").iloc[76:88]
df

Unnamed: 0,Date,GDP (£ m),CPI,Bank Rate,Gross Fixed Capital Formation (Investments)
76,2019 Q1,555141,1.8,0.75,102105.0
77,2019 Q2,556932,2.0,0.75,101275.0
78,2019 Q3,560987,1.8,0.75,102100.0
79,2019 Q4,560861,1.4,0.75,100943.0
80,2020 Q1,545597,1.7,0.175,97191.0
81,2020 Q2,434718,0.8,0.1,78563.0
82,2020 Q3,507643,0.8,0.1,90756.0
83,2020 Q4,514531,0.8,0.1,96147.0
84,2021 Q1,509261,0.9,0.1,93777.0
85,2021 Q2,546579,2.1,0.1,98872.0


In [None]:
df.isnull().sum()

Date                                           0
GDP (£ m)                                      0
CPI                                            0
Bank Rate                                      0
Gross Fixed Capital Formation (Investments)    0
dtype: int64