# 1) Pandas handling missing values

- chybajuce udaje su reprezentovane **NaN (not a number)**


## 1.1) Remove rows containing missing values

- **dropna()**


In [1]:
import pandas as pd
import numpy as np

# create a dataframe with missing values
data = {
    "A": [1, 2, np.nan, 4, 5],
    "B": [np.nan, 2, 3, 4, 5],
    "C": [1, 2, 3, np.nan, 5],
    "D": [1, 2, 3, 4, 5],
}
df = pd.DataFrame(data)

# remove rows with missing values
df.dropna(inplace=True)

print(df)

     A    B    C  D
1  2.0  2.0  2.0  2
4  5.0  5.0  5.0  5


## 1.2) Replace missing values

- **fillna()**


In [2]:
import pandas as pd
import numpy as np

# create a dataframe with missing values
data = {
    "A": [1, 2, np.nan, 4, 5],
    "B": [np.nan, 2, 3, 4, 5],
    "C": [1, 2, 3, np.nan, 5],
    "D": [1, 2, 3, 4, 5],
}
df = pd.DataFrame(data)

# replace missing values with 0
df.fillna(value=0, inplace=True)

print(df)

     A    B    C  D
0  1.0  0.0  1.0  1
1  2.0  2.0  2.0  2
2  0.0  3.0  3.0  3
3  4.0  4.0  0.0  4
4  5.0  5.0  5.0  5


## 1.3) Replace missing values with aggregation functions (mean, median, mode)


In [3]:
import pandas as pd
import numpy as np

# create a dataframe with missing values
data = {
    "A": [1, 2, np.nan, 4, 5],
    "B": [np.nan, 2, 3, 4, 5],
    "C": [1, 2, 3, np.nan, 5],
    "D": [1, 2, 3, 4, 5],
}
df = pd.DataFrame(data)

# replace missing values with mean
df["A"].fillna(value=df["A"].mean(), inplace=True)

# replace missing values with median
df["B"].fillna(value=df["B"].median(), inplace=True)

# replace missing values with mode
df["C"].fillna(value=df["C"].mode()[0], inplace=True)

print(df)

     A    B    C  D
0  1.0  3.5  1.0  1
1  2.0  2.0  2.0  2
2  3.0  3.0  3.0  3
3  4.0  4.0  1.0  4
4  5.0  5.0  5.0  5


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["A"].fillna(value=df["A"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["B"].fillna(value=df["B"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

## 1.4) Replace values using another dataframe

- **fillna()**


In [4]:
import pandas as pd
import numpy as np

# create a dataframe with missing values
data1 = {
    "A": [1, 2, np.nan, 4, 5],
    "B": [np.nan, 2, 3, 4, 5],
    "C": [1, 2, 3, np.nan, 5],
    "D": [1, 2, 3, 4, 5],
}
df1 = pd.DataFrame(data1)

# create datframe to fill the missing values with
data2 = {
    "A": [10, 20, 30, 40, 50],
    "B": [10, 20, 30, 40, 50],
    "C": [10, 20, 30, 40, 50],
    "D": [10, 20, 30, 40, 50],
}
df2 = pd.DataFrame(data2)

# replace missing values
df1.fillna(df2, inplace=True)

print(df1)

      A     B     C  D
0   1.0  10.0   1.0  1
1   2.0   2.0   2.0  2
2  30.0   3.0   3.0  3
3   4.0   4.0  40.0  4
4   5.0   5.0   5.0  5


## 1.5) Remove columns containig only NaN values

- **isnull()**
- **all()**


In [5]:
import pandas as pd
import numpy as np

# create a DataFrame
data = {
    "A": [1, 2, 3, 4],
    "B": [5, 6, np.nan, np.nan],
    "C": [np.nan, np.nan, np.nan, np.nan],
    "D": [9, 10, 11, 12],
}
df = pd.DataFrame(data)

# check which columns contain only NaN values
columns_with_nan = df.columns[df.isnull().all()]

# drop the columns containing only NaN values
df = df.drop(columns=columns_with_nan)

print(df)

   A    B   D
0  1  5.0   9
1  2  6.0  10
2  3  NaN  11
3  4  NaN  12


## 1.6) Remove columns containing NaN values that exceed a certain number


In [6]:
import pandas as pd
import numpy as np

# create a DataFrame
data = {
    "A": [1, 2, 3, np.nan],
    "B": [5, 6, np.nan, np.nan],
    "C": [np.nan, np.nan, np.nan, 7],
    "D": [9, 10, 11, 12],
}
df = pd.DataFrame(data)

# set the threshold for the maximum number of NaN values allowed
threshold = 2

# calculate the number of NaN values in each column
nan_counts = df.isnull().sum()

# remove columns that have more NaN values than the threshold
columns_to_drop = nan_counts[nan_counts > threshold].index
df = df.drop(columns=columns_to_drop)

print(df)

     A    B   D
0  1.0  5.0   9
1  2.0  6.0  10
2  3.0  NaN  11
3  NaN  NaN  12
