# impute_missing

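> Impute missing values in a proteomics intensity table, one experimental condition at a time: rows that are entirely missing within a condition are filled with random values drawn around that condition's estimated detection limit, and rows that are only partially missing are filled with scikit-learn's `IterativeImputer`.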

In [None]:
#| default_exp impute_missing

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| hide
#| export
from nbdev.showdoc import *
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy.stats import truncnorm

In [None]:
#| export
def _estimate_detection_limit(condition_df):
    """Estimate a detection limit for one condition's block of columns.

    Up to ten of the smallest strictly positive values are taken from every
    column, pooled, and their median is returned. NaN is returned when the
    block contains no positive value at all.
    """
    smallest_values = []
    for _, column_data in condition_df.items():
        positive_values = column_data[column_data > 0]
        smallest_values.extend(positive_values.nsmallest(10).tolist())

    # np.median([]) would emit a RuntimeWarning and return NaN; be explicit.
    detection_limit = float(np.median(smallest_values)) if smallest_values else float('nan')
    print('fill na with detection_limit:', detection_limit, np.log10(detection_limit), condition_df.columns)
    return detection_limit


def _draw_near_detection_limit(shape, detection_limit, rng):
    """Draw an array of positive values from a normal centred on the limit.

    The distribution has mean ``detection_limit``, standard deviation
    ``0.2 * detection_limit`` and is truncated to ``(0, inf)``.
    """
    std_dev = detection_limit * 0.2
    # truncnorm expects its bounds in standard-deviation units around ``loc``.
    lower = (0 - detection_limit) / std_dev
    return truncnorm.rvs(a=lower, b=np.inf, loc=detection_limit, scale=std_dev,
                         size=shape, random_state=rng)


def impute_proteomics_data(df, conditions, random_state=None):
    """Impute missing values separately within each experimental condition.

    For every distinct label in ``conditions`` the matching columns of ``df``
    are processed as one block:

    * rows in which *every* value of the block is missing are filled with
      random positive values drawn around the block's detection limit;
    * rows in which only *some* values are missing are filled with
      ``IterativeImputer``; the imputed cells are raised to at least the
      detection limit, while measured cells are returned untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table; NaN marks a missing value.
    conditions : sequence
        One hashable label per column of ``df``, in column order.
    random_state : None, int, numpy.random.RandomState or numpy.random.Generator
        Seed or generator for the detection-limit draws. ``None`` (default)
        uses NumPy's global random state, as before.

    Returns
    -------
    imputed_df : pandas.DataFrame
        Float table with the same columns as ``df``, rows sorted by index.
    detection_limit_imputed_indices : list
        Row labels filled from the detection limit (one entry per condition
        in which the row was entirely missing).
    iterative_imputer_imputed_indices : list
        Row labels that had at least one value filled by ``IterativeImputer``
        (one entry per condition).

    Raises
    ------
    ValueError
        If ``conditions`` does not have one label per column, if a block
        needs imputation but holds no positive value to estimate a detection
        limit from, or if ``IterativeImputer`` returns a differently shaped
        array.
    """
    conditions = list(conditions)
    if len(conditions) != df.shape[1]:
        raise ValueError(
            f"conditions has {len(conditions)} labels but df has {df.shape[1]} columns"
        )
    if not conditions:
        # Nothing to impute; pd.concat([]) would raise on an empty list.
        return df.copy(), [], []

    # Use one shared generator so that an integer seed does not restart the
    # random stream (and repeat the same draws) for every condition.
    if random_state is None or isinstance(
        random_state, (np.random.RandomState, np.random.Generator)
    ):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    imputed_dfs = []
    detection_limit_imputed_indices = []
    iterative_imputer_imputed_indices = []

    # dict.fromkeys keeps first-appearance order; a set would make the print
    # order and the order of the returned index lists vary between runs.
    for condition in dict.fromkeys(conditions):
        condition_indices = [i for i, c in enumerate(conditions) if c == condition]
        condition_df = df.iloc[:, condition_indices]

        missing_mask = condition_df.isnull()
        all_values_missing = missing_mask.all(axis=1)
        missing_rows = condition_df[all_values_missing]
        non_missing_rows = condition_df[~all_values_missing]

        detection_limit = _estimate_detection_limit(condition_df)
        if missing_mask.to_numpy().any() and not detection_limit > 0:
            raise ValueError(
                f"condition {condition!r} has missing values but no positive "
                "values to estimate a detection limit from"
            )

        pieces = []

        if not missing_rows.empty:
            draws = _draw_near_detection_limit(missing_rows.shape, detection_limit, rng)
            pieces.append(pd.DataFrame(draws,
                                       index=missing_rows.index,
                                       columns=missing_rows.columns))
            detection_limit_imputed_indices.extend(missing_rows.index.tolist())

        if not non_missing_rows.empty:
            was_missing = non_missing_rows.isnull().to_numpy()
            if was_missing.any():
                imp_mean = IterativeImputer(random_state=0, imputation_order='roman')
                imputed_data = np.asarray(imp_mean.fit_transform(non_missing_rows), dtype=float)
                if imputed_data.shape != was_missing.shape:
                    # NOTE(review): this appears to happen when a column has no
                    # observed value among these rows and the imputer drops it;
                    # confirm against the installed scikit-learn version.
                    raise ValueError(
                        f"IterativeImputer returned shape {imputed_data.shape} for "
                        f"condition {condition!r}, expected {was_missing.shape}"
                    )
                # Apply the detection-limit floor to imputed cells only, so
                # measured values below the limit are not overwritten.
                imputed_data = np.where(was_missing,
                                        np.maximum(imputed_data, detection_limit),
                                        imputed_data)
            else:
                # Fully observed rows: no need to fit an imputer.
                imputed_data = non_missing_rows.to_numpy(dtype=float)

            pieces.append(pd.DataFrame(imputed_data,
                                       index=non_missing_rows.index,
                                       columns=non_missing_rows.columns))

            # Rows that had at least one value filled by IterativeImputer.
            imputed_rows_mask = was_missing.any(axis=1)
            iterative_imputer_imputed_indices.extend(
                non_missing_rows.index[imputed_rows_mask].tolist()
            )

        if pieces:
            combined_df = pd.concat(pieces).sort_index()
        else:
            # Block without any rows.
            combined_df = condition_df.astype(float)
        imputed_dfs.append(combined_df)

    imputed_df = pd.concat(imputed_dfs, axis=1).reindex(columns=df.columns)
    return imputed_df, detection_limit_imputed_indices, iterative_imputer_imputed_indices

In [None]:
# Example input: three columns for condition A (A1-A3) and three for
# condition B (B1-B3); np.nan marks a missing measurement.
data = dict(
    A1=[1, 2,      3, np.nan, 5, 4,      4],
    A2=[2, 3,      4, np.nan, 6, 4,      np.nan],
    A3=[1, np.nan, 3, np.nan, 5, np.nan, np.nan],
    B1=[10, 11, 1,      13, np.nan, 20,     40],
    B2=[11, 12, np.nan, 14, np.nan, 25,     np.nan],
    B3=[10, 11, 1,      13, np.nan, np.nan, np.nan],
)
df = pd.DataFrame(data)
df

Unnamed: 0,A1,A2,A3,B1,B2,B3
0,1.0,2.0,1.0,10.0,11.0,10.0
1,2.0,3.0,,11.0,12.0,11.0
2,3.0,4.0,3.0,1.0,,1.0
3,,,,13.0,14.0,13.0
4,5.0,6.0,5.0,,,
5,4.0,4.0,,20.0,25.0,
6,4.0,,,40.0,,


In [None]:
conditions = ["A", "A", "A", "B", "B", "B"]
# impute_proteomics_data returns three values: the imputed table, the row
# labels filled from the detection limit, and the row labels filled by
# IterativeImputer. Unpack them so `imputed_df` is the table, not the tuple.
imputed_df, detection_limit_rows, iterative_imputer_rows = impute_proteomics_data(df, conditions)
imputed_df

detection_limit 11.5
detection_limit 3.5


Unnamed: 0,A1,A2,A3,B1,B2,B3
0,3.5,3.5,3.5,11.5,11.5,11.5
1,3.5,3.5,3.5,11.5,12.0,11.5
2,3.5,4.0,3.5,11.5,11.5,11.5
3,2.94932,4.251355,2.497197,13.0,14.0,13.0
4,5.0,6.0,5.0,11.356103,12.932057,11.023701
5,4.0,4.0,3.5,20.0,25.0,21.999991
6,4.0,4.297517,3.648759,40.0,52.111114,45.555528


In [None]:
#| hide
# Export the cells marked `#| export` from this notebook.
import nbdev

nbdev.nbdev_export()