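# impute_missing

> Condition-wise imputation of missing values in proteomics intensity tables: rows with no measurement in a condition are filled from an estimated detection limit, and partially observed rows are completed with scikit-learn's `IterativeImputer`.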

In [None]:
#| default_exp impute_missing

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edited modules are
# re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
#| export
def impute_proteomics_data(df, conditions):
    """Impute missing values separately for every experimental condition.

    Columns of ``df`` are grouped by the label found at the same position in
    ``conditions``. Each group of columns is then imputed on its own:

    * Rows without a single observed value in the group are filled with
      random draws from a normal distribution centred on the group's
      detection limit, with a standard deviation of 10 % of that limit.
      The detection limit is the median of the pooled ten smallest positive
      values of every column in the group.
    * Rows with at least one observed value are completed with
      ``IterativeImputer(random_state=0, imputation_order='roman')``. The
      imputer is only fitted when such rows actually contain gaps.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table; missing measurements are NaN.
    conditions : sequence of hashable
        One condition label per column of ``df``, in column order.

    Returns
    -------
    pandas.DataFrame
        A new float frame with the same index and columns (and the same row
        and column order) as ``df``. ``df`` itself is not modified.

    Raises
    ------
    ValueError
        If ``conditions`` does not contain exactly one label per column.

    Notes
    -----
    The detection-limit draws use NumPy's global random state, so call
    ``np.random.seed`` beforehand for reproducible output. Conditions are
    processed in order of first appearance, which keeps the sequence of
    draws stable between runs.

    If a condition holds no positive value at all, no detection limit can be
    estimated and its fully missing rows stay NaN.

    NOTE(review): a column that is NaN in every partially observed row is
    presumably dropped by ``IterativeImputer`` with its default settings,
    which would surface here as a shape error -- confirm against the
    installed scikit-learn version.
    """
    conditions = list(conditions)
    n_columns = df.shape[1]
    if len(conditions) != n_columns:
        raise ValueError(
            f"conditions has {len(conditions)} entries but df has "
            f"{n_columns} columns; exactly one label per column is required"
        )

    def _detection_limit(block):
        # Pool the ten smallest positive values of every column and take the
        # median of the pool.
        smallest_values = []
        for column in block.T:
            observed = column[~np.isnan(column)]
            positive = observed[observed > 0]
            smallest_values.extend(np.sort(positive)[:10])
        if not smallest_values:
            # np.median([]) would return NaN with a RuntimeWarning.
            return np.nan
        return float(np.median(smallest_values))

    # Work on a private float copy, addressed by position, so duplicate or
    # unsorted index/column labels cannot disturb the result.
    observed_values = df.astype(float).to_numpy(copy=True)
    result = observed_values.copy()

    # dict.fromkeys de-duplicates while keeping first-appearance order; a set
    # would visit the conditions in hash order and thereby change which
    # condition receives which random draws from run to run.
    for condition in dict.fromkeys(conditions):
        column_idx = [i for i, c in enumerate(conditions) if c == condition]
        block = observed_values[:, column_idx]  # fancy indexing -> copy
        missing = np.isnan(block)
        empty_rows = missing.all(axis=1)
        partial_rows = ~empty_rows

        if empty_rows.any():
            detection_limit = _detection_limit(block)
            if not np.isnan(detection_limit):
                # Generate the draws column by column (hence the transpose).
                draws = np.random.normal(
                    detection_limit,
                    detection_limit * 0.1,
                    size=(len(column_idx), int(empty_rows.sum())),
                ).T
                block[empty_rows] = draws

        # Fit the model-based imputer only when there is something to fill;
        # this also avoids calling it on an empty array when every row of
        # the condition is completely missing.
        if missing[partial_rows].any():
            imputer = IterativeImputer(random_state=0, imputation_order='roman')
            block[partial_rows] = imputer.fit_transform(block[partial_rows])

        result[:, column_idx] = block

    return pd.DataFrame(result, index=df.index, columns=df.columns)

In [None]:
# Example usage: a toy table with three replicate columns for each of two
# conditions (A1-A3 and B1-B3); NaN marks a missing measurement.
data = dict(
    A1=[1, 2, 3, np.nan, 5, 4, 4],
    A2=[2, 3, 4, np.nan, 6, 4, np.nan],
    A3=[1, np.nan, 3, np.nan, 5, np.nan, np.nan],
    B1=[10, 11, 1, 13, np.nan, 20, 40],
    B2=[11, 12, np.nan, 14, np.nan, 25, np.nan],
    B3=[10, 11, 1, 13, np.nan, np.nan, np.nan],
)
df = pd.DataFrame(data)
df

Unnamed: 0,A1,A2,A3,B1,B2,B3
0,1.0,2.0,1.0,10.0,11.0,10.0
1,2.0,3.0,,11.0,12.0,11.0
2,3.0,4.0,3.0,1.0,,1.0
3,,,,13.0,14.0,13.0
4,5.0,6.0,5.0,,,
5,4.0,4.0,,20.0,25.0,
6,4.0,,,40.0,,


In [None]:
# One condition label per column of df, in column order.
conditions = ["A"] * 3 + ["B"] * 3
imputed_df = impute_proteomics_data(df=df, conditions=conditions)
imputed_df

Unnamed: 0,A1,A2,A3,B1,B2,B3
0,1.0,2.0,1.0,10.0,11.0,10.0
1,2.0,3.0,2.0,11.0,12.0,11.0
2,3.0,4.0,3.0,1.0,1.999988,1.0
3,4.033542,3.49644,2.934492,13.0,14.0,13.0
4,5.0,6.0,5.0,11.848671,11.934904,11.099973
5,4.0,4.0,3.5,20.0,25.0,21.999991
6,4.0,4.297517,3.648759,40.0,52.111114,45.555528


In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev; nbdev.nbdev_export()