In [1]:
# Estimate missing values in a column from the rows that are most similar to each row with a missing value.
# Parameters of the function: the dataset "dataset", the column to fill "target_col", the columns used to
# determine similarity "sim_cols", and the number of similar rows to use "nn_count".
# Step 1: Identify the rows with missing values in the target column.
# Step 2: Build the reference set by removing rows with missing values in the target column or in the similarity columns.
# Step 3: Calculate the similarity of all reference rows to a row with a missing target value.
# Step 4: Select the most similar observations and use the mean of their target values as the estimate.
# Step 5: Repeat Step 3 and Step 4 for every row with a missing value in the target column.
# Step 6: Write the estimates into the original dataset and return the updated dataset.

In [24]:
def near_neighbor(dataset, target_col, sim_cols, nn_count):
    """Fill missing values of ``target_col`` with the mean of the nearest rows.

    For every row whose ``target_col`` is missing (and whose ``sim_cols`` are
    all present), the ``nn_count`` most similar reference rows are looked up
    with scikit-learn's ``NearestNeighbors`` (default metric) on ``sim_cols``,
    and the mean of their ``target_col`` values is written into the row.

    Reference rows are the rows of ``dataset`` that have neither a missing
    ``target_col`` nor a missing value in any of ``sim_cols``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to impute. It is modified in place and also returned.
    target_col : str
        Name of the column whose missing values are estimated.
    sim_cols : list of str
        Names of the numeric columns used to measure similarity between rows.
    nn_count : int
        Number of neighbours to average. It is capped at the number of
        available reference rows.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object with the estimates filled in. Rows that
        miss a value in any of ``sim_cols`` cannot be compared and keep their
        missing target value. If there is nothing to impute, or there are no
        reference rows, ``dataset`` is returned unchanged.
    """
    from sklearn.neighbors import NearestNeighbors as NN

    sim_cols = list(sim_cols)

    # Step 1: locate the rows to impute -- target missing, similarity columns complete.
    target_missing = dataset[target_col].isna()
    sim_complete = dataset[sim_cols].notna().all(axis=1)
    impute_mask = target_missing & sim_complete

    # Step 2: reference rows have a known target and complete similarity columns.
    df_sim = dataset.loc[~target_missing & sim_complete]

    if not impute_mask.any() or df_sim.empty:
        return dataset

    # Plain arrays keep reference features and reference targets aligned by
    # position, which is what kneighbors() reports back.
    ref_features = df_sim[sim_cols].to_numpy(dtype=float)
    ref_targets = df_sim[target_col].to_numpy(dtype=float)
    query_features = dataset.loc[impute_mask, sim_cols].to_numpy(dtype=float)

    # Step 3: fit once (the reference set does not change between queries) and
    # never ask for more neighbours than there are reference rows.
    n_neighbors = min(nn_count, len(df_sim))
    nbrs = NN(n_neighbors=n_neighbors, algorithm='auto').fit(ref_features)
    _, indices = nbrs.kneighbors(query_features)

    # Step 4: `indices` holds positions within the reference set, not labels of
    # `dataset`, so the neighbour targets must be read from `ref_targets`.
    nn_means = ref_targets[indices].mean(axis=1)

    # Steps 5-6: write all estimates back; the boolean mask keeps the order of
    # the queried rows and works for any index labels.
    dataset.loc[impute_mask, target_col] = nn_means

    return dataset
         
    

In [25]:
import pandas as pd
import numpy as np

# Load the example data and choose the column to impute plus the columns used
# to measure similarity between rows.
df = pd.read_csv('MV2_example.csv')
target_col = "Amount"
sim_cols = ["Mortgage", "Car Loan", "Short Term", "Age", "Male"]
# Number of neighbours handed to the imputation function.
nn_count = 3

# NOTE(review): this calls near_neighbor2, while the definition visible in this
# notebook is named near_neighbor -- confirm near_neighbor2 is defined in
# another cell, otherwise this raises NameError on a fresh kernel.
new_dataset = near_neighbor2(df, target_col, sim_cols, nn_count)

In [26]:
# Re-read the CSV and show its first rows for comparison with new_dataset.
# NOTE(review): presumably reloaded because the imputation call writes into the
# frame it is given (near_neighbor does) -- confirm for near_neighbor2.
df = pd.read_csv('MV2_example.csv')
df.head(14)

Unnamed: 0,Amount,Mortgage,Car Loan,Short Term,Loan type,Age,Male
0,50000.0,1.0,0.0,0.0,Mortgage,19,0.0
1,1000.0,0.0,1.0,0.0,Car loan,23,1.0
2,27000.0,0.0,1.0,0.0,Car loan,44,1.0
3,655555.0,1.0,0.0,0.0,Mortgage,45,0.0
4,187666.0,1.0,0.0,0.0,Mortgage,65,0.0
5,165777.0,1.0,0.0,0.0,Mortgage,39,
6,,0.0,1.0,0.0,Mortgage,23,1.0
7,145000.0,,,,,27,0.0
8,156899.0,1.0,0.0,0.0,Mortgage,48,0.0
9,15000.0,0.0,0.0,1.0,Short-term credit,55,1.0


In [27]:
# Display the frame returned by the imputation call.
new_dataset

Unnamed: 0,Amount,Mortgage,Car Loan,Short Term,Loan type,Age,Male
0,50000.0,1.0,0.0,0.0,Mortgage,19,0.0
1,1000.0,0.0,1.0,0.0,Car loan,23,1.0
2,27000.0,0.0,1.0,0.0,Car loan,44,1.0
3,655555.0,1.0,0.0,0.0,Mortgage,45,0.0
4,187666.0,1.0,0.0,0.0,Mortgage,65,0.0
5,165777.0,1.0,0.0,0.0,Mortgage,39,
6,65333.333333,0.0,1.0,0.0,Mortgage,23,1.0
7,145000.0,,,,,27,0.0
8,156899.0,1.0,0.0,0.0,Mortgage,48,0.0
9,15000.0,0.0,0.0,1.0,Short-term credit,55,1.0
