In [2]:
# Estimate missing values in a column based on similarities of row's NA with other rows that are similar
# Parameters to be provided to the function "Dataset", "Target_Column" and columns to determine similarity "sim_cols"
# Step 1 is to identify the locations of missing values in Target Column
# Step 2 is removing rows with missing values in "Target_Columns"
# Step 3 Calculate similarities of all rows to missing value in target column 
# Step 4 Select similar observations and calculate medium value for missing value in target column
# Step 5 Repeat Step 3 and Step 4 for all missing values for rows in target column
# Step 6 Include updates in the original dataset and return updated dataset

In [325]:
def near_neighbor(dataset, target_col, sim_cols, nn_count):
    
    from sklearn.neighbors import NearestNeighbors as NN
    
    # Dataframe tar_rows is a dataframe including the rows that are missing numbers in target_column
    tar_rows = pd.DataFrame()
    tar_rows = df.loc[pd.isna(df[target_col]), :]
    
    # Only include columns that are used to calculate similarity for tar_col
    tar_rows = tar_rows[sim_cols]
    
    # Step 2: Remove rows that include NaN in columns used to determine similarities

    df_sim = df.copy()

    for column in sim_cols:
        df_sim = df_sim[pd.notnull(df_sim[column])]
        df_sim = df_sim[pd.notnull(df_sim[target_col])]
        tar_rows = tar_rows[pd.notnull(tar_rows[column])]
        
        # Create a dataframe "df_nn" that only includes the columns for similarity following the removal of rows that include NaNs
    # in columns to be used for similarity "df_sim"

    df_nn = pd.DataFrame()
    df_nn[sim_cols] = df_sim[sim_cols]

    # Create an instance of nearest neighbor inclusive of the similarity columns dataframe


    for index, row in tar_rows.iterrows():
        tot_tar = 0.0
        counter = 0
        nn_mean = 0
        
        
        nbrs = NN(n_neighbors= nn_count, algorithm='auto').fit(df_nn)
        
        distances, indices = nbrs.kneighbors(np.reshape(tar_rows.loc[index].values,(1,-1)))
        
        for row_num in indices:
            
            tot_tar = tot_tar + df_index.loc[row_num][target_col]
            counter += 1

        nn_mean = tot_tar.sum() / tot_tar.count()
        df.at[index, target_col] = nn_mean
        
    return df
         
    
    

In [326]:
import pandas as pd
import numpy as np

df = pd.read_csv('MV2_example.csv')
target_col = "Amount"
sim_cols = ["Mortgage", "Car Loan", "Short Term", "Age", "Male"]
nn_count = 3

new_dataset = near_neighbor(df, target_col, sim_cols, nn_count)

Row of missing amount 6
Distances of closest neighbors [[0.         3.         4.35889894]]
Index locations of closest neighbors [[1 7 0]]
1     1000.0
7     7000.0
0    50000.0
Name: Amount, dtype: float64
Row of missing amount 11
Distances of closest neighbors [[ 6.244998  7.       10.      ]]
Index locations of closest neighbors [[2 3 5]]
2     27000.0
3    655555.0
5    156899.0
Name: Amount, dtype: float64


In [327]:
new_dataset

Unnamed: 0,Amount,Mortgage,Car Loan,Short Term,Loan type,Age,Male
0,50000.0,1.0,0.0,0.0,Mortgage,19,0.0
1,1000.0,0.0,1.0,0.0,Car loan,23,1.0
2,27000.0,0.0,1.0,0.0,Car loan,44,1.0
3,655555.0,1.0,0.0,0.0,Mortgage,45,0.0
4,187666.0,1.0,0.0,0.0,Mortgage,65,0.0
5,165777.0,1.0,0.0,0.0,Mortgage,39,
6,19333.333333,0.0,1.0,0.0,Mortgage,23,1.0
7,145000.0,,,,,27,0.0
8,156899.0,1.0,0.0,0.0,Mortgage,48,0.0
9,15000.0,0.0,0.0,1.0,Short-term credit,55,1.0


In [314]:
# Step 1: Capture the location of the rows with NA for target column
target_col = "Amount"
sim_cols = ["Mortgage", "Car Loan", "Short Term", "Age", "Male"]
nn_count = 3

# Dataframe tar_rows is a dataframe including the rows that are missing numbers in target_column
tar_rows = pd.DataFrame()
tar_rows = df.loc[pd.isna(df[target_col]), :]

# Only include columns that are used to calculate similarity for tar_col
tar_rows = tar_rows[sim_cols]
print(tar_rows)

    Mortgage  Car Loan  Short Term  Age  Male
6        0.0       1.0         0.0   23   1.0
11       1.0       0.0         0.0   38   0.0


In [315]:
# Step 2: Remove rows that include NaN in columns used to determine similarities

df_sim = df.copy()

for column in sim_cols:
    df_sim = df_sim[pd.notnull(df_sim[column])]
    df_sim = df_sim[pd.notnull(df_sim[target_col])]
    tar_rows = tar_rows[pd.notnull(tar_rows[column])]
    


In [316]:
df_sim

Unnamed: 0,Amount,Mortgage,Car Loan,Short Term,Loan type,Age,Male
0,50000.0,1.0,0.0,0.0,Mortgage,19,0.0
1,1000.0,0.0,1.0,0.0,Car loan,23,1.0
2,27000.0,0.0,1.0,0.0,Car loan,44,1.0
3,655555.0,1.0,0.0,0.0,Mortgage,45,0.0
4,187666.0,1.0,0.0,0.0,Mortgage,65,0.0
8,156899.0,1.0,0.0,0.0,Mortgage,48,0.0
9,15000.0,0.0,0.0,1.0,Short-term credit,55,1.0
10,7000.0,0.0,1.0,0.0,Car,26,1.0


In [317]:
df_nn

Unnamed: 0,Mortgage,Car Loan,Short Term,Age,Male
0,1.0,0.0,0.0,19,0.0
1,0.0,1.0,0.0,23,1.0
2,0.0,1.0,0.0,44,1.0
3,1.0,0.0,0.0,45,0.0
4,1.0,0.0,0.0,65,0.0
8,1.0,0.0,0.0,48,0.0
9,0.0,0.0,1.0,55,1.0
10,0.0,1.0,0.0,26,1.0


In [247]:
# Create a dataframe "df_nn" that only includes the columns for similarity following the removal of rows that include NaNs
# in columns to be used for similarity "df_sim"

# df_nn = pd.DataFrame()
# df_nn[sim_cols] = df_sim[sim_cols]

# Create an instance of nearest neighbor inclusive of the similarity columns dataframe



#nbrs = NN(n_neighbors= nn_count, algorithm='auto').fit(df_nn)
# distances, indices = nbrs.kneighbors(np.reshape(x_6.values,(1,-1)))
# distances, indices = nbrs.kneighbors(np.reshape(X2_sim.loc[6].values,(1,-1)))
#distances, indices = nbrs.kneighbors(np.reshape(tar_rows.loc[6].values,(1,-1)))
#nn_mean = df_sim.loc[indices[0][1:5], target_col].values.mean()
#df.at[index, target_col] = nn_mean



In [323]:
# Create a dataframe "df_nn" that only includes the columns for similarity following the removal of rows that include NaNs
# in columns to be used for similarity "df_sim"

df_nn = pd.DataFrame()
df_nn[sim_cols] = df_sim[sim_cols]

# Create an instance of nearest neighbor inclusive of the similarity columns dataframe


for index, row in tar_rows.iterrows():
    tot_tar = 0.0
    counter = 0
    nn_mean = 0
    nbrs = NN(n_neighbors= nn_count, algorithm='auto').fit(df_nn)
    print(f'Row of missing amount {index}')
    distances, indices = nbrs.kneighbors(np.reshape(tar_rows.loc[index].values,(1,-1)))
    # indices = np.delete(indices, 0)
    print(f'Distances of closest neighbors {distances}')
    print(f'Index locations of closest neighbors {indices}')
    for row_num in indices:
        
        # print(df_index.loc[row_num][target_col])
        tot_tar = tot_tar + df_index.loc[row_num][target_col]
        print(tot_tar)
        counter += 1
        
    nn_mean = tot_tar.sum() / tot_tar.count()
    df.at[index, target_col] = nn_mean
    


Row of missing amount 6
Distances of closest neighbors [[0.         3.         4.35889894]]
Index locations of closest neighbors [[1 7 0]]
1     1000.0
7     7000.0
0    50000.0
Name: Amount, dtype: float64
Row of missing amount 11
Distances of closest neighbors [[ 6.244998  7.       10.      ]]
Index locations of closest neighbors [[2 3 5]]
2     27000.0
3    655555.0
5    156899.0
Name: Amount, dtype: float64


In [298]:
distances, indices = nbrs.kneighbors(np.reshape(tar_rows.loc[11].values,(1,-1)))

In [324]:
df


Unnamed: 0,Amount,Mortgage,Car Loan,Short Term,Loan type,Age,Male
0,50000.0,1.0,0.0,0.0,Mortgage,19,0.0
1,1000.0,0.0,1.0,0.0,Car loan,23,1.0
2,27000.0,0.0,1.0,0.0,Car loan,44,1.0
3,655555.0,1.0,0.0,0.0,Mortgage,45,0.0
4,187666.0,1.0,0.0,0.0,Mortgage,65,0.0
5,165777.0,1.0,0.0,0.0,Mortgage,39,
6,19333.333333,0.0,1.0,0.0,Mortgage,23,1.0
7,145000.0,,,,,27,0.0
8,156899.0,1.0,0.0,0.0,Mortgage,48,0.0
9,15000.0,0.0,0.0,1.0,Short-term credit,55,1.0


In [272]:
df_index = df_sim.copy()
df_index = df_sim.reset_index()
df_index


Unnamed: 0,index,Amount,Mortgage,Car Loan,Short Term,Loan type,Age,Male
0,0,50000.0,1.0,0.0,0.0,Mortgage,19,0.0
1,1,1000.0,0.0,1.0,0.0,Car loan,23,1.0
2,2,27000.0,0.0,1.0,0.0,Car loan,44,1.0
3,3,655555.0,1.0,0.0,0.0,Mortgage,45,0.0
4,4,187666.0,1.0,0.0,0.0,Mortgage,65,0.0
5,8,156899.0,1.0,0.0,0.0,Mortgage,48,0.0
6,9,15000.0,0.0,0.0,1.0,Short-term credit,55,1.0
7,10,7000.0,0.0,1.0,0.0,Car,26,1.0


In [176]:
tot_tar = 0
for row_num in indices:
    print(f'Row number {row_num}')
    print(df_sim.loc[row_num][target_col])
    print (tot_tar)
    tot_tar = tot_tar + df_sim.loc[row_num][target_col]
   
    

Row number 1
1000.0
0
Row number 8
156899.0
1000.0


In [83]:
indices[0]

array([5, 9, 1, 8], dtype=int64)

In [84]:
distances

array([[0.        , 1.        , 1.73205081, 3.46410162]])

In [106]:
# np.mean(df.loc[[indices[0]],'Amount'].values)
nn_values = df_sim.loc[[indices[0]].values.mean

SyntaxError: unexpected EOF while parsing (<ipython-input-106-f8589d64df1a>, line 2)

In [112]:
df_sim.loc[indices[0][1:5], 'Amount'].values.mean()

57633.0

In [48]:
def cat_cols (dataset, sel_cols, remove_one = "False"):
    
    df1 = dataset.copy()
    df1.head()
    v = sel_cols
    d_first = remove_one

    df_cat = df1[v]
    df1.drop(v, axis=1, inplace=True)
    
    if d_first == "True":
        
        cat_df = pd.get_dummies(df_cat[v],  drop_first=True)
        
    else:
        
        cat_df = pd.get_dummies(df_cat[v],  drop_first=False)
    
    dataset = pd.concat([df1, cat_df], axis=1, sort=False)
    
    
    return dataset

In [91]:
df

Unnamed: 0,Amount,Mortgage,Car Loan,Short Term,Loan type,Age,Male
0,50000.0,1.0,0.0,0.0,Mortgage,19,0.0
1,1000.0,0.0,1.0,0.0,Car loan,23,1.0
2,27000.0,0.0,1.0,0.0,Car loan,44,1.0
3,655555.0,1.0,0.0,0.0,Mortgage,45,0.0
4,187666.0,1.0,0.0,0.0,Mortgage,65,0.0
5,165777.0,1.0,0.0,0.0,Mortgage,39,
6,,1.0,0.0,0.0,Mortgage,23,0.0
7,145000.0,,,,,27,0.0
8,156899.0,1.0,0.0,0.0,Mortgage,48,0.0
9,15000.0,0.0,0.0,1.0,Short-term credit,55,1.0
