In [142]:
import pandas as pd
from numpy import nan as NaN
import math
from copy import deepcopy
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
import numpy as np

# Load the training set from disk into a pandas DataFrame.
df_train = pd.read_csv("./data/train.csv") #Reading the dataset in a dataframe using Pandas

# Show how many missing values each column has before any imputation.
print (df_train.isnull().sum())

# Notebook-style inspection of the column dtypes. As a bare expression this
# statement has no effect when the file is run as a plain script.
df_train.dtypes

def get_categorical_cols(dataframe):
    """Return the names of the non-numeric columns of *dataframe* as a set.

    A column counts as numeric when pandas keeps it in the frame returned by
    ``_get_numeric_data()``; every other column name is reported.
    """
    numeric_names = set(dataframe._get_numeric_data().columns)
    return {name for name in dataframe.columns if name not in numeric_names}
        
def labelChanger(df_source, rows_to_predict):
    """Label-encode the categorical columns of both dataframes in place.

    The categorical (non-numeric) columns are taken from *df_source*. For each
    of them one ``LabelEncoder`` is fitted and then used to replace the values
    in *df_source* and in *rows_to_predict* with integer codes, so both frames
    share the same value-to-code mapping.

    Parameters:
        df_source: dataframe whose categorical columns define what is encoded;
            modified in place.
        rows_to_predict: dataframe holding the same categorical columns;
            modified in place.

    Returns:
        None. Both dataframes are mutated.
    """
    # Only the categorical columns need their labels changed.
    for category in get_categorical_cols(df_source):
        source_values = df_source[category].tolist()
        predict_values = rows_to_predict[category].tolist()

        encoder = preprocessing.LabelEncoder()
        # Fit on the values of BOTH frames: an encoder fitted on df_source
        # alone raises ValueError on any label that only occurs in
        # rows_to_predict. When there is no such label the learned classes,
        # and therefore the codes, are the same as when fitting on df_source.
        encoder.fit(source_values + predict_values)

        df_source[category] = encoder.transform(source_values)
        rows_to_predict[category] = encoder.transform(predict_values)
        
def get_indices_of_null_records(dataset, attribute):
    """Return the positions of the rows whose *attribute* value is missing.

    Parameters:
        dataset: dataframe to inspect.
        attribute: name of the column to check for missing values.

    Returns:
        A list of 0-based row positions (not index labels), in row order, for
        which the column holds a missing value (NaN / None).
    """
    # isnull() recognises every missing-value marker for every dtype, whereas
    # an identity test against the numpy NaN object misses None and any NaN
    # that is not that exact object.
    missing_mask = dataset[attribute].isnull()
    return [position for position, is_missing in enumerate(missing_mask) if is_missing]

def get_records_with_missing_values(dataset, attribute):
    """Return a copy of the rows of *dataset* whose *attribute* is missing.

    Parameters:
        dataset: dataframe to read the rows from; it is not modified.
        attribute: name of the column that must be missing in a returned row.

    Returns:
        A new dataframe with the matching rows in their original order and a
        fresh 0..k-1 index. It is empty (same columns) when nothing is
        missing.
    """
    indices = get_indices_of_null_records(dataset, attribute)

    # The indices are row positions, so they must be resolved with iloc; a
    # label lookup is only equivalent when the index is the default 0..n-1.
    # An empty selection yields an empty frame instead of an error, and
    # reset_index returns a new object, so the caller's data is untouched.
    return dataset.iloc[indices].reset_index(drop=True)

def fill_with_mode(dataframe):
    """Fill the missing values of every column, in place, with the column mode.

    When a column has several modes, the first one returned by
    ``Series.mode()`` is used. A column without any non-missing value has no
    mode and is left unchanged.

    Parameters:
        dataframe: dataframe to fill; modified in place.

    Returns:
        None.
    """
    for column in dataframe:
        modes = dataframe[column].mode()
        if modes.empty:
            # Every value is missing, so there is nothing to fill with.
            continue
        # fillna returns a new Series, so the result has to be assigned back.
        # A scalar is passed because a Series argument would be aligned on the
        # index and fill only the rows whose labels match.
        dataframe[column] = dataframe[column].fillna(modes.iloc[0])
        
def fill_na_by_prediction(dataset, attribute):
    """Return a copy of *dataset* with the missing *attribute* values imputed.

    A 3-nearest-neighbours classifier is fitted on the rows that have no
    missing value at all, using every column except ``Loan_ID``,
    ``Loan_Status`` and *attribute* itself as features, and is then used to
    predict *attribute* for the rows where it is missing. When a prediction
    cannot be made for a row, the mode of *attribute* in *dataset* is used
    instead.

    Parameters:
        dataset: dataframe to impute; it is not modified.
        attribute: name of the column whose missing values are filled.
            NOTE(review): a classifier is used, so this presumably expects a
            categorical column -- confirm before using it on continuous data.

    Returns:
        A new dataframe with the same rows and columns as *dataset* in which
        the missing values of *attribute* have been replaced.

    Side effects:
        Prints the running number and the chosen value of every imputed row.
    """
    # The returned frame is an independent copy, so the caller's data is
    # never mutated.
    result = deepcopy(dataset)

    # Feature frame with a clean 0..n-1 index, so that row positions and index
    # labels coincide in the helpers. reset_index returns a new object.
    x_data = dataset.reset_index(drop=True)

    # The ID is irrelevant and Loan_Status does not exist in the test set, so
    # neither is used as a feature; a missing column is silently ignored.
    irrelevant = [col for col in ('Loan_ID', 'Loan_Status') if col != attribute]
    x_data = x_data.drop(irrelevant, axis=1, errors='ignore')

    # Positions of the records whose attribute value is missing.
    indices = get_indices_of_null_records(x_data, attribute)
    if not indices:
        return result

    # Rows to predict, without the (empty) target column. Missing values in
    # their other columns are filled with the mode.
    rows_with_missing_values = get_records_with_missing_values(x_data, attribute)
    rows_with_missing_values = rows_with_missing_values.drop(attribute, axis=1)
    fill_with_mode(rows_with_missing_values)

    # The model can only be fitted on complete records.
    x_data = x_data.dropna(how='any')
    y_target = x_data[attribute]
    x_data = x_data.drop(attribute, axis=1)

    labelChanger(x_data, rows_with_missing_values)

    # Guarantee that the rows to predict use the training column order.
    rows_with_missing_values = rows_with_missing_values[x_data.columns]

    # Value used whenever the model cannot produce a prediction.
    modes = result[attribute].mode()
    fallback = modes.iloc[0] if not modes.empty else None

    knn_clf = None
    if len(x_data) > 0:
        knn_clf = KNeighborsClassifier(n_neighbors=3)
        knn_clf.fit(x_data, y_target)

    # Predict the missing values and place them in the copy of the dataset.
    attribute_position = result.columns.get_loc(attribute)
    for index, target_position in enumerate(indices):
        print(index)
        prediction = fallback
        if knn_clf is not None:
            try:
                prediction = knn_clf.predict(rows_with_missing_values.iloc[[index]])[0]
            except (ValueError, TypeError):
                # Best effort: the row could not be scored (e.g. it still
                # holds a missing value), so the mode is used instead.
                prediction = fallback
        print(prediction)

        if prediction is None:
            # Neither a prediction nor a mode exists; leave the value missing.
            continue
        # Positional write: the indices are row positions, not index labels.
        result.iloc[target_position, attribute_position] = prediction

    return result

# Impute the missing 'Self_Employed' values and keep the returned dataframe.
df_train = fill_na_by_prediction(df_train, 'Self_Employed')

# Re-check the per-column missing-value counts after the imputation.
print (df_train.isnull().sum())

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
0
No
1
No
2
No
3
No
4
No
5
No
6
Yes
7
No
8
No
9
No
10
No
11
No
12
No
13
No
14
No
15
No
16
No
17
No
18
No
19
No
20
No
21
No
22
No
23
No
24
No
25
No
26
No
27
No
28
No
29
No
30
No
31
No
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
