In [11]:
#uploading the data file
import pandas as pd
import os
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [12]:
#importing inbuild libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn
import os
import gc
gc.enable()
from operator import itemgetter
from tqdm import tqdm
from scipy import optimize
from sklearn.metrics import mean_squared_error as MSE
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.gaussian_process import kernels
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")



In [13]:
#Taking log as a global variable
log = ""


In [14]:
#Establishing directoyires to read original as well as incomplete datasets
def read_subsets_and_original(BASE_PATH, ORIGINAL_BASE_PATH, dataset_name):
  sub_incomplete_dataset = os.listdir(BASE_PATH + dataset_name)
  print ("Found Total Subsets : ", len(sub_incomplete_dataset))
  original = pd.read_csv(ORIGINAL_BASE_PATH +'data12.csv', header=None)
  original = original.infer_objects()
  subsets = {}
  for each in tqdm(sub_incomplete_dataset, total=len(sub_incomplete_dataset)):
    subsets[each.split('.')[0]] = pd.read_csv(BASE_PATH + dataset_name + '/' + each, header=None)
  return subsets, original


In [15]:
#Calculate NRMS value (Formula according to the documentation)
def calculate_NRMS(y_true, y_pred):
  upper_values = y_pred - y_true
  #CHECK DOCUMENTATION ON https://numpy.org/doc/stable/reference/generated/numpy.linalg.norm.html

  #ord = 'fro' means frobenius norm that is the euclidian distance
  upper_normed = np.linalg.norm(upper_values, ord='fro')
  lower_normed = np.linalg.norm(y_true, ord='fro')
  return upper_normed / lower_normed

In [16]:
from numpy.random.mtrand import shuffle
#Build GRNN Model
class GRNN(BaseEstimator, RegressorMixin):
    #Initializing all the elements
    def __init__(self, kernel='RBF', sigma=0.7, n_splits=5, calibration='warm_start', method='L-BFGS-B', bnds=(0, None), n_restarts_optimizer=0, seed = 42):
        self.kernel = kernel
        self.sigma = sigma
        self.n_splits = n_splits
        self.calibration = calibration
        self.method = method
        self.iterations = 0
        self.bnds = bnds
        self.n_restarts_optimizer = n_restarts_optimizer
        self.seed = seed
        
    def fit(self, X, y):

        # Check that X and y have correct shape
        # X, y = check_X_y(X, y)
        
        self.X_ = X
        self.y_ = y
        bounds = self.bnds
        
        np.seterr(divide='ignore', invalid='ignore')
         #Initializaing and establishing the cost function
        def cost(sigma_):
            kf = KFold(n_splits= self.n_splits)
            kf.get_n_splits(self.X_)
            cv_err = []
            for train_index, validate_index in kf.split(self.X_):
                X_tr, X_val = self.X_[train_index], self.X_[validate_index]
                y_tr, y_val = self.y_[train_index], self.y_[validate_index]
                Kernel_def_= getattr(kernels, self.kernel)(length_scale=sigma_)
                K_ = Kernel_def_(X_tr, X_val)
                # If the distances are very high/low, zero-densities must be prevented:
                K_ = np.nan_to_num(K_)
                psum_ = K_.sum(axis=0).T # Cumulate denominator of the Nadaraya-Watson estimator
                psum_ = np.nan_to_num(psum_)
                y_pred_ = (np.dot(y_tr.T, K_) / psum_)
                y_pred_ = np.nan_to_num(y_pred_)
                cv_err.append(MSE(y_val, y_pred_.T))
                break
            return cv_err[0] ## Mean error over the k splits                        
        
        #Establising the optimization function
        def optimization(x0_):
            rlog = ""
            if len(self.bnds) > 1:
              self.bnds = (self.bnds[0], )


            try:
              if len(x0_) > 1:
                x0_ = x0_[0]
            except:
              rlog = "x0_ is Good Enough"
               # print ("x0_", x0_)
            # print ("Bounds : ", self.bnds)
            opt = optimize.minimize(cost, x0_, method=self.method, bounds=self.bnds)
            if opt['success'] is True:
                opt_sigma = opt['x']
                opt_cv_error = opt['fun']
            else:
                opt_sigma = np.full(len(self.X_[0]), np.nan)
                opt_cv_error = np.inf
                pass
            return [opt_sigma, opt_cv_error]
        
        #Regulating and calibrating sigma
        def calibrate_sigma(self):
            x0 = np.asarray(self.sigma) # Starting guess (either user-defined or measured with warm start)
            if self.n_restarts_optimizer > 0:
                # print ("################################")    
                optima = [optimization(x0)]            
                #First optimize starting from theta specified in kernel
                optima = [optimization(x0)] 
                # # Additional runs are performed from log-uniform chosen initial bandwidths
                r_s = np.random.RandomState(self.seed)
                for iteration in range(self.n_restarts_optimizer): 
                    x0_iter = np.full(len(self.X_[0]), np.around(r_s.uniform(0,1), decimals=3))
                    optima.append(optimization(x0_iter))             
            elif self.n_restarts_optimizer == 0: 
                # print ("Running SAD ONE")    
                optima = [optimization(x0)]            
            else:
                raise ValueError('n_restarts_optimizer must be a positive int!')
            
            # Select sigma from the run minimizing cost
            cost_values = list(map(itemgetter(1), optima))
            self.sigma = optima[np.argmin(cost_values)][0]
            self.cv_error = np.min(cost_values) 
            return self
        global log
        if self.calibration is 'warm_start':
            log = log + 'Executing warm start...' + '/n'
            self.bnds = (bounds,)           
            x0 = np.asarray(self.sigma)
            optima = [optimization(x0)]            
            cost_values = list(map(itemgetter(1), optima))
            self.sigma = optima[np.argmin(cost_values)][0]
            log = log + 'Warm start concluded. The optimum isotropic sigma is ' + str(self.sigma) + '/n'
            self.sigma = np.full(len(self.X_[0]), np.around(self.sigma, decimals=3))
            self.bnds = (bounds,)*len(self.X_[0])
            # print ('Executing gradient search...')
            calibrate_sigma(self)
            log = log + 'Gradient search concluded. The optimum sigma is ' + str(self.sigma) + '/n'
        elif self.calibration is 'gradient_search':
            #print ('Executing gradient search...')
            self.sigma = np.full(len(self.X_[0]), self.sigma)
            self.bnds = (bounds,)*len(self.X_[0])
            calibrate_sigma(self)
            #print('Gradient search concluded. The optimum sigma is ' + str(self.sigma))
        else:
            pass
                   
        self.is_fitted_ = True
        # Return the regressor
        return self
   #Gathering all the above and predicting the values 
    def predict(self, X):
        
         # Check if fit had been called
        # check_is_fitted(self, ['X_', 'y_'])
        
        # Input validation
        X = check_array(X)
        
        Kernel_def= getattr(kernels, self.kernel)(length_scale=self.sigma)
        K = Kernel_def(self.X_, X)
        # If the distances are very high/low, zero-densities must be prevented:
        K = np.nan_to_num(K)
        psum = K.sum(axis=0).T # Cumulate denominator of the Nadaraya-Watson estimator
        psum = np.nan_to_num(psum)
        return np.nan_to_num((np.dot(self.y_.T, K) / psum))


In [17]:
#saga feature selection
def SAGA_FEATURE_SELECTION(X_train, y_train):
  model_logistic = Ridge(solver='saga')
  sel_model_logistic = SelectFromModel(estimator=model_logistic)
  X_train_sfm_l1 = sel_model_logistic.fit_transform(X_train.values, y_train.values)
  Indicator_columns = sel_model_logistic.get_support()
  return Indicator_columns #SAGA BASED FEATURE SELECTION

In [22]:
#Reading the data to run our model on
BASE = 'gdrive/My Drive/'
BASE_PATH = BASE + 'incomplete/'
ORIGINAL_BASE_PATH = BASE + 'complete_dataset/'
data = pd.read_excel(BASE + 'Table-NRMS.xlsx')
#data = pd.read_csv(BASE + 'List.csv')


In [23]:
NRMS_DICT = {}


In [None]:
import time
begin_time = time.time()

for index, row in tqdm(data.iterrows(), total=data.shape[0]): #For Dataset
  subsets, original = read_subsets_and_original(BASE_PATH, ORIGINAL_BASE_PATH, row['Datasets']) # Get All Subsets and Original Dataset
  subset_names = list(subsets.keys())
  print(subset_names)
  #ITERATE OVER ALL SUBSETS OF A DATASET AND APPLY GRNN ON EACH ONE

  for each_subset_name in subset_names:

    #SELECTING A SUBSET
    selected_subset = subsets[each_subset_name]
    #print(selected_subset)


    new_prediction = np.zeros(shape=original.shape) #SAMPLE ARRAY TO SAVE PREDICTIONS
    #print(new_prediction.shape)
    #print(selected_subset.columns)
    new_prediction = pd.DataFrame(data = new_prediction, columns=selected_subset.columns)



    #COLUMNS ARRAY TO ITERATE
    all_cols = np.array(original.columns) 
    for each in tqdm(all_cols, total=len(all_cols)):



      #ONE COLUMN IN TEST AND OTHERS IN TRAINING
      train_cols = all_cols[all_cols != each] 
      test_col = each

      #CHECKING IF THERE ARE NULL VALUES IN OUR TEST COLUMNS
      nulls = selected_subset[each].isnull() 
     
      test_index = nulls[nulls == True].index
      train_index = nulls[nulls == False].index
    #  print(test_index.shape)
      #print(test_index.shape[0] / float(nulls.shape[0]))



      #IF THERE IS NO NULL VALUE THEN WE WONT APPLY GRNN
      if test_index.shape[0] == 0:
        new_prediction[each] = original[each].copy()
    

      else:

        #TRAIN GRNN ON INDEX WHERE THERE IS NO NULL AND PREDICT ON NULL VALUES
        custom_GRNN = GRNN()
        #print('in grnn')
        SAGA_BASED_FEATURES = SAGA_FEATURE_SELECTION(original[train_cols].loc[train_index], original[test_col].loc[train_index])
        #print(SAGA_BASED_FEATURES) #SAGA
    


        #Normalization
        normalizer = StandardScaler()

        train_X = original[train_cols[SAGA_BASED_FEATURES]].loc[train_index].values
        train_Y = original[test_col].loc[train_index].values
       # print(train_X)
        test_X = original[train_cols[SAGA_BASED_FEATURES]].loc[test_index].values
        #print(train_X, train_Y)

        normalizer.fit(train_X, train_Y)

        normalizer_train_X = normalizer.transform(train_X)
        normalizer_test_X = normalizer.transform(test_X)
        ############################################
        ############################################
        


        custom_GRNN.fit(normalizer_train_X, train_Y)
        #PREDICT
        prediction_smothened = custom_GRNN.predict(normalizer_test_X)
        #print(prediction_smothened)

        #FILL OUR SAVING ARRAY WITH PREDICTIONS
        new_prediction[each].loc[train_index] = selected_subset[each].loc[train_index]
        new_prediction[each].loc[test_index] = prediction_smothened
       # print(each_subset_name)
       # print(new_prediction)
        new_prediction.to_csv(BASE + "imputed_" + each_subset_name + ".csv", index=False)
        NRMSE = calculate_NRMS(original.values, new_prediction.values)
        print(NRMSE)
        log = log + "Done Smoothing of : " + each_subset_name + " with NRMS : " + str(NRMSE) + '/n/n/n'
        NRMS_DICT[each_subset_name] = NRMSE

end_time = time.time()
diff = end_time - begin_time
print(diff)
print(NRMS_DICT)
#print(log)
#df = pd.DataFrame(NRMS_DICT.values())
value = pd.DataFrame.from_dict(NRMS_DICT, orient='index')
#logs = log.split('/n')
#for each in logs:
#    x = each.split(' ')   
#    if len(x) > 0:
#        if x[0] == 'Done':
##            df['NRMS'] = x[8]
 #           df['Datasets'] = x[4]
 #          print(x[8], x[4])#

print(value)
value.to_csv('Table.csv')

  0%|          | 0/38 [00:00<?, ?it/s]

Found Total Subsets :  1



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  1.35it/s]


['Data_12_AG_10%']



  0%|          | 0/51 [00:00<?, ?it/s][A
  2%|▏         | 1/51 [01:19<1:06:28, 79.76s/it][A

0.9962064525069462



  4%|▍         | 2/51 [02:29<1:00:08, 73.65s/it][A

0.9927516946935744



  6%|▌         | 3/51 [03:42<58:54, 73.63s/it]  [A

0.9892970662926144



  8%|▊         | 4/51 [04:54<57:01, 72.79s/it][A

0.985499801405672



 10%|▉         | 5/51 [06:06<55:33, 72.48s/it][A

0.982067598119838



 14%|█▎        | 7/51 [07:16<39:12, 53.46s/it][A

0.9748374684445172



 16%|█▌        | 8/51 [09:01<48:08, 67.18s/it][A

0.9709674793053821



 18%|█▊        | 9/51 [10:15<48:26, 69.21s/it][A

0.967332605019645



 20%|█▉        | 10/51 [12:05<55:06, 80.66s/it][A

0.9633560024306208



 22%|██▏       | 11/51 [13:24<53:32, 80.31s/it][A

0.9596323404483786



 24%|██▎       | 12/51 [15:44<1:03:19, 97.43s/it][A

0.9557773412917548



 25%|██▌       | 13/51 [17:30<1:03:24, 100.11s/it][A

0.951467769051495



 29%|██▉       | 15/51 [18:48<43:18, 72.17s/it]   [A

0.943374213863127



 31%|███▏      | 16/51 [20:36<47:18, 81.09s/it][A

0.9390939441153707



 35%|███▌      | 18/51 [22:15<37:22, 67.96s/it][A

0.8642521575197007



 37%|███▋      | 19/51 [23:58<40:29, 75.91s/it][A

0.7926963537712359



 39%|███▉      | 20/51 [25:32<41:35, 80.49s/it][A

0.7879069938558851



 41%|████      | 21/51 [26:50<39:54, 79.82s/it][A

0.7836881946879954



 43%|████▎     | 22/51 [28:29<41:01, 84.87s/it][A

0.7791907862184076



 45%|████▌     | 23/51 [30:26<43:45, 93.77s/it][A

0.7747733702810468



 47%|████▋     | 24/51 [32:36<46:50, 104.11s/it][A

0.7697798731708003



 49%|████▉     | 25/51 [34:07<43:30, 100.42s/it][A

0.7648450555418388



 51%|█████     | 26/51 [35:33<40:07, 96.31s/it] [A

0.7601050225767478



 53%|█████▎    | 27/51 [37:19<39:37, 99.08s/it][A

0.7546026189762047



 55%|█████▍    | 28/51 [38:52<37:16, 97.26s/it][A

0.6718190844677029



 57%|█████▋    | 29/51 [40:29<35:35, 97.08s/it][A

0.666380832485477



 59%|█████▉    | 30/51 [42:03<33:43, 96.36s/it][A

0.6602853279046967



 61%|██████    | 31/51 [43:22<30:22, 91.13s/it][A

0.6550267467830225



 63%|██████▎   | 32/51 [45:15<30:53, 97.55s/it][A

0.6492395071054988



 65%|██████▍   | 33/51 [46:47<28:47, 95.96s/it][A

0.5447276495234605



 67%|██████▋   | 34/51 [48:14<26:23, 93.15s/it][A

0.5380351589984463



 69%|██████▊   | 35/51 [50:00<25:54, 97.17s/it][A

0.5311191897665728



 71%|███████   | 36/51 [51:21<23:02, 92.14s/it][A

0.5236859161704415



 73%|███████▎  | 37/51 [53:10<22:41, 97.22s/it][A

0.516287204912546



 75%|███████▍  | 38/51 [54:32<20:04, 92.63s/it][A

0.5089545438666783



 76%|███████▋  | 39/51 [55:50<17:40, 88.36s/it][A

0.3423120426335662



 78%|███████▊  | 40/51 [57:32<16:57, 92.48s/it][A

0.3325266683593524



 80%|████████  | 41/51 [58:50<14:41, 88.10s/it][A

0.32096304753397953



 82%|████████▏ | 42/51 [59:58<12:20, 82.24s/it][A

0.30936137852089013



 84%|████████▍ | 43/51 [1:01:31<11:22, 85.29s/it][A

0.29675570112718863



 86%|████████▋ | 44/51 [1:02:47<09:38, 82.64s/it][A

0.28421696312763917



 88%|████████▊ | 45/51 [1:04:09<08:14, 82.35s/it][A

0.2724596023593752



 90%|█████████ | 46/51 [1:06:09<07:48, 93.78s/it][A

0.2573781833581079



 92%|█████████▏| 47/51 [1:07:25<05:53, 88.38s/it][A

0.24411905602909345



 94%|█████████▍| 48/51 [1:08:43<04:15, 85.30s/it][A

0.22891271891249917



 96%|█████████▌| 49/51 [1:10:23<02:58, 89.49s/it][A

0.21142963324362768



100%|██████████| 51/51 [1:11:48<00:00, 84.48s/it]
  3%|▎         | 1/38 [1:11:50<44:17:53, 4310.09s/it]

0.1939019999046496
Found Total Subsets :  1



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  1.25it/s]


['Data_12_AG_20%']



  0%|          | 0/51 [00:00<?, ?it/s][A