In [29]:
# # # Import libraries
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler


import warnings
warnings.filterwarnings('ignore')


In [30]:
# # #  import data
# Read the input CSV into `df`, the DataFrame that the following cells work on.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/dataset/PEMS_missing_data/PEMS_missing_400001_25.csv',
)

In [31]:
# # #  Parameter configurations
# SIGMA = 0.5
# OUTLIER_PECENTAGE = 1

# NOTE: despite its name this is an absolute COUNT of rows to repair
# (522 * 25 = 13050), not a percentage: the repair cell compares it directly
# against the number of rows collected so far.
# NOTE(review): the meaning of the factors 522 and 25 is not visible here;
# 25 presumably matches the "_25" suffix of the input file - TODO confirm.
PERCENTAGE_REPAIRED = 522*25

# Fixed seed so the shuffle below (and therefore the set of rows that end up
# being repaired) is identical on every Restart & Run All.
RANDOM_SEED = 42

length = df.shape[0]

columns = df.columns.tolist()

# Shuffle data (reproducibly) and rebuild a clean 0..n-1 index so that
# positional and label-based row access agree in the later cells.
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

In [32]:
# # # FID for repairing data
def FID_repaired(working_list):
    """Estimate replacement values for the entries of ``working_list`` marked ``'NaN'``.

    (FID presumably stands for Fuzzy Information Decomposition - TODO confirm.)

    Algorithm, with ``t`` the number of ``'NaN'`` markers and ``a``/``b`` the
    minimum/maximum of the observed (non-marker) values:

    1. ``h = (b - a) / t`` is the width of each of ``t`` equal intervals
       covering ``[a, b]``.
    2. The centre of interval ``s`` (``s = 1..t``) is
       ``u_s = (a + (s-1)*h + a + s*h) / 2``.
    3. Every observed value ``x`` contributes to ``u_s`` with the weight
       ``1 - |x - u_s| / h`` when ``|x - u_s| <= h`` and ``0`` otherwise.
    4. The estimate for ``u_s`` is the weight-averaged observed value; if no
       observed value carries weight, the plain mean of the observed values
       is used instead.

    The s-th estimate is paired with the s-th missing position (in list order).

    Parameters
    ----------
    working_list : list
        Numeric values, with the string ``'NaN'`` marking each entry to be
        estimated. The list is NOT modified.

    Returns
    -------
    list
        ``[index, M]`` where ``index`` lists the positions of the ``'NaN'``
        markers and ``M`` holds one estimate per position, in the same order.
        ``[[], []]`` is returned when nothing is marked as missing.

    Raises
    ------
    ValueError
        If entries are marked missing but no observed value remains to
        estimate them from.
    """
    missing_marker = 'NaN'

    # Split positions of markers from the observed values in one pass,
    # working on fresh lists so the caller's list is left untouched.
    index = []
    observed = []
    for position, value in enumerate(working_list):
        if value == missing_marker:
            index.append(position)
        else:
            observed.append(value)

    t = len(index)  # Total number of missing values
    if t == 0:
        # Nothing to repair; also avoids dividing by t == 0 below.
        return [index, []]

    if not observed:
        raise ValueError(
            "FID_repaired needs at least one observed (non-'NaN') value"
        )

    # Fallback estimate for intervals that no observed value falls near.
    mean = np.mean(observed)

    a = min(observed)
    b = max(observed)

    # Interval width of the discrete universe.
    h = (b - a) / t

    if h == 0:
        # All observed values are identical: the universe collapses to that
        # single value, and the weight formula would divide by zero.
        return [index, [a] * t]

    M = []
    for s in range(1, t + 1):
        # Centre of the s-th interval.
        u = (a + (s - 1) * h + a + s * h) / 2

        # Contribution weight of each observed element x to u.
        weights = [
            1 - abs(x - u) / h if abs(x - u) <= h else 0
            for x in observed
        ]
        weight_total = sum(weights)

        if weight_total == 0:
            m = mean
        else:
            weighted_sum = sum(x * w for x, w in zip(observed, weights))
            m = weighted_sum / weight_total

        M.append(m)

    return [index, M]

In [33]:
# # # Repairing data
# Column '400001' as a plain Python list so single entries can be overwritten
# by position; `y` holds the matching 'label3' values.
data = df['400001'].to_list()
y = df['label3']

# # # Determine the outlier for repairing
# Walk the labels in row order, remember every position whose label equals 0.0
# and stop once PERCENTAGE_REPAIRED positions have been collected.
index_outlier = []
count_outlier = 0
for row_pos, label in enumerate(y):
    if label == 0.0:
        index_outlier.append(row_pos)
    count_outlier = len(index_outlier)
    if count_outlier == PERCENTAGE_REPAIRED:
        break

# Determine the compromised data: overwrite the selected positions with the
# 'NaN' string marker that FID_repaired searches for.
for row_pos in index_outlier:
    data[row_pos] = 'NaN'

# Recover compromised data; `results` is [positions, estimated values].
working_list = data
results = FID_repaired(working_list)

In [34]:
# # # Update the predicted data into dataset
# `results` is [row positions, recovered values]. Write everything back with
# two vectorised assignments instead of copying, editing and re-inserting one
# whole row per value. Addressing is purely positional (.iloc) for both rows
# and columns, so it does not depend on the DataFrame's index labels and does
# not go through a temporary row Series (chained assignment).
repaired_rows, repaired_values = results
if repaired_rows:
    df.iloc[repaired_rows, df.columns.get_loc('400001')] = repaired_values
    # Relabel the repaired rows with label3 = 1.
    df.iloc[repaired_rows, df.columns.get_loc('label3')] = 1

In [35]:
# # # Save the updated dataset
# Write the current state of `df` to the CSV path below. The DataFrame index
# is written as well, since pandas' default for `index` is left unchanged.
df.to_csv(
    path_or_buf='/content/drive/MyDrive/dataset/PEMS_missing_data/PEMS_missing_repaired_400001_25.csv',
)