In [842]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
from sklearn.neighbors import NearestNeighbors
from scipy.spatial import distance

In [843]:
# Read the data
folder = "/Users/schmuck/Library/CloudStorage/OneDrive-IndianaUniversity/PhD/DSAA_23/data/LowFreqData"
train_filename = "Raw_train_data.csv"
test_filename = "Raw_test_data.csv"
train_data = pd.read_csv(os.path.join(folder, train_filename))
test_data = pd.read_csv(os.path.join(folder, test_filename))
print(test_data.shape)

(77, 4)


In [844]:
# Create whole data
whole_data = np.array(pd.concat([train_data, test_data], axis=0))
print(whole_data.shape)

# Find global and volt wise average
glob_real_meu = np.mean(whole_data[:, 2])
glob_imag_meu = np.mean(whole_data[:, 3])

print(glob_real_meu, glob_imag_meu)

(533, 4)
0.060587598499061916 0.23309621622889304


In [845]:
# Find volt and Frwq wise meu
volt_meu = {}
volt_sd = {}

freq_meu = {}
freq_sd = {}

u_volts = np.unique(whole_data[:, 0])
u_freqs = np.unique(whole_data[:, 1])


for volt in u_volts:
    
    volt_meu[volt] = [np.mean(whole_data[np.where(whole_data[:, 0] == volt), 2]), np.mean(whole_data[np.where(whole_data[:, 0] == volt), 3])]
    volt_sd[volt]  = [np.std(whole_data[np.where(whole_data[:, 0] == volt), 2]), np.std(whole_data[np.where(whole_data[:, 0] == volt), 3])]


for freq in u_freqs:

    freq_meu[freq] = [np.mean(whole_data[np.where(whole_data[:, 1] == freq), 2]), np.mean(whole_data[np.where(whole_data[:, 1] == freq), 3])]
    freq_sd[freq] = [np.std(whole_data[np.where(whole_data[:, 1] == freq), 2]), np.std(whole_data[np.where(whole_data[:, 1] == freq), 3])]

print(volt_meu)
print(volt_sd)

print(freq_meu)
print(freq_sd)


{3.177: [0.1315035164835165, 0.7816492193406592], 3.387: [0.04424061728395062, 0.02765265975308642], 3.65: [0.03500186813186813, 0.08220925604395604], 3.879: [0.045561648351648346, 0.09940095362637363], 4.187: [0.053253068181818186, 0.13193815988636362], 4.193: [0.0519267032967033, 0.2498159527472527]}
{3.177: [0.17098093011315507, 2.3186609831050675], 3.387: [0.03034656033738121, 0.060702540650058506], 3.65: [0.03378709103616213, 0.2428475340422501], 3.879: [0.04665991359661285, 0.2811675525043224], 4.187: [0.04273898960705219, 0.373600703169352], 4.193: [0.0468222460055034, 0.745901680624925]}
{9.999e-06: [0.38812500000000005, 5.468], 1.259e-05: [0.30695, 4.29875], 1.585e-05: [0.317625, 3.4719250000000006], 1.995e-05: [0.2569, 2.6655799999999994], 2.512e-05: [0.22519999999999998, 2.1241000000000003], 3.162e-05: [0.209578, 1.6901], 3.983e-05: [0.187718, 1.34868], 5.009e-05: [0.172646, 1.07906], 6.308e-05: [0.15692599999999998, 0.8623], 7.948e-05: [0.14768, 0.6908799999999999], 9.99e-0

In [846]:
# # Find how many new rows are required
# ### Algorithm
# syn_data = [[], []]

# """
# For each potential,
# Parse all the frequency values that lie in that cell potential
# and find if there are marge gaps between frequency values,
# for e.g. if there is a gap of 0.10, 0.20 between successive frequency values.

# Based on manual data eye-balling- this pattern was observed for low frequency values
# so, depending on the size of the gap - put artificial frequency values in the specific gap to 
# fill that gap, the value of voltage will be the same as the original voltage
# """

# syn_volt = []
# syn_freq = []
# syn_z_r = []
# syn_z_i = []

# # u_volts = [3.177]

# for volt in u_volts:

#     print("Volt: ", volt)
#     # print("Before: Number of Volts: ", temp.shape[0])
    
#     # Subset the data
#     temp = whole_data[np.where(whole_data[:, 0] == volt)[0], :]
#     temp = temp[temp[:, 2].sort(), :].reshape(temp.shape)

#     z_real = temp[:, 2]

#     z_f = 0
#     z_s = 1


#     ### real impedance
#     # Parse the frequencies
#     while z_s < len(z_real):

#         diff = z_real[z_s] - z_real[z_f]
        
#         # Check if synthetic values are needed, at all!
#         if diff >= volt_meu[volt][0]:

#             # How many synthetic values?
#             num_vals = int(np.round(np.ceil(diff/volt_meu[volt][0])))

#             # get corresponding frquencies
#             start_freq = temp[z_f, 1]
#             end_freq = temp[z_s, 1]

#             freq_interval = (end_freq - start_freq)/num_vals

#             for i in range(num_vals):
#                 noise = np.random.normal(size=1, scale=0.00001)[0]
#                 syn_freq.append(start_freq+(freq_interval*(i+1))+noise)
#                 syn_z_r.append(0)
#                 syn_z_i.append(0)
#                 syn_volt.append(volt)

#         z_f += 1
#         z_s += 1


#     temp = whole_data[np.where(whole_data[:, 0] == volt)[0], :]
#     temp = temp[temp[:, 3].sort(), :].reshape(temp.shape)
    
#     z_imag = temp[:, 3]

#     z_f = 0
#     z_s = 1

#     ##### Imaginary impedance
#     # Parse the frequencies
#     while z_s < len(z_imag):

#         diff = z_imag[z_s] - z_imag[z_f]
        
#         # Check if synthetic values are needed, at all!
#         if diff >= volt_meu[volt][0]:

#             # How many synthetic values?
#             num_vals = int(np.round(np.ceil(diff/volt_meu[volt][0])))

#             # get corresponding frquencies
#             start_freq = temp[z_f, 1]
#             end_freq = temp[z_s, 1]

#             freq_interval = (end_freq - start_freq)/num_vals

#             for i in range(num_vals):
#                 noise = np.random.normal(size=1, scale=0.00001)[0]
#                 syn_freq.append(start_freq+(freq_interval*(i+1))+noise)
#                 syn_z_r.append(0)
#                 syn_z_i.append(0)
#                 syn_volt.append(volt)

#         z_f += 1
#         z_s += 1


# # print("After: Number of Volts: ", len(syn_volt))
# # print("After: Number of Freqs: ", len(syn_freq))

# new_syn_data = np.array(pd.DataFrame(list(zip(syn_volt, syn_freq, syn_z_r, syn_z_i)), columns=["Volt", "Freq", "Zreal", "Zimag"]))

# # # whole_data = pd.DataFrame(whole_data)
# # # whole_data.columns = ["Volt", "Freq", "Zreal", "Zimag"]

# # new_syn_data = pd.concat([whole_data, new_syn_data], axis=0)

# # print("Shape before synthetic data: ", whole_data.shape)
# print("Shape after synthetic data: ", new_syn_data.shape)


In [847]:
# a = [1,2,3]
# b = [4,5,6]
# c = pd.DataFrame(list(zip(a, b)), columns=["A", "B"])
# c

# np.random.normal(size=1, scale=0.00001)[0]

In [848]:
# Find how many new rows are required
### Algorithm
syn_data = [[], []]

"""
For each potential,
Parse all the frequency values that lie in that cell potential
and find if there are marge gaps between frequency values,
for e.g. if there is a gap of 0.10, 0.20 between successive frequency values.

Based on manual data eye-balling- this pattern was observed for low frequency values
so, depending on the size of the gap - put artificial frequency values in the specific gap to 
fill that gap, the value of voltage will be the same as the original voltage
"""

num_vals = 2

syn_volt = []
syn_freq = []
syn_z_r = []
syn_z_i = []

# u_volts = [3.177]

# print(whole_data[1:10,])

for volt in u_volts:

    print("Volt: ", volt)
    
    # Subset the data
    temp = whole_data[np.where(whole_data[:, 0] == volt)[0], :]
    # temp = temp[temp[:, 2].sort(), :].reshape(temp.shape)

    my_m = np.mean(temp[:, 3])
    my_sd = np.mean(temp[:, 3])

    freqs = temp[np.where((temp[:, 1] < 0.0001) & (temp[:, 3] >= my_m + (2*my_sd)))[0], 1]

    # print("Before: Number of Volts: ", temp.shape[0])
    # print("Before: Number of Freqs: ", len(freqs))

    f_p = 0
    s_p = 1

    # Parse the frequencies
    while s_p < len(freqs):

        start_freq = freqs[f_p] 
        end_freq = freqs[s_p]

        freq_interval = (end_freq - start_freq)/num_vals

        for i in range(num_vals):
            noise = np.random.normal(size=1, scale=0.00001)[0]
            syn_freq.append(start_freq+(freq_interval*(i+1))+noise)
            syn_z_r.append(0)
            syn_z_i.append(0)
            syn_volt.append(volt)
        
        f_p += 1
        s_p += 1

    # print("After: Number of Volts: ", len(syn_volt))
    # print("After: Number of Freqs: ", len(syn_freq))

# Get everything back into a numpy array
new_syn_data = np.array(pd.DataFrame(list(zip(syn_volt, syn_freq, syn_z_r, syn_z_i)), columns=["Volt", "Freq", "Zreal", "Zimag"]))

# whole_data = pd.DataFrame(whole_data)
# whole_data.columns = ["Volt", "Freq", "Zreal", "Zimag"]

# new_syn_data = pd.concat([whole_data, new_syn_data], axis=0)

# print("Shape before synthetic data: ", whole_data.shape)
# print("Shape after synthetic data: ", new_syn_data.shape)

Volt:  3.177
Volt:  3.387
Volt:  3.65
Volt:  3.879
Volt:  4.187
Volt:  4.193


In [849]:
## Find the nearest neighbors
distances = distance.cdist(new_syn_data, new_syn_data, "euclidean")
neighbors = np.argsort(distances)
neighbors = neighbors[:, 0:50]

print("Before imputation: % of zeros: ", (len(np.where(new_syn_data == 0)[0])/(new_syn_data.shape[0] * new_syn_data.shape[1])*100))

max_k = 3
# u_volts = [3.177]

# for volt in u_volts:
#     temp = new_syn_data[np.where(new_syn_data[:, 0] == volt)[0], ]


for i in range(new_syn_data.shape[0]):

    # Check if imputation is needed
    if new_syn_data[i, 2] == 0:

        cnt = 0
        new_val = 0

        # Find the closest neighbors
        for n in neighbors[i, :]:

            if whole_data[n, 2] !=0 and cnt < max_k:
                
                new_val += whole_data[n, 2] #*distances[i, n]
                cnt +=1
            
            elif cnt >= max_k:

                new_syn_data[i, 2] = new_val/max_k
                # new_syn_data[i, 2] += glob_real_meu + (volt_meu[volt][0]-glob_real_meu)
                break


    if new_syn_data[i, 3] == 0:

        cnt = 0
        new_val = 0

        # Find the closest neighbors
        for n in neighbors[i, :]:

            if whole_data[n, 3] !=0 and cnt < max_k:
                
                new_val += whole_data[n, 3] #*distances[i, n]
                cnt +=1
            
            elif cnt >= max_k:
                
                new_syn_data[i, 3] = new_val/max_k
                # new_syn_data[i, 3] += glob_imag_meu + (volt_meu[volt][1]-glob_imag_meu)
                break 


print("After imputation: % of zeros: ", (len(np.where(new_syn_data == 0)[0])/(new_syn_data.shape[0] * new_syn_data.shape[1])*100))

new_syn_data = pd.DataFrame(new_syn_data, columns=["Volt", "Freq", "Zreal", "Zimag"])

whole_data = pd.DataFrame(whole_data)
whole_data.columns = ["Volt", "Freq", "Zreal", "Zimag"]

new_syn_data = pd.concat([whole_data, new_syn_data], axis=0)

print("Shape before synthetic data: ", whole_data.shape)
print("Shape after synthetic data: ", new_syn_data.shape)

# Save to disk
new_syn_data.to_csv("/Users/schmuck/Library/CloudStorage/OneDrive-IndianaUniversity/PhD/DSAA_23/data/LowFreqData/synthetic_data.csv", sep=",", index=False)

Before imputation: % of zeros:  50.0
After imputation: % of zeros:  0.0
Shape before synthetic data:  (533, 4)
Shape after synthetic data:  (603, 4)


In [850]:
# # t = np.array([[4 ,4 ,0, 0, 4], [0 , 0, 1, 0, 2]])
# a = np.array([[1,0,3,0,4], [3,0,1,0,5]])
# # b = np.argsort(a)
# # # b = b[:, 0:3]
# # print(a, "\n", b)

# len(np.where(a == 0)[0])/(a.shape[0] * a.shape[1])


In [851]:
# train_imag = pd.DataFrame(train_imag)
# print(train_imag.shape)
# train_imag.columns = ["Volt", "Freq", "Zimag"]
# train_imag = pd.concat([train_imag, syn_data], axis=0)
# print(train_imag.shape)
# train_imag.to_csv("/Users/schmuck/Library/CloudStorage/OneDrive-IndianaUniversity/PhD/DSAA_23/data/LowFreqData/synthetic_imag_data.csv", sep=",", index=False)