In [542]:
#The Histogram Augmentation Technique (HAT) is widely used to augment and classify tabular data
#HAT is designed so that the generated data retains the distribution of the original tabular data's histogram
#HAT analyses the data distribution of a particular feature and, based on the feature type (i.e. continuous or discrete), generates new samples

In [544]:
#IMPORT important libraries
import numpy as np
import pandas as pd
import time
from collections import Counter
pd.DataFrame.iteritems = pd.DataFrame.items

In [546]:
#define histogram function
#data: This is the original dataset that you want to use for generating new data
#no_new_data: This parameter indicates how much new data you want to generate
#data_feat: This parameter specifies the type of data feature. It can be either 'c' for continuous data or 'd' for discrete data.
#preserve: This parameter determines whether to preserve the original dataset or not while generating new data. If set to True, the original dataset will be included in the generated data; otherwise, it won't be included.
def histogram_sampler(data, no_new_data, data_feat, preserve):
    """Generate ``no_new_data`` synthetic values for one feature from its histogram.

    Parameters
    ----------
    data : sequence
        Original values of a single feature. Any sequence is accepted; it is
        copied into a list and never modified.
    no_new_data : int
        Number of new values to generate.
    data_feat : str
        ``'c'`` for a continuous feature, ``'d'`` for a discrete feature.
    preserve : bool
        If truthy, the returned list is the original values followed by the
        generated ones; otherwise only the generated values are returned.

    Returns
    -------
    list or None
        ``len(data) + no_new_data`` values when ``preserve`` is truthy,
        ``no_new_data`` values otherwise. For any other ``data_feat`` the
        function prints ``'NA'`` and returns ``None``.
    """
    if data_feat == 'c':
        return _sample_continuous(list(data), no_new_data, preserve)
    if data_feat == 'd':
        return _sample_discrete(list(data), no_new_data, preserve)
    print('NA')
    return None


def _sample_continuous(data, no_new_data, preserve):
    """Sample a continuous feature from successively finer histograms of ``data``."""
    start_time = time.time()
    print("Existing data:", len(data))
    print("New data to be produced:", no_new_data)

    n_existing = len(data)
    data_gen = data      # original values followed by everything generated so far
    len_X_new = 0        # number of values generated so far
    iter_count = 0
    n_bins = None        # fixed by the first histogram, then doubled every pass

    # The histogram passes only need to reach 70% of the request; any
    # shortfall is topped up by the discrete sampler further down.
    while len_X_new < 0.7 * no_new_data:
        iter_count += 1
        print("Number of iterations ---> niter_count=", iter_count)

        # First pass: Doane's rule picks the bin count from the data.
        # Later passes reuse the (doubled) bin count on the augmented data.
        if iter_count == 1:
            Y, X_interval = np.histogram(data_gen, bins='doane')
            n_bins = len(Y)
        else:
            Y, X_interval = np.histogram(data_gen, bins=n_bins)

        # Bin centres (rounded to 8 decimals) and frequencies scaled so that
        # the tallest bin has weight 1.
        X = (X_interval[0:-1] + X_interval[1:]) / 2
        Y = Y / max(Y)
        bin_val = list(np.round(X, 8))
        weight = list(Y)

        X_new = []
        if len(bin_val) > 1:
            # Loop-invariant spacing between neighbouring bin centres.
            bin_width = (max(bin_val) - min(bin_val)) / (len(bin_val) - 1)

            # Pair every bin with its right-hand neighbour by position.
            # Looking the neighbour up by rounded centre value breaks when
            # two centres round to the same number.
            for idx in range(len(bin_val) - 1):
                x1 = bin_val[idx]
                xm = x1 + (bin_width / 2)
                y1 = weight[idx]
                ym = (y1 + weight[idx + 1]) / 2

                # Validity check: a candidate survives when a half-normal
                # draw (sigma = 0.5) does not exceed its weight.
                ym = ym * (abs(np.random.normal(0, 0.5)) <= ym)
                y1 = y1 * (abs(np.random.normal(0, 0.5)) <= y1)

                if ym != 0:
                    X_new.append(np.round(xm, 8))
                if y1 != 0:
                    X_new.append(np.round(x1, 8))

        data_gen = data_gen + X_new
        n_bins = n_bins * 2
        len_X_new += len(X_new)
        print(len_X_new)
        print("--- %s seconds ---" % (time.time() - start_time))

    n_generated = len(data_gen) - n_existing
    print(n_generated, no_new_data)

    if n_generated >= no_new_data:
        # Enough (or too many) candidates: keep a random subset of exactly
        # the requested size.
        chosen = list(np.random.choice(data_gen[n_existing:], no_new_data, replace=False))
        data_gen = data_gen[:n_existing] + chosen
        print('\nNew data generated:', len(data_gen[n_existing:]), '\nNew data:', len(data_gen), '\n')
        if not preserve:
            data_gen = data_gen[n_existing:]
        return data_gen

    # Too few candidates: let the discrete sampler resample the augmented
    # data to cover the remaining values.
    shortfall = no_new_data - n_generated
    print('to discrete...', shortfall)
    samples = _sample_discrete(data_gen, shortfall, True)
    if not preserve:
        samples = samples[n_existing:]
    return samples


def _sample_discrete(data, no_new_data, preserve):
    """Resample a discrete feature proportionally to the frequency of each distinct value."""
    disc_data = list(set(data))
    occurrences = Counter(data)   # one pass instead of data.count() per value

    # Share of the request allotted to each distinct value.
    X_new = [round(occurrences[value] * no_new_data / len(data)) for value in disc_data]
    data_gen = [value for value, copies in zip(disc_data, X_new) for _ in range(copies)]

    # Every share rounded down to zero: fall back to the original values.
    if len(data_gen) == 0:
        data_gen = data + data_gen

    print(no_new_data, sum(X_new))
    if no_new_data > sum(X_new):
        # Rounding left the pool short: widen it with the original values
        # plus a random top-up so that the final draw has enough candidates.
        data_gen = data + data_gen
        top_up = np.random.choice(data_gen, int(no_new_data - sum(X_new)), replace=False)
        data_gen = list(data_gen + list(top_up))

    data_gen = list(np.random.choice(data_gen, int(no_new_data), replace=False))

    if preserve:
        data_gen = data + data_gen

    print('\nNew data generated:', len(data_gen[len(data):]), '\nNew data:', len(data_gen), '\n')
    return data_gen


In [548]:
#df_class: DataFrame containing the data to augment, including the label column (class_balance passes the rows of one class).
#diff: Number of new samples to generate for each feature column; it is passed to histogram_sampler as no_new_data.
#label_name: The label value that is written into the label column of the returned DataFrame.
#cd: Sequence of feature types, one entry per feature column in column order: 'c' for continuous or 'd' for discrete.
#label_column: Name of the column that contains the labels; it is left out of the sampling and re-added to the result.
#preserve: If set to `True`, the original values are included in the returned data; otherwise only the generated values are returned.
def label_split(df_class, diff, label_name, cd, label_column, preserve):
    """Augment every feature column of one class and re-attach its label.

    Parameters
    ----------
    df_class : pandas.DataFrame
        Rows to augment, including ``label_column``. The frame is not
        modified.
    diff : int
        Number of new values to generate per feature column (forwarded to
        ``histogram_sampler`` as ``no_new_data``).
    label_name : scalar
        Label value written into ``label_column`` of the result.
    cd : sequence of str
        Feature type of each feature column, in column order
        (``'c'`` or ``'d'``).
    label_column : str
        Name of the label column.
    preserve : bool
        Forwarded to ``histogram_sampler``.

    Returns
    -------
    pandas.DataFrame
        One column per feature holding the sampler output, plus
        ``label_column`` filled with ``label_name``.
    """
    # Work on a label-free copy rather than deleting the column from the
    # caller's frame.
    features = df_class.drop(columns=[label_column])

    df_ = pd.DataFrame(columns=[])
    # .items() is available on every pandas version, so the iteritems alias is not needed here.
    for cdi, (columnName, columnData) in enumerate(features.items()):
        print(columnName)
        df_[columnName] = histogram_sampler(list(columnData.values), diff, cd[cdi], preserve)

    df_[label_column] = label_name
    print(df_)

    return df_



In [550]:
def class_balance(data, label_column, cd, augment = False, preserve = True):
    """Augment every class of ``data`` and return the combined DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns and ``label_column``.
    label_column : str
        Name of the column the rows are grouped by.
    cd : sequence of str
        Feature type of each feature column (``'c'`` or ``'d'``), forwarded
        to ``label_split``.
    augment : False, dict or int, optional
        * ``False`` (default; ``0`` is treated the same): every class
          receives as many new rows as it lacks compared with the largest
          class.
        * ``dict``: maps each label to the number of new rows for that class.
        * positive ``int``: total number of new rows, shared between the
          classes in proportion to their current sizes.
    preserve : bool, optional
        Forwarded to ``label_split``.

    Returns
    -------
    pandas.DataFrame
        The per-class results of ``label_split`` concatenated row-wise.

    Raises
    ------
    TypeError
        If ``augment`` is none of the supported kinds.
    ValueError
        If ``augment`` is a negative integer or ``data`` has no labelled rows.
    KeyError
        If ``augment`` is a dict that lacks one of the labels.
    """
    # Classify the request up front so that an unsupported value fails with
    # a clear message before any sampling work is done.
    if isinstance(augment, dict):
        mode = 'per_label'
    elif isinstance(augment, (int, np.integer)) and not isinstance(augment, bool) and augment != 0:
        if augment < 0:
            raise ValueError("augment must not be negative, got %r" % (augment,))
        mode = 'total'
    elif augment == False:
        mode = 'balance'
    else:
        raise TypeError("augment must be False, a dict or an int, got %r" % (augment,))

    groups = [(label, df_label) for label, df_label in data.groupby(label_column)]
    if not groups:
        raise ValueError("data contains no labelled rows to balance")
    sizes = [len(df_label) for _, df_label in groups]

    if mode == 'balance':
        maxLength = max(sizes)
        quotas = [maxLength - size for size in sizes]
    elif mode == 'per_label':
        quotas = [augment[label] for label, _ in groups]
    else:
        # Proportional share per class, taken from the frame that was passed
        # in rather than from a notebook-level variable.
        total_rows = sum(sizes)
        quotas = [round(size * augment / total_rows) for size in sizes]

        # Rounding can leave the shares off by a few rows: nudge randomly
        # chosen classes up or down until they add up to the request.
        while sum(quotas) != augment:
            rand_indx = int(np.random.rand() * len(quotas))
            if sum(quotas) > augment:
                if quotas[rand_indx] > 0:
                    quotas[rand_indx] -= 1
            else:
                quotas[rand_indx] += 1

    augmented_list = [
        label_split(df_label, quota, label, cd, label_column, preserve)
        for (label, df_label), quota in zip(groups, quotas)
    ]

    # A single concat instead of growing a frame inside a loop.
    return pd.concat(augmented_list, axis=0)
     

In [552]:
# Start of the worked example: load the dataset that will be augmented.
# 'IRIS.csv' is given as a relative path, so it is read from the current working directory.
# The balancing demo needs an imbalanced dataset; if yours is balanced, make it imbalanced first (a later cell does this for Iris).
# Running the remaining cells prints the augmented table and the execution time.
df = pd.read_csv('IRIS.csv')
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [554]:
# The label column has to be integer-coded, so every species name gets a number.
species_mapping = {
    name: code
    for code, name in enumerate(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], start=1)
}

# Make the data imbalanced by keeping only the first 70 rows, and swap the
# species names for their numeric codes on an independent copy.
df_unbalanced = df.iloc[:70].assign(
    species=lambda frame: frame['species'].map(species_mapping)
)

# Save the modified dataset (without the index column) to a new CSV file.
df_unbalanced.to_csv('Iris_unbalanced.csv', index=False)

print("Unbalanced dataset saved as Iris_unbalanced.csv")
print(df_unbalanced)


Unbalanced dataset saved as Iris_unbalanced.csv
    sepal_length  sepal_width  petal_length  petal_width  species
0            5.1          3.5           1.4          0.2        1
1            4.9          3.0           1.4          0.2        1
2            4.7          3.2           1.3          0.2        1
3            4.6          3.1           1.5          0.2        1
4            5.0          3.6           1.4          0.2        1
..           ...          ...           ...          ...      ...
65           6.7          3.1           4.4          1.4        2
66           5.6          3.0           4.5          1.5        2
67           5.8          2.7           4.1          1.0        2
68           6.2          2.2           4.5          1.5        2
69           5.6          2.5           3.9          1.1        2

[70 rows x 5 columns]


In [556]:
# Show how many rows carry each distinct value of the 'species' column.
print(df_unbalanced['species'].value_counts())

species
1    50
2    20
Name: count, dtype: int64


In [558]:
# Reference point for measuring how long the whole run takes.
start_time = time.time()

# Feature-type flags: four entries, all marked continuous ('c').
cd = ['c'] * 4
print(cd)

# Balance the classes of the imbalanced frame, keeping the original rows.
a_df = class_balance(df_unbalanced, 'species', cd, augment=False, preserve=True)

# Report the elapsed wall-clock time, then display the result.
print("\n\n\n>>>>>>>>> %s seconds " % (time.time() - start_time))
a_df
     

['c', 'c', 'c', 'c']
sepal_length
Existing data: 50
New data to be produced: 0
0 0

New data generated: 0 
New data: 50 

sepal_width
Existing data: 50
New data to be produced: 0
0 0

New data generated: 0 
New data: 50 

petal_length
Existing data: 50
New data to be produced: 0
0 0

New data generated: 0 
New data: 50 

petal_width
Existing data: 50
New data to be produced: 0
0 0

New data generated: 0 
New data: 50 

    sepal_length  sepal_width  petal_length  petal_width  species
0            5.1          3.5           1.4          0.2        1
1            4.9          3.0           1.4          0.2        1
2            4.7          3.2           1.3          0.2        1
3            4.6          3.1           1.5          0.2        1
4            5.0          3.6           1.4          0.2        1
5            5.4          3.9           1.7          0.4        1
6            4.6          3.4           1.4          0.3        1
7            5.0          3.4           1.5      

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1000,3.500000,1.400000,0.200000,1
1,4.9000,3.000000,1.400000,0.200000,1
2,4.7000,3.200000,1.300000,0.200000,1
3,4.6000,3.100000,1.500000,0.200000,1
4,5.0000,3.600000,1.400000,0.200000,1
...,...,...,...,...,...
45,5.4250,2.800000,4.100000,1.471429,2
46,6.3875,2.557143,4.500000,1.021429,2
47,5.6000,2.835714,3.414286,1.107143,2
48,5.2500,3.067857,4.728571,1.492857,2
