# HAT
- Histogram Augmentation Technique (HAT) is used widely to augment and classify any tabular data
- HAT is designed such that the generated data retains the distribution of the original tabular data histogram
- HAT analyses the data distribution of a particular feature and based on the feature type (i.e. continuous or discrete) it generates new samples

# Import Libraries 

In [154]:
#IMPORT important libraries
import numpy as np
import pandas as pd
import time
from collections import Counter
from sklearn.model_selection import train_test_split
pd.DataFrame.iteritems = pd.DataFrame.items
import matplotlib.pyplot as plt

# Load Dataset and Split Data

In [157]:
#load dataset
#data = np.loadtxt("uniform_small_d_1.tex")
data = np.loadtxt("uniform_large_d_1.tex")
#data = np.loadtxt("gaussian_small_d_1.tex")
#data = np.loadtxt("gaussian_large_d_1.tex")

# Creating NumPy array
array = np.array(data)

# Converting to Pandas DataFrame
df_table = pd.DataFrame(array)

# Displaying the table
df_table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,0.604431,0.724054,1.295053,0.495865,0.607451,0.547615,0.56415,0.46888,0.420394,0.910129,...,0.660629,1.325968,1.282151,0.60055,0.592177,0.776711,1.085891,1.153748,1.352572,1.0
1,1.180423,1.391002,1.184481,0.583052,1.21049,0.923676,1.185203,1.369972,1.201448,0.614857,...,0.892705,0.848612,1.298801,1.250497,0.547771,1.215082,0.940952,1.109552,1.181372,1.0
2,1.067779,0.718696,0.798901,1.369462,0.470935,0.566282,1.398846,1.015372,0.801271,1.33027,...,1.339399,0.417466,0.496915,0.661756,0.875185,1.293924,0.750581,0.742218,0.993983,1.0
3,0.368247,0.730771,0.134119,0.984532,0.397524,0.470181,0.025061,0.648142,0.016333,0.973801,...,0.086188,0.394613,0.252668,0.808593,0.587922,0.827502,0.862651,0.684517,0.149873,0.0
4,0.91976,0.577797,0.441661,0.862139,0.263016,0.393494,0.635624,0.657747,0.78192,0.56691,...,0.816635,0.31988,0.770176,0.919029,0.265299,0.983398,0.956898,0.175083,0.170124,0.0


In [159]:
# From the dataset, change 25 columns to 'categorical'
#Loop, converts floats to ints and then those ints to category
for i in range(25):
    df_table.iloc[:,i] = df_table.iloc[:,i].round()
    df_table.iloc[:,i] = df_table.iloc[:,i].astype(int)
    df_table.iloc[:,i] = df_table.iloc[:,i].astype("category")

df_table.iloc[:, 150] = df_table.iloc[:, 150].astype("category")

df_table.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.660629,1.325968,1.282151,0.60055,0.592177,0.776711,1.085891,1.153748,1.352572,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.892705,0.848612,1.298801,1.250497,0.547771,1.215082,0.940952,1.109552,1.181372,1.0
2,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.339399,0.417466,0.496915,0.661756,0.875185,1.293924,0.750581,0.742218,0.993983,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.086188,0.394613,0.252668,0.808593,0.587922,0.827502,0.862651,0.684517,0.149873,0.0
4,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.816635,0.31988,0.770176,0.919029,0.265299,0.983398,0.956898,0.175083,0.170124,0.0


In [161]:
#split dataset into training set and test set
#test_size: in this case it is 70% training and 30% testing
#random_state: sets a seed for a random number generator that splits the data
X_train, X_test, y_train, y_test = train_test_split(df_table.iloc[:,0:150], df_table.iloc[:,-1], test_size=0.2, random_state=52)

In [163]:
#save training data (not necessary)
#X_test.to_csv('X_test_XGB.csv', index=False)
#y_test.to_csv('y_test_XGB.csv', index=False)

# HAT code

In [166]:
#define histogram function
#data: This is the original dataset that you want to use for generating new data
#no_new_data: This parameter indicates how much new data you want to generate
#data_feat: This parameter specifies the type of data feature. It can be either 'c' for continuous data or 'd' for discrete data.
#preserve: This parameter determines whether to preserve the original dataset or not while generating new data. If set to True, the original dataset will be included in the generated data; otherwise, it won't be included.
def histogram_sampler(data, no_new_data, data_feat, preserve):

    if (data_feat == 'c'):
        start_time = time.time()
        print("Existing data:", len(data))
        print("New data to be produced:", no_new_data)

#Function parameters
#X_new = New data for each iteration
#len_X_new = length of the newly generated data
#iter_count = number of iterations
#data_gen = Augmented data (original data + newly generated data)
        
        X_new = []
        len_X_new = len(X_new)
        iter_count = 0
        data_gen = data


         # Histogram calculation and sampling logic goes here...
        # Adjust the condition for the while loop
        while (len_X_new < 0.7 * no_new_data):
            iter_count += 1
            print("Number of iterations ---> niter_count=", iter_count)

            
#Histogram: Generating the histogram , choosing the mid-value of the bins, and normalizing frequency
#fd- Freedman–Diaconis rule is employed to choose the bin size, as it depends on the spread of the data, without any presumption
            if (iter_count == 1):
                Y,X_interval=np.histogram(data_gen,bins='doane')
                n_bins = len(Y)
            else:
                Y,X_interval=np.histogram(data_gen,bins=n_bins)

            
            X = ((X_interval[0:-1] + X_interval[1:])/2) 
            Y = Y/max(Y)

            bin_val = list(np.round(X,8))
            weight = list(Y)
            hist = dict(zip(bin_val,weight))

            for xi in bin_val[0:-1]:

#Values: choosing the values for undergoing validity check

                bin_width = ((max(bin_val) - min(bin_val)) / int(len(bin_val)-1))
                xm = xi + (bin_width/2)
                x1 = xi
                y1 = hist[xi]

                res = None
                temp = iter(hist)
                for key in temp:
                    if(key == xi):
                        res = next(temp,None)

                y2 = hist[res]
                ym = ((y1+y2)/2)
#Validity check: checking if the specified value can be considered
#if(no_new_data <= len(data)):
#ym = ym*(np.random.rand()<=ym)
#y1 = y1*(np.random.rand()<=y1)
    
                #else:
                ym = ym*(abs(np.random.normal(0,0.5))<=ym)
                y1 = y1*(abs(np.random.normal(0,0.5))<=y1)

#Appending: appending the valid values
                

                if (ym!=0):
                    X_new.append(np.round(xm,8))
                    #X_new.append(np.round(xm+0.1*xm,8))
                    #X_new.append(np.round(xm-0.1*xm,8))
                if (y1!=0):
                    X_new.append(np.round(x1,8))
                    #X_new.append(np.round(x1+0.1*xm,8))
                    #X_new.append(np.round(x1-0.1*xm,8))
#Stopping: bins * 2, length check

            data_gen = data_gen + X_new
            n_bins = n_bins*2     
            len_X_new+= len(X_new)
            print(len_X_new)
            X_new = []
            print("--- %s seconds ---" % (time.time() - start_time))


        print(len(data_gen)-len(data),no_new_data)

        if(len(data_gen)-len(data) >= no_new_data):
            data_gen = data_gen[:len(data)] + list(np.random.choice(data_gen[len(data):], no_new_data, replace = False))
            print('\nNew data generated:', len(data_gen[len(data):]), '\nNew data:', len(data_gen), '\n')
            #sns.distplot(data_gen)
            if(preserve == False):
                data_gen = data_gen[len(data):]
            return data_gen

        else:
            print('to discrete...', no_new_data - (len(data_gen)-len(data)))
            samples = histogram_sampler(data_gen, no_new_data - (len(data_gen)-len(data)), 'd', preserve = True)
            if(preserve == False):
                samples = samples[len(data):]
            return samples
            
    elif(data_feat == 'd'):
        X_new=[]
        data_gen=[]
        disc_data= list(set(data))

        for i in disc_data: 
            x=data.count(i)
            X_new.append(round(x*(no_new_data) / len(data)))
        #print(x_new,sum(x_new))

        for j in range(0,len(X_new)):
            for i in range(X_new[j]):
                data_gen.append(disc_data[j])

        if(len(data_gen)==0):
            data_gen = data + data_gen

        print(no_new_data, sum(X_new))            
        if(no_new_data > sum(X_new)):
            data_gen = data + data_gen
            data_gen = list(data_gen + list(np.random.choice(data_gen,int(no_new_data-sum(X_new)),replace = False)))    

        data_gen = list(np.random.choice(data_gen,int(no_new_data),replace = False))   
        
        if(preserve == True):
            data_gen = data + data_gen
    
        #sns.distplot(data_gen)     
        print('\nNew data generated:', len(data_gen[len(data):]), '\nNew data:', len(data_gen), '\n') 
        return data_gen
        
    else:
        print('NA')


In [168]:
#df_class : This parameter represents the DataFrame containing the data.
#diff :This parameter specifies the difference or ratio between the number of samples in the smallest and largest classes after splitting
#label_name: It's the column in your DataFrame that you want to use for splitting the data.
#cd: It can take values 'c' for continuous classes or 'd' for discrete classes.
#label_column: It helps the function identify which column contains the labels or classes.
#preserve: If set to `True`, the original dataset will be included in the split data; otherwise, it won't be included.
def label_split(df_class, diff, label_name , cd, label_column, preserve):
    cdi = 0
    df_ = pd.DataFrame(columns=[])
    del df_class[label_column]
    for (columnName, columnData) in df_class.iteritems(): 
        print(columnName)
        feat_type = cd[cdi]
        df_[columnName] = histogram_sampler(list(columnData.values), diff, feat_type, preserve)
        cdi+=1
    df_[label_column] = label_name
    print(df_)

    return df_



In [170]:
def class_balance(data, label_column, cd, augment = False, preserve = True):
    split_list=[]
    #label_column = 'species'
    for label, df_label in data.groupby(label_column):
        split_list.append(df_label)

    maxLength = max(len(x) for x in split_list)

    if(augment == False):
        augmented_list=[]
        for i in range(0,len(split_list)):

            label_name = list(set(split_list[i][label_column]))[0]
            diff = maxLength - len(split_list[i])
            augmented_list.append(label_split(split_list[i], diff, label_name, cd, label_column, preserve))
        finaldf = pd.DataFrame(columns=[])

    elif(type(augment) == dict):
        augmented_list=[]
        for i in range(0,len(split_list)):

            label_name = list(set(split_list[i][label_column]))[0]
            diff = augment[label_name]
            augmented_list.append(label_split(split_list[i], diff, label_name, cd, label_column, preserve))
        finaldf = pd.DataFrame(columns=[])

    elif(type(augment) == int):

        label_count = dict(Counter(list(df[label_column])))
        count_key = list(label_count.keys())
        count_val = list(label_count.values())

        for i in range(0,len(count_val)):
            count_val[i] = round(count_val[i] * augment  /sum(list(label_count.values())))


        while(sum(count_val) != augment):
            if(sum(count_val) > augment):
                rand_indx = int(np.random.rand() * len(count_val))
                if(count_val[rand_indx] > 0):
                    count_val[rand_indx]-= 1

            else:
                rand_indx = int(np.random.rand() * len(count_val))
                count_val[rand_indx]+= 1

        new_count = dict(zip(count_key, count_val))

        augmented_list=[]
        for i in range(0,len(split_list)):

            label_name = list(set(split_list[i][label_column]))[0]
            diff = new_count[label_name]
            augmented_list.append(label_split(split_list[i], diff, label_name, cd, label_column, preserve))
        finaldf = pd.DataFrame(columns=[])


        #finaldf = class_balance(data, label_column, cd, augment = new_count, preserve = preserve)


    for i in range(0,len(split_list)):
        finaldf = pd.concat([finaldf,augmented_list[i]],axis=0)

    return finaldf
     

# This is where you begin coding!

In [179]:
#Upload your dataset file
#If the dataset is not imbalanced, you MUST make it
#The program will run and give you a final table creating the new data and time for execution 

# Check if label column is imbalanced assuming label column has an index of 150
print(df_table.iloc[:, 150].value_counts())


150
0.0    250
1.0    250
Name: count, dtype: int64


# Imbalance Dataset

In [141]:
# Assuming df_table is your DataFrame and the label column is at index 150
# Create a new label column with 300 zeros and 200 ones
label1 = np.array([0] * 300 + [1] * 200)

# Shuffle randomly the new label column
np.random.seed(1)
np.random.shuffle(label1)

# Replace the label column with the new imbalanced labels
df_table.iloc[:, 150] = label1

# Optional: Shuffle the entire DataFrame to mix rows
df_imbalance = df_table.sample(frac=1, random_state=1).reset_index(drop=True)

print(df_imbalance.iloc[:, 150].value_counts())

150
0.0    300
1.0    200
Name: count, dtype: int64


In [151]:
# this prints label column
print(df_imbalance.iloc[:, 150])

0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
495    1.0
496    0.0
497    0.0
498    0.0
499    0.0
Name: label, Length: 500, dtype: category
Categories (2, float64): [0.0, 1.0]


In [147]:
#Start Timing the Execution: (do not change this)
start_time = time.time()

#Create a List cd with Specific Values and prints it 
cd = []
for i in range(0,150): #you need to input your number of columns 
    cd.append('c') 
print(cd)

# Optional: Assign a temporary name to the label column --> Only do this if your label column has no label
df_imbalance = df_imbalance.rename(columns={df_imbalance.columns[150]: 'label'})

# Now you can use the renamed column in your class_balance function
a_df = class_balance(df_imbalance, label_column = 'label', cd = cd, augment = False, preserve = True)

# Optional: Print the resulting DataFrame or label counts to verify
print(a_df['label'].value_counts())

#Print the Time Taken for Execution:
print("\n\n\n>>>>>>>>> %s seconds " % (time.time() - start_time))
a_df
     

['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c']
0
Existing data: 300
New data to be produced: 0
0 0

New data generated: 0 
New data: 300 

1
Existing data: 300
New data to be produced: 0
0 0

New data generated: 0 
New data: 300 

2
Existing data: 300
New data to be produced: 0
0 0

New data gen

  for label, df_label in data.groupby(label_column):
  df_[columnName] = histogram_sampler(list(columnData.values), diff, feat_type, preserve)
  df_[label_column] = label_name


64
--- 0.01460886001586914 seconds ---
Number of iterations ---> niter_count= 7
78
--- 0.04775500297546387 seconds ---
78 100
to discrete... 22
22 15

New data generated: 22 
New data: 300 

3
Existing data: 200
New data to be produced: 100
Number of iterations ---> niter_count= 1
9
--- 0.0006437301635742188 seconds ---
Number of iterations ---> niter_count= 2
22
--- 0.001138925552368164 seconds ---
Number of iterations ---> niter_count= 3
33
--- 0.0018787384033203125 seconds ---
Number of iterations ---> niter_count= 4
45
--- 0.0033736228942871094 seconds ---
Number of iterations ---> niter_count= 5
62
--- 0.00798177719116211 seconds ---
Number of iterations ---> niter_count= 6
75
--- 0.023119688034057617 seconds ---
75 100
to discrete... 25
25 20

New data generated: 25 
New data: 300 

4
Existing data: 200
New data to be produced: 100
Number of iterations ---> niter_count= 1
7
--- 0.00049591064453125 seconds ---
Number of iterations ---> niter_count= 2
22
--- 0.0009369850158691406 s

  df_[columnName] = histogram_sampler(list(columnData.values), diff, feat_type, preserve)
  df_[label_column] = label_name


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,label
0,3.000,3.0,5.0,6.000000,5.0,4.000000,6.000000,4.00,4.000000,3.000000,...,5.501061,3.892058,5.297278,5.155309,6.335189,4.155473,4.933170,4.016113,2.810130,0.0
1,7.000,7.0,6.0,7.000000,8.0,6.000000,6.000000,7.00,7.000000,8.000000,...,7.039630,8.541911,6.195086,5.984047,8.184872,5.740447,6.804127,6.048781,6.375799,0.0
2,5.000,5.0,5.0,5.000000,5.0,4.000000,5.000000,5.00,5.000000,5.000000,...,4.692735,5.187430,6.440273,4.813914,3.919370,4.839028,4.355320,5.503500,6.841564,0.0
3,5.000,6.0,5.0,6.000000,4.0,4.000000,4.000000,6.00,4.000000,6.000000,...,3.856139,3.669938,4.596415,7.273911,5.538512,4.787000,6.080976,5.796605,5.941577,0.0
4,6.000,7.0,7.0,8.000000,7.0,4.000000,7.000000,6.00,8.000000,7.000000,...,6.383598,7.466889,6.753669,7.764210,7.917233,6.439896,6.242661,5.777877,6.879609,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,6.200,6.0,7.0,5.022727,7.0,5.000000,7.003125,7.00,6.111111,8.000000,...,4.534509,4.788662,7.624376,5.678604,4.239313,5.316867,6.101478,4.952728,5.982063,1.0
296,5.125,5.0,4.1,5.000000,6.0,10.000000,6.150000,8.05,7.000000,4.055556,...,4.935469,5.745028,3.028524,4.133725,9.018997,5.666762,7.145435,5.233824,6.077800,1.0
297,6.000,5.0,7.0,5.000000,4.0,6.000000,4.050000,7.00,6.000000,7.013889,...,4.999024,5.747074,7.223611,5.444532,5.639558,6.766430,5.455218,7.163070,3.247736,1.0
298,6.000,3.0,7.0,6.000000,6.0,8.000000,5.000000,7.00,9.000000,4.000000,...,5.163669,7.033302,4.910232,3.759208,7.211305,4.167214,6.200902,6.174444,7.432445,1.0
