# HAT
- Histogram Augmentation Technique (HAT) is used widely to augment and classify any tabular data
- HAT is designed such that the generated data retains the distribution of the original tabular data histogram
- HAT analyses the distribution of each feature and, depending on the feature type (continuous or discrete), generates new samples for that feature

# Import Libraries 

In [None]:
#IMPORT important libraries
import numpy as np
import pandas as pd
import time
from collections import Counter
from sklearn.model_selection import train_test_split
pd.DataFrame.iteritems = pd.DataFrame.items
import matplotlib.pyplot as plt

# Load Dataset

In [None]:
# Load the dataset from a text file with np.loadtxt.
# Exactly one file is active; the other bundled options are:
#   "uniform_small_d_1.tex"
#   "gaussian_small_d_1.tex"
#   "gaussian_large_d_1.tex"
data = np.loadtxt(fname="uniform_large_d_1.tex")

# Keep a separate NumPy array built from the loaded values
array = np.array(data)

# Wrap the array in a pandas DataFrame (columns get default integer labels)
df_table = pd.DataFrame(data=array)

# Preview the first rows of the table
df_table.head()

# Convert dataset to 'categorical' and 'numerical'

In [None]:
# From the dataset, change the first 25 columns to 'categorical'.
# For each column the loop rounds the values, casts them to int, then casts
# those ints to category, writing the result back after every step.
# NOTE(review): every step is written back with `df_table.iloc[:, i] = ...`.
# Depending on the pandas version, an iloc assignment may set the values in
# place and keep the column's existing dtype, in which case the columns would
# be rounded but would NOT end up as 'category' -- confirm with df_table.dtypes.
for i in range(25):
    df_table.iloc[:,i] = df_table.iloc[:,i].round()
    df_table.iloc[:,i] = df_table.iloc[:,i].astype(int)
    df_table.iloc[:,i] = df_table.iloc[:,i].astype("category")

# Column 150 (treated as the label column by the later cells) gets the same
# category cast; the NOTE above applies to this assignment as well.
df_table.iloc[:, 150] = df_table.iloc[:, 150].astype("category")

# Preview the first rows after the conversion
df_table.head()


# Split Dataset

In [None]:
# Split the dataset into a training set and a test set.
# test_size=0.2 holds out 20% of the rows for testing (80% training).
# random_state seeds the random number generator used for the split.
# Features are the columns at positions 0-149; the target is the last column.
X_train, X_test, y_train, y_test = train_test_split(
    df_table.iloc[:, 0:150],
    df_table.iloc[:, -1],
    random_state=52,
    test_size=0.2,
)

In [None]:
#save test data (not necessary)
#X_test.to_csv('X_test_XGB.csv', index=False)
#y_test.to_csv('y_test_XGB.csv', index=False)

# HAT code

In [None]:
#define histogram function
#data: values of ONE feature as a Python list (list concatenation and list.count are used on it)
#no_new_data: how many new values to generate
#data_feat: type of the feature, either 'c' for continuous data or 'd' for discrete data
#preserve: if True the returned list is the original values followed by the new ones; if False only the new values are returned
def histogram_sampler(data, no_new_data, data_feat, preserve):
    """Generate ``no_new_data`` new values for one feature from its histogram.

    Continuous mode (``data_feat == 'c'``): the values are histogrammed
    repeatedly. On every pass each bin centre (except the last) and the point
    half a bin further on are candidate values; a candidate is kept when a
    half-normal draw ``abs(N(0, 0.5))`` does not exceed its normalised
    weight. Kept candidates are appended to the working data, the number of
    bins is doubled, and the loop stops once at least 70% of ``no_new_data``
    candidates have been collected. If at least ``no_new_data`` candidates
    exist, exactly ``no_new_data`` of them are drawn without replacement;
    otherwise the shortfall is produced by a recursive call in discrete mode
    on the working data.

    Discrete mode (``data_feat == 'd'``): every distinct value is replicated
    in proportion to its frequency, the pool is topped up when rounding left
    it short, and ``no_new_data`` values are drawn from the pool without
    replacement.

    Returns a list. For any other ``data_feat`` the function prints ``'NA'``
    and returns ``None``. Randomness comes from the global ``np.random``
    state, and progress information is printed along the way.
    """

    if (data_feat == 'c'):
        start_time = time.time()
        print("Existing data:", len(data))
        print("New data to be produced:", no_new_data)

        # Working variables
        # X_new = values accepted during the current iteration
        # len_X_new = running total of accepted values over all iterations
        # iter_count = number of iterations so far
        # data_gen = working data (original values followed by accepted values)

        X_new = []
        len_X_new = len(X_new)
        iter_count = 0
        data_gen = data


        # Keep sampling until at least 70% of the requested amount has been
        # collected; any remainder is handled after the loop.
        while (len_X_new < 0.7 * no_new_data):
            iter_count += 1
            print("Number of iterations ---> niter_count=", iter_count)


            # Histogram: build the histogram, take the centre of every bin and
            # normalise the counts by the tallest bin.
            # The first pass lets NumPy pick the bin count with bins='doane';
            # later passes reuse n_bins, which is doubled after every pass.
            if (iter_count == 1):
                Y,X_interval=np.histogram(data_gen,bins='doane')
                n_bins = len(Y)
            else:
                Y,X_interval=np.histogram(data_gen,bins=n_bins)


            X = ((X_interval[0:-1] + X_interval[1:])/2)
            Y = Y/max(Y)

            # hist maps each (rounded) bin centre to its normalised weight
            bin_val = list(np.round(X,8))
            weight = list(Y)
            hist = dict(zip(bin_val,weight))

            # The last centre is skipped because it has no following bin
            for xi in bin_val[0:-1]:

                # Values: choose the two candidates for the validity check.
                # bin_width is the spacing between neighbouring centres
                # (the same for every xi); xm lies half a spacing above xi.
                bin_width = ((max(bin_val) - min(bin_val)) / int(len(bin_val)-1))
                xm = xi + (bin_width/2)
                x1 = xi
                y1 = hist[xi]

                # res = the key that follows xi in hist, i.e. the next centre
                res = None
                temp = iter(hist)
                for key in temp:
                    if(key == xi):
                        res = next(temp,None)

                # Weight at xm is the mean of the two neighbouring weights
                y2 = hist[res]
                ym = ((y1+y2)/2)
                # Validity check: a candidate survives when abs(N(0, 0.5)) is
                # not larger than its weight; multiplying by the boolean
                # zeroes the weight of a rejected candidate.
                # (An earlier uniform-draw variant, guarded by
                #  no_new_data <= len(data), was:
                #    ym = ym*(np.random.rand()<=ym)
                #    y1 = y1*(np.random.rand()<=y1) )
                ym = ym*(abs(np.random.normal(0,0.5))<=ym)
                y1 = y1*(abs(np.random.normal(0,0.5))<=y1)

                # Appending: keep the candidates whose weight is still non-zero


                if (ym!=0):
                    X_new.append(np.round(xm,8))
                    #X_new.append(np.round(xm+0.1*xm,8))
                    #X_new.append(np.round(xm-0.1*xm,8))
                if (y1!=0):
                    X_new.append(np.round(x1,8))
                    #X_new.append(np.round(x1+0.1*xm,8))
                    #X_new.append(np.round(x1-0.1*xm,8))

            # End of pass: fold the accepted values into the working data (so
            # the next histogram includes them), double the bin count, update
            # the running total and reset the per-pass list.
            data_gen = data_gen + X_new
            n_bins = n_bins*2
            len_X_new+= len(X_new)
            print(len_X_new)
            X_new = []
            print("--- %s seconds ---" % (time.time() - start_time))


        print(len(data_gen)-len(data),no_new_data)

        # Enough candidates: draw exactly no_new_data of them (no replacement)
        if(len(data_gen)-len(data) >= no_new_data):
            data_gen = data_gen[:len(data)] + list(np.random.choice(data_gen[len(data):], no_new_data, replace = False))
            print('\nNew data generated:', len(data_gen[len(data):]), '\nNew data:', len(data_gen), '\n')
            #sns.distplot(data_gen)
            if(preserve == False):
                data_gen = data_gen[len(data):]
            return data_gen

        # Too few candidates: keep them all and let the discrete sampler
        # produce the missing amount from the working data.
        else:
            print('to discrete...', no_new_data - (len(data_gen)-len(data)))
            samples = histogram_sampler(data_gen, no_new_data - (len(data_gen)-len(data)), 'd', preserve = True)
            if(preserve == False):
                samples = samples[len(data):]
            return samples

    elif(data_feat == 'd'):
        # X_new[j] = number of copies to make of disc_data[j]
        X_new=[]
        data_gen=[]
        disc_data= list(set(data))

        # Share no_new_data between the distinct values by relative frequency
        for i in disc_data:
            x=data.count(i)
            X_new.append(round(x*(no_new_data) / len(data)))
        #print(x_new,sum(x_new))

        # Build the pool with the computed number of copies per value
        for j in range(0,len(X_new)):
            for i in range(X_new[j]):
                data_gen.append(disc_data[j])

        # Every share rounded to zero: fall back to the original values
        if(len(data_gen)==0):
            data_gen = data + data_gen

        # Rounding left the pool short: add the original values to the pool
        # and top it up with a random draw from the enlarged pool.
        print(no_new_data, sum(X_new))
        if(no_new_data > sum(X_new)):
            data_gen = data + data_gen
            data_gen = list(data_gen + list(np.random.choice(data_gen,int(no_new_data-sum(X_new)),replace = False)))

        # Final draw of exactly no_new_data values from the pool
        data_gen = list(np.random.choice(data_gen,int(no_new_data),replace = False))

        if(preserve == True):
            data_gen = data + data_gen

        #sns.distplot(data_gen)
        # NOTE: when preserve is False, data_gen holds only the new values, so
        # the slice below no longer corresponds to "new data generated".
        print('\nNew data generated:', len(data_gen[len(data):]), '\nNew data:', len(data_gen), '\n')
        return data_gen

    # Unknown feature type: report it and fall through (returns None)
    else:
        print('NA')


In [None]:
#df_class: DataFrame holding the rows of ONE class (feature columns plus the label column)
#diff: number of new samples to generate for every feature column
#label_name: label value written into the label column of the result
#cd: sequence of feature types, one per feature column in column order ('c' continuous, 'd' discrete)
#label_column: name of the label column inside df_class
#preserve: passed on to histogram_sampler (True keeps the original values in front of the new ones)
def label_split(df_class, diff, label_name, cd, label_column, preserve):
    """Augment every feature column of one class and re-attach its label.

    Each feature column is passed to ``histogram_sampler`` together with its
    entry of ``cd``; the returned lists become the columns of a new DataFrame
    whose ``label_column`` is filled with ``label_name``.

    ``df_class`` is left untouched: the label column is dropped on a copy
    instead of being deleted from the caller's frame.

    Raises:
        KeyError: if ``label_column`` is not a column of ``df_class``.
        IndexError: if ``cd`` has fewer entries than there are feature columns.
        ValueError: if ``histogram_sampler`` returns no samples for a column
            (which is what it does for an unsupported feature type).
    """
    # drop() returns a new frame, so the caller's data keeps its label column
    features = df_class.drop(columns=[label_column])

    if len(cd) < features.shape[1]:
        raise IndexError(
            "cd has %d entries but df_class has %d feature columns"
            % (len(cd), features.shape[1])
        )

    # Collect the columns first and build the frame once; inserting columns
    # one at a time into a DataFrame fragments it for wide data sets.
    # DataFrame.items() is used rather than the removed .iteritems() alias.
    generated = {}
    for (column_name, column_data), feat_type in zip(features.items(), cd):
        print(column_name)
        samples = histogram_sampler(list(column_data.values), diff, feat_type, preserve)
        if samples is None:
            raise ValueError(
                "unsupported feature type %r for column %r" % (feat_type, column_name)
            )
        generated[column_name] = samples

    df_ = pd.DataFrame(generated)
    df_[label_column] = label_name
    print(df_)

    return df_



In [None]:
def _proportional_quotas(labels, augment):
    """Share ``augment`` new samples between the classes by class frequency."""
    label_count = Counter(labels)
    total = sum(label_count.values())
    quotas = [round(count * augment / total) for count in label_count.values()]

    # Rounding can leave the total slightly off; move single samples to or
    # from randomly chosen classes until the quotas add up exactly.
    while sum(quotas) != augment:
        rand_indx = int(np.random.rand() * len(quotas))
        if sum(quotas) > augment:
            if quotas[rand_indx] > 0:
                quotas[rand_indx] -= 1
        else:
            quotas[rand_indx] += 1

    return dict(zip(label_count.keys(), quotas))


def class_balance(data, label_column, cd, augment = False, preserve = True):
    """Augment every class of ``data`` and return the combined DataFrame.

    The rows are grouped by ``label_column`` and each group is handed to
    ``label_split`` with the number of samples to generate for that class.

    Args:
        data: DataFrame with the feature columns and ``label_column``.
        label_column: name of the column holding the class labels.
        cd: feature types, one per feature column ('c' or 'd'); forwarded to
            ``label_split``.
        augment: selects how many samples each class receives.
            * ``False`` (or ``0``): balance -- every class is topped up to the
              size of the largest class.
            * ``dict``: explicit ``{label: number_of_new_samples}``.
            * positive ``int``: total number of new samples, shared between
              the classes in proportion to their current sizes.
        preserve: forwarded to ``label_split``; True keeps the original rows.

    Returns:
        The per-class frames concatenated row-wise. The row index of each
        class frame is kept as is, so index values repeat across classes.

    Raises:
        ValueError: if ``data`` contains no labelled rows or ``augment`` is a
            negative integer.
        KeyError: if ``augment`` is a dict that lacks one of the labels.
        TypeError: if ``augment`` is none of the supported kinds.
    """
    groups = list(data.groupby(label_column))
    if not groups:
        raise ValueError("data contains no labelled rows to augment")

    if isinstance(augment, dict):
        quotas = {label: augment[label] for label, _ in groups}

    elif isinstance(augment, (bool, int, float, np.bool_, np.integer)) and augment == 0:
        # False and 0 both select balancing, as the original `augment == False` did
        largest = max(len(frame) for _, frame in groups)
        quotas = {label: largest - len(frame) for label, frame in groups}

    elif isinstance(augment, (int, np.integer)) and not isinstance(augment, bool):
        if augment < 0:
            raise ValueError("augment must not be negative, got %r" % (augment,))
        # Class frequencies come from `data` itself
        quotas = _proportional_quotas(data[label_column], augment)

    else:
        raise TypeError(
            "augment must be False, a dict of per-label counts or an int, got %r"
            % (augment,)
        )

    augmented_list = [
        label_split(frame, quotas[label], label, cd, label_column, preserve)
        for label, frame in groups
    ]

    # One concat over all class frames instead of growing an empty frame
    return pd.concat(augmented_list, axis=0)
     

# This is where you begin coding!

In [None]:
# Upload your dataset file before running this cell.
# The augmentation below is meant for an imbalanced label column; if your
# labels are balanced, the next cells create an imbalance for you.
# At the end the notebook shows the augmented table and the execution time.

# Show how many rows each class has (label column assumed at position 150)
print(df_table[df_table.columns[150]].value_counts())


# Imbalance Label Column

In [None]:
# df_table is expected to hold 500 rows with its label column at position 150.
# Build the replacement labels: 300 zeros followed by 200 ones.
label1 = np.repeat([0, 1], [300, 200])

# Shuffle the labels in place with a fixed seed so the result is reproducible
np.random.seed(1)
np.random.shuffle(label1)

# Overwrite the label column with the new, imbalanced labels
df_table.iloc[:, 150] = label1

# Optional: shuffle the rows of the whole table and renumber the index
df_imbalance = (
    df_table
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Class sizes after the change
print(df_imbalance.iloc[:, 150].value_counts())

#print(df_table.iloc[:, 150]) #this prints label column


# Imbalance Label column again

In [None]:
# Second imbalanced label vector: 200 zeros followed by 300 ones
label_column = np.repeat([0, 1], [200, 300])

# Same fixed seed as before, then shuffle in place
np.random.seed(1)
np.random.shuffle(label_column)

# Store the vector as an extra column named 'label_column'
# NOTE(review): the final cell only treats the column renamed to 'label' as
# the label, so this extra column appears to be passed to class_balance as a
# regular feature -- confirm that this is intended.
df_imbalance['label_column'] = label_column

# Class sizes of the new column
print(df_imbalance['label_column'].value_counts())

In [None]:
# Start timing the execution (do not change this)
start_time = time.time()

# Feature-type list handed to class_balance: one 'c' (continuous) entry per
# feature column. Adjust 151 to the number of feature columns in your data.
cd = ['c'] * 151
print(cd)

# Optional: give the column at position 150 the name 'label'
# --> only needed when your label column has no name of its own
label_renaming = {df_imbalance.columns[150]: 'label'}
df_imbalance = df_imbalance.rename(columns=label_renaming)

# Balance the classes using the renamed label column
a_df = class_balance(
    df_imbalance,
    label_column='label',
    cd=cd,
    augment=False,
    preserve=True,
)

# Optional: class sizes of the result, to verify the balancing
print(a_df['label'].value_counts())

# Report the elapsed time, then display the augmented table
print("\n\n\n>>>>>>>>> %s seconds " % (time.time() - start_time))
a_df
     