# HAT
- Histogram Augmentation Technique (HAT) is used widely to augment and classify any tabular data
- HAT is designed such that the generated data retains the distribution of the original tabular data histogram
- HAT analyses the data distribution of a particular feature and based on the feature type (i.e. continuous or discrete) it generates new samples

# Import Libraries 

In [117]:
#IMPORT important libraries
import numpy as np
import pandas as pd
import time
from collections import Counter
from sklearn.model_selection import train_test_split
pd.DataFrame.iteritems = pd.DataFrame.items
import matplotlib.pyplot as plt

# Load Dataset

In [120]:
#load dataset
#data = np.loadtxt("uniform_small_d_1.tex")
data = np.loadtxt("uniform_large_d_1.tex")
#data = np.loadtxt("gaussian_small_d_1.tex")
#data = np.loadtxt("gaussian_large_d_1.tex")

# Creating NumPy array
array = np.array(data)

# Converting to Pandas DataFrame
df_table = pd.DataFrame(array)

# Displaying the table
df_table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,0.604431,0.724054,1.295053,0.495865,0.607451,0.547615,0.56415,0.46888,0.420394,0.910129,...,0.660629,1.325968,1.282151,0.60055,0.592177,0.776711,1.085891,1.153748,1.352572,1.0
1,1.180423,1.391002,1.184481,0.583052,1.21049,0.923676,1.185203,1.369972,1.201448,0.614857,...,0.892705,0.848612,1.298801,1.250497,0.547771,1.215082,0.940952,1.109552,1.181372,1.0
2,1.067779,0.718696,0.798901,1.369462,0.470935,0.566282,1.398846,1.015372,0.801271,1.33027,...,1.339399,0.417466,0.496915,0.661756,0.875185,1.293924,0.750581,0.742218,0.993983,1.0
3,0.368247,0.730771,0.134119,0.984532,0.397524,0.470181,0.025061,0.648142,0.016333,0.973801,...,0.086188,0.394613,0.252668,0.808593,0.587922,0.827502,0.862651,0.684517,0.149873,0.0
4,0.91976,0.577797,0.441661,0.862139,0.263016,0.393494,0.635624,0.657747,0.78192,0.56691,...,0.816635,0.31988,0.770176,0.919029,0.265299,0.983398,0.956898,0.175083,0.170124,0.0


# Convert dataset to 'categorical' and 'numerical'

In [123]:
# From the dataset, change 25 columns to 'categorical'
#Loop, converts floats to ints and then those ints to category
for i in range(25):
    df_table.iloc[:,i] = df_table.iloc[:,i].round()
    df_table.iloc[:,i] = df_table.iloc[:,i].astype(int)


# Split Dataset

In [126]:
#split dataset into training set and test set
#test_size: in this case it is 70% training and 30% testing
#random_state: sets a seed for a random number generator that splits the data
X_train, X_test, y_train, y_test = train_test_split(df_table.iloc[:,0:150], df_table.iloc[:,-1], test_size=0.2, random_state=52)

# Combine X_train and y_train into one dataframe

In [129]:
# Combining X_train and y_train into one DataFrame
train_combined = pd.concat([X_train, y_train], axis=1)
train_combined.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
159,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.58581,0.76664,0.340473,0.350218,0.274646,0.093124,0.0013,0.899399,0.368022,0.0
198,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.600071,1.139251,0.82694,1.272623,1.018323,0.698481,1.365619,0.460799,1.089924,1.0
259,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,...,0.488724,1.184376,1.05623,0.478523,0.910375,0.718602,0.753928,0.936283,0.423786,1.0
301,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.528018,0.747393,1.137783,0.521072,1.136169,1.091905,0.533929,1.024574,0.561742,1.0
220,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.271489,0.905137,0.619907,0.835196,0.589189,0.873329,0.891913,0.651581,0.984017,0.0


# HAT code

In [46]:
#define histogram function
#data: This is the original dataset that you want to use for generating new data
#no_new_data: This parameter indicates how much new data you want to generate
#data_feat: This parameter specifies the type of data feature. It can be either 'c' for continuous data or 'd' for discrete data.
#preserve: This parameter determines whether to preserve the original dataset or not while generating new data. If set to True, the original dataset will be included in the generated data; otherwise, it won't be included.
def histogram_sampler(data, no_new_data, data_feat, preserve):

    if (data_feat == 'c'):
        start_time = time.time()
        print("Existing data:", len(data))
        print("New data to be produced:", no_new_data)

#Function parameters
#X_new = New data for each iteration
#len_X_new = length of the newly generated data
#iter_count = number of iterations
#data_gen = Augmented data (original data + newly generated data)
        
        X_new = []
        len_X_new = len(X_new)
        iter_count = 0
        data_gen = data


         # Histogram calculation and sampling logic goes here...
        # Adjust the condition for the while loop
        while (len_X_new < 0.7 * no_new_data):
            iter_count += 1
            print("Number of iterations ---> niter_count=", iter_count)

            
#Histogram: Generating the histogram , choosing the mid-value of the bins, and normalizing frequency
#fd- Freedman–Diaconis rule is employed to choose the bin size, as it depends on the spread of the data, without any presumption
            if (iter_count == 1):
                Y,X_interval=np.histogram(data_gen,bins='doane')
                n_bins = len(Y)
            else:
                Y,X_interval=np.histogram(data_gen,bins=n_bins)

            
            X = ((X_interval[0:-1] + X_interval[1:])/2) 
            Y = Y/max(Y)

            bin_val = list(np.round(X,8))
            weight = list(Y)
            hist = dict(zip(bin_val,weight))

            for xi in bin_val[0:-1]:

#Values: choosing the values for undergoing validity check

                bin_width = ((max(bin_val) - min(bin_val)) / int(len(bin_val)-1))
                xm = xi + (bin_width/2)
                x1 = xi
                y1 = hist[xi]

                res = None
                temp = iter(hist)
                for key in temp:
                    if(key == xi):
                        res = next(temp,None)

                y2 = hist[res]
                ym = ((y1+y2)/2)
#Validity check: checking if the specified value can be considered
#if(no_new_data <= len(data)):
#ym = ym*(np.random.rand()<=ym)
#y1 = y1*(np.random.rand()<=y1)
    
                #else:
                ym = ym*(abs(np.random.normal(0,0.5))<=ym)
                y1 = y1*(abs(np.random.normal(0,0.5))<=y1)

#Appending: appending the valid values
                

                if (ym!=0):
                    X_new.append(np.round(xm,8))
                    #X_new.append(np.round(xm+0.1*xm,8))
                    #X_new.append(np.round(xm-0.1*xm,8))
                if (y1!=0):
                    X_new.append(np.round(x1,8))
                    #X_new.append(np.round(x1+0.1*xm,8))
                    #X_new.append(np.round(x1-0.1*xm,8))
#Stopping: bins * 2, length check

            data_gen = data_gen + X_new
            n_bins = n_bins*2     
            len_X_new+= len(X_new)
            print(len_X_new)
            X_new = []
            print("--- %s seconds ---" % (time.time() - start_time))


        print(len(data_gen)-len(data),no_new_data)

        if(len(data_gen)-len(data) >= no_new_data):
            data_gen = data_gen[:len(data)] + list(np.random.choice(data_gen[len(data):], no_new_data, replace = False))
            print('\nNew data generated:', len(data_gen[len(data):]), '\nNew data:', len(data_gen), '\n')
            #sns.distplot(data_gen)
            if(preserve == False):
                data_gen = data_gen[len(data):]
            return data_gen

        else:
            print('to discrete...', no_new_data - (len(data_gen)-len(data)))
            samples = histogram_sampler(data_gen, no_new_data - (len(data_gen)-len(data)), 'd', preserve = True)
            if(preserve == False):
                samples = samples[len(data):]
            return samples
            
    elif(data_feat == 'd'):
        X_new=[]
        data_gen=[]
        disc_data= list(set(data))

        for i in disc_data: 
            x=data.count(i)
            X_new.append(round(x*(no_new_data) / len(data)))
        #print(x_new,sum(x_new))

        for j in range(0,len(X_new)):
            for i in range(X_new[j]):
                data_gen.append(disc_data[j])

        if(len(data_gen)==0):
            data_gen = data + data_gen

        print(no_new_data, sum(X_new))            
        if(no_new_data > sum(X_new)):
            data_gen = data + data_gen
            data_gen = list(data_gen + list(np.random.choice(data_gen,int(no_new_data-sum(X_new)),replace = False)))    

        data_gen = list(np.random.choice(data_gen,int(no_new_data),replace = False))   
        
        if(preserve == True):
            data_gen = data + data_gen
    
        #sns.distplot(data_gen)     
        print('\nNew data generated:', len(data_gen[len(data):]), '\nNew data:', len(data_gen), '\n') 
        return data_gen
        
    else:
        print('NA')


In [48]:
#df_class : This parameter represents the DataFrame containing the data.
#diff :This parameter specifies the difference or ratio between the number of samples in the smallest and largest classes after splitting
#label_name: It's the column in your DataFrame that you want to use for splitting the data.
#cd: It can take values 'c' for continuous classes or 'd' for discrete classes.
#label_column: It helps the function identify which column contains the labels or classes.
#preserve: If set to `True`, the original dataset will be included in the split data; otherwise, it won't be included.
def label_split(df_class, diff, label_name , cd, label_column, preserve):
    cdi = 0
    df_ = pd.DataFrame(columns=[])
    del df_class[label_column]
    for (columnName, columnData) in df_class.iteritems(): 
        print(columnName)
        feat_type = cd[cdi]
        df_[columnName] = histogram_sampler(list(columnData.values), diff, feat_type, preserve)
        cdi+=1
    df_[label_column] = label_name
    print(df_)

    return df_



In [50]:
def class_balance(data, label_column, cd, augment = False, preserve = True):
    split_list=[]
    #label_column = 'species'
    for label, df_label in data.groupby(label_column):
        split_list.append(df_label)

    maxLength = max(len(x) for x in split_list)

    if(augment == False):
        augmented_list=[]
        for i in range(0,len(split_list)):

            label_name = list(set(split_list[i][label_column]))[0]
            diff = maxLength - len(split_list[i])
            augmented_list.append(label_split(split_list[i], diff, label_name, cd, label_column, preserve))
        finaldf = pd.DataFrame(columns=[])

    elif(type(augment) == dict):
        augmented_list=[]
        for i in range(0,len(split_list)):

            label_name = list(set(split_list[i][label_column]))[0]
            diff = augment[label_name]
            augmented_list.append(label_split(split_list[i], diff, label_name, cd, label_column, preserve))
        finaldf = pd.DataFrame(columns=[])

    elif(type(augment) == int):

        label_count = dict(Counter(list(df[label_column])))
        count_key = list(label_count.keys())
        count_val = list(label_count.values())

        for i in range(0,len(count_val)):
            count_val[i] = round(count_val[i] * augment  /sum(list(label_count.values())))


        while(sum(count_val) != augment):
            if(sum(count_val) > augment):
                rand_indx = int(np.random.rand() * len(count_val))
                if(count_val[rand_indx] > 0):
                    count_val[rand_indx]-= 1

            else:
                rand_indx = int(np.random.rand() * len(count_val))
                count_val[rand_indx]+= 1

        new_count = dict(zip(count_key, count_val))

        augmented_list=[]
        for i in range(0,len(split_list)):

            label_name = list(set(split_list[i][label_column]))[0]
            diff = new_count[label_name]
            augmented_list.append(label_split(split_list[i], diff, label_name, cd, label_column, preserve))
        finaldf = pd.DataFrame(columns=[])


        #finaldf = class_balance(data, label_column, cd, augment = new_count, preserve = preserve)


    for i in range(0,len(split_list)):
        finaldf = pd.concat([finaldf,augmented_list[i]],axis=0)

    return finaldf
     

# This is where you begin coding!

In [131]:
#If the dataset is not imbalanced, you MUST make it
# Check if label column is imbalanced assuming label column has an index of 150 (this is the y_train)
print(train_combined.iloc[:, 150].value_counts())


150
0.0    200
1.0    200
Name: count, dtype: int64


# Imbalance Label Column

In [134]:
# Assuming df_table is your DataFrame and the label column is at index 150
# Create a new label column with 300 zeros and 200 ones
label1 = np.array([0] * 250 + [1] * 150)

# Shuffle randomly the new label column
np.random.seed(1)
np.random.shuffle(label1)

# Replace the label column with the new imbalanced labels
train_combined['label_0_majority'] = label1
print(train_combined['label_0_majority'].value_counts())

# checks column labels. Categorical are 'int', numerical are 'floats'
print(train_combined['label_0_majority'].dtype)

train_combined.head()

label_0_majority
0    250
1    150
Name: count, dtype: int64
int64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,142,143,144,145,146,147,148,149,150,label_0_majority
159,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.76664,0.340473,0.350218,0.274646,0.093124,0.0013,0.899399,0.368022,0.0,1
198,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.139251,0.82694,1.272623,1.018323,0.698481,1.365619,0.460799,1.089924,1.0,0
259,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,...,1.184376,1.05623,0.478523,0.910375,0.718602,0.753928,0.936283,0.423786,1.0,1
301,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.747393,1.137783,0.521072,1.136169,1.091905,0.533929,1.024574,0.561742,1.0,1
220,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.905137,0.619907,0.835196,0.589189,0.873329,0.891913,0.651581,0.984017,0.0,0


# Imbalance Label column again

In [137]:
# Create a second imbalanced label column with 200 zeros and 300 ones
label2 = np.array([0] * 150 + [1] * 250)

# Shuffle randomly the new label column
np.random.seed(1)
np.random.shuffle(label2)

# Add the new label column to the DataFrame
train_combined['label_1_majority'] = label2

print(train_combined['label_1_majority'].value_counts())

label_1_majority
1    250
0    150
Name: count, dtype: int64


In [139]:
# drop the y_train column and create 2 new dataframes for label_0_majority and label_1_majority
train_combined = train_combined.drop(columns=[150])
train_combined.head()

# dataframe for 0 label column
df_train_0 = train_combined.iloc[:,0:151]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,142,143,144,145,146,147,148,149,label_0_majority,label_1_majority
159,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.76664,0.340473,0.350218,0.274646,0.093124,0.0013,0.899399,0.368022,1,1
198,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.139251,0.82694,1.272623,1.018323,0.698481,1.365619,0.460799,1.089924,0,0
259,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,...,1.184376,1.05623,0.478523,0.910375,0.718602,0.753928,0.936283,0.423786,1,1
301,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.747393,1.137783,0.521072,1.136169,1.091905,0.533929,1.024574,0.561742,1,1
220,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.905137,0.619907,0.835196,0.589189,0.873329,0.891913,0.651581,0.984017,0,1


In [None]:
# Drop the column at index 150
df_train1 = train_combined.drop(train_combined.columns[150], axis=1)
df_train1.head()

In [None]:
#Start Timing the Execution: (do not change this)
start_time = time.time()

#Create a List cd with Specific Values and prints it 
cd = []
for i in range(0,150): #you need to input your number of columns 
    cd.append('c') 
print(cd)

# Now you can use the renamed column in your class_balance function
a_df = class_balance(df_table_dropped1, label_column = 'label1', cd = cd, augment = False, preserve = True)


#Print the Time Taken for Execution:
print("\n\n\n>>>>>>>>> %s seconds " % (time.time() - start_time))

# Print new augmented dataframe
print("----------------------------- Augmented DataFrame -------------------------\n")
a_df

# keep the train combined for 500 rows and 150 columns and then add 100 new rows from label1 and 100 new rows from label2
# make this into a new csv file called testing with 700 rows  

['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c']
0
Existing data: 300
New data to be produced: 0
0 0

New data generated: 0 
New data: 300 

1
Existing data: 300
New data to be produced: 0
0 0

New data generated: 0 
New data: 300 

2
Existing data: 300
New data to be produced: 0
0 0

New data gen

  df_[columnName] = histogram_sampler(list(columnData.values), diff, feat_type, preserve)
  df_[label_column] = label_name


16
--- 0.21843981742858887 seconds ---
Number of iterations ---> niter_count= 9
18
--- 0.8281509876251221 seconds ---
Number of iterations ---> niter_count= 10
20
--- 3.1219677925109863 seconds ---
Number of iterations ---> niter_count= 11
22
--- 12.72246789932251 seconds ---
Number of iterations ---> niter_count= 12
24
--- 50.0920889377594 seconds ---
Number of iterations ---> niter_count= 13
27
--- 201.2232527732849 seconds ---
Number of iterations ---> niter_count= 14


In [None]:
#save training data (not necessary)
#X_test.to_csv('X_test_XGB.csv', index=False)
#y_test.to_csv('y_test_XGB.csv', index=False)