# HAT
- Histogram Augmentation Technique (HAT) is used widely to augment and classify any tabular data
- HAT is designed such that the generated data retains the distribution of the original tabular data histogram
- HAT analyses the data distribution of each feature and, depending on the feature type (continuous or discrete), generates new samples accordingly

# Import Libraries 

In [30]:
#IMPORT important libraries
import numpy as np
import pandas as pd
import time
from collections import Counter
from sklearn.model_selection import train_test_split
pd.DataFrame.iteritems = pd.DataFrame.items

# Load Dataset

In [33]:
# Read the stacked dataset. The file has no header row, so pandas numbers the columns.
data1 = pd.read_csv("/Users/fabianafazio/Documents/GitHub/BP24/Ellee/Data/Stacked/stacked_orig.csv", header=None)

# Treat the trailing nine columns as categorical.
data1 = data1.astype({column: 'category' for column in data1.columns[-9:]})

# Show the per-column dtypes so the conversion can be checked by eye.
print(data1.dtypes)

# The last column is the target; everything before it is a feature.
X, y = data1.iloc[:, :-1], data1.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Re-attach the training labels to the training features so both travel in one frame.
train_combined = pd.concat([X_train, y_train], axis=1)
train_combined

0      float64
1      float64
2      float64
3      float64
4      float64
5      float64
6      float64
7      float64
8      float64
9      float64
10     float64
11     float64
12     float64
13     float64
14     float64
15     float64
16    category
17    category
18    category
19    category
20    category
21    category
22    category
23    category
24    category
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
139,2.475630,4.509819,2.698743,0.735190,0.298927,0.733570,-0.401217,2.891989,0.398573,0.309395,...,0.430840,3.0,1.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0
60,0.071487,0.021725,0.513916,0.389062,1.745885,0.878669,1.122335,0.820908,0.205372,0.373203,...,0.235020,2.0,0.0,0.0,2.0,2.0,0.0,2.0,1.0,0.0
204,2.968174,0.157638,0.687278,-0.417489,-1.123093,2.150422,1.684241,1.521522,0.400247,0.165305,...,0.363965,0.0,0.0,2.0,1.0,1.0,1.0,2.0,0.0,1.0
186,2.371169,0.906434,2.989207,1.185898,1.895433,1.591490,0.015655,2.669418,0.261084,0.194195,...,0.362731,2.0,0.0,1.0,0.0,1.0,3.0,0.0,1.0,1.0
207,1.188074,-0.604477,1.692414,2.779292,5.630710,2.470652,2.391655,2.526611,0.435726,0.239847,...,0.202225,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,-1.902237,4.093622,1.455013,0.745927,1.959397,2.148542,0.823208,0.640636,0.208701,0.211725,...,0.418545,0.0,1.0,1.0,1.0,2.0,2.0,0.0,0.0,1.0
14,-0.041787,0.598329,0.457436,0.946191,0.489044,0.508848,0.633938,0.888166,0.017482,0.241606,...,0.149555,1.0,3.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0
92,0.217072,2.343969,2.855842,1.467715,3.655893,2.827516,1.309079,-0.802491,0.464559,0.449139,...,0.448680,2.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
179,2.021847,0.696105,2.746981,3.661601,1.925047,0.421424,-0.254105,1.765419,0.353653,0.198970,...,0.180303,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0


# HAT code

In [9]:
def _sample_continuous(data, no_new_data, preserve):
    """Generate ``no_new_data`` values for a continuous feature from its histogram.

    Each pass builds a histogram of the current pool (the original values plus
    everything accepted so far), divides the bin counts by the tallest bin and
    proposes two candidates for every bin except the last one: the bin centre
    and the point half a bin width above it. A candidate is accepted when
    ``abs(N(0, 0.5))`` does not exceed its weight (the bin's normalised count
    for the centre, the mean of the two neighbouring counts for the half-way
    point). The number of bins doubles after every pass, and passes repeat
    until at least 70% of the requested amount has been accepted.

    If enough values were accepted, exactly ``no_new_data`` of them are drawn
    without replacement; otherwise the shortfall is filled by the discrete
    sampler applied to the whole pool.
    """
    data = list(data)
    start_time = time.time()
    print("Existing data:", len(data))
    print("New data to be produced:", no_new_data)

    data_gen = data      # pool: original values followed by accepted candidates
    len_X_new = 0        # number of candidates accepted so far
    iter_count = 0
    n_bins = None

    while len_X_new < 0.7 * no_new_data:
        iter_count += 1
        print("Number of iterations ---> niter_count=", iter_count)

        # The first pass lets NumPy pick the bin count with Doane's rule;
        # later passes use the doubled count from the previous pass.
        if iter_count == 1:
            Y, X_interval = np.histogram(data_gen, bins='doane')
            n_bins = len(Y)
        else:
            Y, X_interval = np.histogram(data_gen, bins=n_bins)

        # Bin centres (rounded to 8 decimals) and counts normalised to [0, 1].
        bin_val = list(np.round((X_interval[:-1] + X_interval[1:]) / 2, 8))
        weight = list(Y / max(Y))

        X_new = []
        if len(bin_val) > 1:
            bin_width = (max(bin_val) - min(bin_val)) / (len(bin_val) - 1)
            # Pair every bin with its right-hand neighbour by position.
            # Looking the neighbour up by value breaks when rounding makes
            # two centres equal.
            for x1, y1, y2 in zip(bin_val, weight, weight[1:]):
                xm = x1 + bin_width / 2
                ym = (y1 + y2) / 2

                # Always draw for the half-way point first, then the centre.
                keep_mid = abs(np.random.normal(0, 0.5)) <= ym
                keep_centre = abs(np.random.normal(0, 0.5)) <= y1

                if keep_mid and ym != 0:
                    X_new.append(np.round(xm, 8))
                if keep_centre and y1 != 0:
                    X_new.append(np.round(x1, 8))

        data_gen = data_gen + X_new
        n_bins = n_bins * 2
        len_X_new += len(X_new)
        print(len_X_new)
        print("--- %s seconds ---" % (time.time() - start_time))

    produced = len(data_gen) - len(data)
    print(produced, no_new_data)

    if produced >= no_new_data:
        chosen = list(np.random.choice(data_gen[len(data):], no_new_data, replace=False))
        data_gen = data_gen[:len(data)] + chosen
        print('\nNew data generated:', len(data_gen[len(data):]), '\nNew data:', len(data_gen), '\n')
        if not preserve:
            data_gen = data_gen[len(data):]
        return data_gen

    # Not enough accepted candidates: top up by resampling the pool.
    print('to discrete...', no_new_data - produced)
    samples = _sample_discrete(data_gen, no_new_data - produced, True)
    if not preserve:
        samples = samples[len(data):]
    return samples


def _sample_discrete(data, no_new_data, preserve):
    """Generate ``no_new_data`` values for a discrete feature.

    Every distinct value receives a quota proportional to its frequency in
    ``data``. Rounding can leave the quotas short of the request; in that case
    the candidate pool is widened with the original values and some extra
    draws from it before the final ``no_new_data`` values are drawn without
    replacement.
    """
    data = list(data)
    disc_data = list(set(data))
    occurrences = Counter(data)
    X_new = [round(occurrences[value] * no_new_data / len(data)) for value in disc_data]

    data_gen = []
    for value, quota in zip(disc_data, X_new):
        data_gen.extend([value] * quota)

    if len(data_gen) == 0:
        data_gen = data + data_gen

    print(no_new_data, sum(X_new))
    if no_new_data > sum(X_new):
        data_gen = data + data_gen
        shortfall = int(no_new_data - sum(X_new))
        data_gen = list(data_gen + list(np.random.choice(data_gen, shortfall, replace=False)))

    new_values = list(np.random.choice(data_gen, int(no_new_data), replace=False))
    result = data + new_values if preserve else new_values

    # Count the new values directly so the report is also right when the
    # original values are not part of the result.
    print('\nNew data generated:', len(new_values), '\nNew data:', len(result), '\n')
    return result


def histogram_sampler(data, no_new_data, data_feat, preserve):
    """Generate new values for one feature so that its histogram shape is retained.

    Parameters
    ----------
    data : sequence
        Existing values of a single feature.
    no_new_data : int
        Number of new values to generate.
    data_feat : str
        ``'c'`` for a continuous feature, ``'d'`` for a discrete one.
    preserve : bool
        If true, the returned list starts with the original values followed by
        the new ones; otherwise only the new values are returned.

    Returns
    -------
    list or None
        The generated values (optionally preceded by ``data``). For any other
        ``data_feat`` the function prints ``NA`` and returns ``None``.
    """
    if data_feat == 'c':
        return _sample_continuous(data, no_new_data, preserve)
    if data_feat == 'd':
        return _sample_discrete(data, no_new_data, preserve)
    print('NA')
    return None


In [11]:
def label_split(df_class, diff, label_name , cd, label_column, preserve):
    """Augment every feature column of a single-class DataFrame.

    Parameters
    ----------
    df_class : pandas.DataFrame
        Rows belonging to one class, including the label column.
    diff : int
        Number of new values to generate for each feature column.
    label_name : scalar
        Label value written into the label column of the result.
    cd : sequence of str
        Feature types, ``'c'`` (continuous) or ``'d'`` (discrete), one per
        feature column in column order (the label column is not counted).
    label_column : hashable
        Name of the label column in ``df_class``.
    preserve : bool
        Forwarded to ``histogram_sampler``: if true the original values are
        kept in front of the generated ones.

    Returns
    -------
    pandas.DataFrame
        One column per feature holding the sampler output, plus
        ``label_column`` filled with ``label_name``.
    """
    # Drop the label on a copy so the caller's frame is left untouched.
    features = df_class.drop(columns=[label_column])

    df_ = pd.DataFrame(columns=[])
    for position, (columnName, columnData) in enumerate(features.items()):
        print(columnName)
        feat_type = cd[position]
        df_[columnName] = histogram_sampler(list(columnData.values), diff, feat_type, preserve)

    df_[label_column] = label_name
    print(df_)

    return df_




In [13]:
def class_balance(data, label_column, cd, augment = False, preserve = True):
    """Augment ``data`` class by class and return the combined result.

    Parameters
    ----------
    data : pandas.DataFrame
        Data set containing the feature columns and ``label_column``.
    label_column : hashable
        Name of the column holding the class labels.
    cd : sequence of str
        Feature types (``'c'`` or ``'d'``), one per feature column.
    augment : False, dict or int
        * ``False`` -- balance the classes: every class receives as many new
          rows as it is short of the largest class.
        * ``dict`` -- maps each label to the number of new rows for that class.
        * ``int`` -- total number of new rows, shared between the classes in
          proportion to their current sizes.
    preserve : bool
        If true the original rows are kept alongside the generated ones.

    Returns
    -------
    pandas.DataFrame
        The per-class results of ``label_split`` stacked on top of each other.

    Raises
    ------
    TypeError
        If ``augment`` is none of the supported kinds.
    ValueError
        If ``augment`` is a negative integer.
    """
    # Grouping by a categorical column can yield empty groups for categories
    # that do not occur; they carry no label value and are skipped.
    split_list = [df_label for _, df_label in data.groupby(label_column) if len(df_label) > 0]

    maxLength = max(len(x) for x in split_list)

    # `== False` rather than `is False`, so that 0 still selects balancing.
    balance = not isinstance(augment, dict) and augment == False
    new_count = None

    if balance:
        pass
    elif isinstance(augment, dict):
        new_count = augment
    elif isinstance(augment, int) and not isinstance(augment, bool):
        if augment < 0:
            raise ValueError("augment must not be negative")

        label_count = Counter(data[label_column])
        count_key = list(label_count.keys())
        total = sum(label_count.values())
        count_val = [round(count * augment / total) for count in label_count.values()]

        # Rounding may miss the requested total; nudge random classes by one
        # until the shares add up exactly.
        while sum(count_val) != augment:
            rand_indx = int(np.random.rand() * len(count_val))
            if sum(count_val) > augment:
                if count_val[rand_indx] > 0:
                    count_val[rand_indx] -= 1
            else:
                count_val[rand_indx] += 1

        new_count = dict(zip(count_key, count_val))
    else:
        raise TypeError("augment must be False, a dict of per-label counts, or an int")

    augmented_list = []
    for df_label in split_list:
        # Each group holds a single label value; read it before the frame is
        # handed on to label_split.
        label_name = df_label[label_column].iloc[0]
        if balance:
            diff = maxLength - len(df_label)
        else:
            diff = new_count[label_name]
        augmented_list.append(label_split(df_label, diff, label_name, cd, label_column, preserve))

    return pd.concat(augmented_list, axis=0)
     

# This is where you begin coding!
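- Below: Only change --> for i in range(0, 24):
- 24 is the number of feature columns in your dataset (every column except the label column). If your label column is not at index 24, update `label_column_index` as well.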

# Run HAT Augmentation

In [25]:
# Record the start time so the total run time can be reported at the end (do not change this).
start_time = time.time()

# Build cd, the per-feature type list passed on to class_balance: one entry per
# feature column, 'c' for continuous or 'd' for discrete. Every feature is
# tagged 'c' here, and the list is printed for inspection.
# NOTE(review): the loading cell casts the last nine columns to 'category', so
# eight of these features are categorical yet are sampled as continuous --
# confirm this is intended or mark them 'd'.
cd = []
for i in range(0, 24):  # set 24 to the number of feature columns (label excluded)
    cd.append('c')
print(cd)

# Look up the label column's name from its position in train_combined.
label_column_index = 24  # Index of the label column
label_column_name = train_combined.columns[label_column_index]

# Function to augment the data
def augment_data(a_df, label_column, cd, num_new_rows):
    """Return ``a_df`` with ``num_new_rows`` extra rows appended.

    The extra rows are drawn with replacement from the output of
    ``class_balance`` run in balancing mode. Because that call keeps the
    original rows (``preserve=True``), a drawn row may be a copy of an
    existing one rather than a generated one. The result gets a fresh
    integer index.
    """
    balanced_pool = class_balance(a_df, label_column, cd, augment=False, preserve=True)
    frames = [a_df, balanced_pool.sample(replace=True, n=num_new_rows)]
    return pd.concat(frames, ignore_index=True)

# Ask how many rows should be appended to the training data.
num_new_rows = int(input("Enter the number of new rows to augment: "))

# Build the augmented frame: the training rows plus the requested extra rows.
a_df = augment_data(
    train_combined,
    label_column_name,
    cd,
    num_new_rows,
)

# Report the time elapsed since start_time was recorded (this includes the wait for the prompt above).
elapsed_seconds = time.time() - start_time
print("\n\n\n>>>>>>>>> %s seconds " % elapsed_seconds)

# Show the augmented frame.
print("----------------------------- Augmented DataFrame -------------------------\n")
print(a_df)


['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c']


Enter the number of new rows to augment:  240


0
Existing data: 93
New data to be produced: 6
Number of iterations ---> niter_count= 1
14
--- 0.0036280155181884766 seconds ---
14 6

New data generated: 6 
New data: 99 

1
Existing data: 93
New data to be produced: 6
Number of iterations ---> niter_count= 1
10
--- 0.0008018016815185547 seconds ---
10 6

New data generated: 6 
New data: 99 

2
Existing data: 93
New data to be produced: 6
Number of iterations ---> niter_count= 1
9
--- 0.0006361007690429688 seconds ---
9 6

New data generated: 6 
New data: 99 

3
Existing data: 93
New data to be produced: 6
Number of iterations ---> niter_count= 1
8
--- 0.001238107681274414 seconds ---
8 6

New data generated: 6 
New data: 99 

4
Existing data: 93
New data to be produced: 6
Number of iterations ---> niter_count= 1
11
--- 0.0010156631469726562 seconds ---
11 6

New data generated: 6 
New data: 99 

5
Existing data: 93
New data to be produced: 6
Number of iterations ---> niter_count= 1
8
--- 0.0008032321929931641 seconds ---
8 6

New dat

  for label, df_label in data.groupby(label_column):
