# HAT
- Histogram Augmentation Technique (HAT) is used widely to augment and classify any tabular data
- HAT is designed such that the generated data retains the distribution of the original tabular data histogram
- HAT analyses the data distribution of each feature and, depending on the feature type (continuous or discrete), generates new samples for it

# Import Libraries 

In [2]:
#IMPORT important libraries
import numpy as np
import pandas as pd
import time
from collections import Counter
from sklearn.model_selection import train_test_split
pd.DataFrame.iteritems = pd.DataFrame.items

# Load dataset: STACKED DISTRIBUTION

In [None]:
# Read the stacked-distribution dataset; the file has no header row, so the
# columns are labelled by position.
data1 = pd.read_csv("/Users/fabianafazio/Documents/GitHub/BP24/Ellee/Data/Stacked/stacked_orig.csv", header=None)

# The last nine columns (indices 16-24, label column included) are categorical.
data1 = data1.astype({column: 'category' for column in data1.columns[-9:]})

# Check the resulting dtypes
print(data1.dtypes)

# Features are every column but the last; the last column is the label.
X, y = data1.iloc[:, :-1], data1.iloc[:, -1]

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-attach the training labels to the training features
train_combined = pd.concat([X_train, y_train], axis=1)
train_combined

# Load dataset: GAUSSIAN DISTRIBUTION

In [None]:
# Read the Gaussian-distribution dataset; the file has no header row, so the
# columns are labelled by position.
data1 = pd.read_csv("/Users/fabianafazio/Documents/GitHub/BP24/Ellee/Data/Gaussian/gaussian_orig.csv", header=None)

# Positional indices of the columns to convert to categorical: 2, 3, 7, 9 and 12
# (12 is presumably the label column, as in the other datasets - TODO confirm).
categorical_columns = [2, 3, 7, 9, 12]

# Convert with DataFrame.astype instead of assigning into data1.iloc[:, i]:
# recent pandas tries to write .iloc column assignments into the existing
# float64 column and only changes the dtype through a deprecated fallback.
data1 = data1.astype({data1.columns[i]: 'category' for i in categorical_columns})

# Display the data types of the columns to verify the changes
print(data1.dtypes)

# Features are every column but the last; the last column is the label.
X = data1.iloc[:, :-1]
y = data1.iloc[:, -1]


# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Combining X_train and y_train into one DataFrame
train_combined = pd.concat([X_train, y_train], axis=1)
train_combined

# Load dataset: UNIFORM DISTRIBUTION

In [4]:
# Read the uniform-distribution dataset; the file has no header row, so the
# columns are labelled by position.
data1 = pd.read_csv("/Users/fabianafazio/Documents/GitHub/BP24/Ellee/Data/Uniform/uniform_orig.csv", header=None)

# Positional indices of the columns that should be converted to categorical
# (24 is the last column, which is used as the label below).
categorical_columns = [2, 7, 10, 15, 24]

# Convert with DataFrame.astype instead of assigning into data1.iloc[:, i]:
# recent pandas tries to write .iloc column assignments into the existing
# float64 column and only changes the dtype through a deprecated fallback.
data1 = data1.astype({data1.columns[i]: 'category' for i in categorical_columns})

# Display the data types of the columns to verify the changes
print(data1.dtypes)

# Features are every column but the last; the last column is the label.
X = data1.iloc[:, :-1]
y = data1.iloc[:, -1]


# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Combining X_train and y_train into one DataFrame
train_combined = pd.concat([X_train, y_train], axis=1)
train_combined

0      float64
1      float64
2     category
3      float64
4      float64
5      float64
6      float64
7     category
8      float64
9      float64
10    category
11     float64
12     float64
13     float64
14     float64
15    category
16     float64
17     float64
18     float64
19     float64
20     float64
21     float64
22     float64
23     float64
24    category
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
114,0.777194,0.979046,0.0,0.972416,1.163564,1.116164,0.925770,1.0,0.265845,0.809923,...,1.0,0.206132,0.928457,0.497371,0.479986,0.666670,0.946705,1.098410,0.758760,1.0
7,0.387182,0.047621,1.0,0.359007,0.045622,0.118648,0.612987,1.0,0.586259,0.856810,...,1.0,0.797418,0.867424,0.436008,0.648203,0.262512,0.812762,0.339305,0.339485,0.0
137,0.522665,0.325059,0.0,0.720903,0.886621,0.538945,0.919042,0.0,0.382281,0.064168,...,0.0,0.152838,0.262323,0.906625,0.306314,0.678287,0.785663,0.402582,0.297149,0.0
331,0.434863,0.397570,1.0,0.320027,0.254633,0.978688,0.758991,1.0,0.656134,0.058100,...,0.0,0.966117,0.444518,0.321084,0.220322,0.224088,0.701610,0.118394,0.837120,0.0
304,0.277995,0.796485,0.0,0.853256,0.531138,0.721808,0.686932,1.0,0.887305,0.054893,...,1.0,0.676859,0.765212,0.993014,0.586485,0.743334,0.180314,0.390712,0.262127,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,1.116368,1.174533,0.0,1.093417,1.138848,1.189892,1.161549,0.0,0.724864,0.487290,...,1.0,0.497882,0.484487,1.160946,1.037247,1.193710,1.063586,0.617590,0.206816,1.0
71,0.100461,0.897078,0.0,0.641122,0.048721,0.482094,0.558388,1.0,0.959848,0.572503,...,0.0,0.261237,0.636649,0.661237,0.506090,0.107249,0.282784,0.842817,0.899592,0.0
106,0.991253,0.441954,1.0,0.587918,0.648145,0.687545,0.679813,0.0,0.466455,0.926841,...,0.0,0.446260,0.611192,0.401744,1.032118,1.148253,0.521070,0.371581,0.910811,1.0
270,0.530878,0.771605,0.0,0.118251,0.196508,0.859618,0.486844,0.0,0.241833,0.376559,...,0.0,0.418512,0.042595,0.440660,0.669904,0.597713,0.522333,0.824154,0.712669,0.0


# HAT code

In [10]:
def _histogram_candidates(values, bins):
    """Run one histogram pass over *values* and draw candidate samples from it.

    For every bin except the last, two candidate values are considered: the
    bin centre and the point half a bin width to its right.  Each candidate is
    kept when a half-normal draw ``abs(N(0, 0.5))`` does not exceed its
    normalised histogram weight, so well-populated regions are sampled more.

    Args:
        values: list of numeric values to build the histogram from.
        bins: anything accepted by ``np.histogram(bins=...)``.

    Returns:
        ``(candidates, n_bins)`` - the accepted values (rounded to 8 decimals)
        and the number of bins the histogram actually used.
    """
    counts, edges = np.histogram(values, bins=bins)
    centres = np.round((edges[:-1] + edges[1:]) / 2, 8)
    weights = counts / max(counts)  # normalise so the tallest bin weighs 1

    candidates = []
    if len(centres) < 2:
        # A single bin has no right-hand neighbour to interpolate with.
        return candidates, len(counts)

    # The spacing of the bin centres is constant, so compute it once.
    half_width = (centres[-1] - centres[0]) / (len(centres) - 1) / 2

    for left, w_left, w_right in zip(centres[:-1], weights[:-1], weights[1:]):
        middle = left + half_width
        w_middle = (w_left + w_right) / 2

        # Validity check: midpoint first, then the bin centre.
        accept_middle = abs(np.random.normal(0, 0.5)) <= w_middle
        accept_left = abs(np.random.normal(0, 0.5)) <= w_left

        if accept_middle and w_middle != 0:
            candidates.append(np.round(middle, 8))
        if accept_left and w_left != 0:
            candidates.append(np.round(left, 8))

    return candidates, len(counts)


def _sample_discrete(data, no_new_data, preserve):
    """Generate *no_new_data* values that follow the value frequencies in *data*.

    Each distinct value receives a quota proportional to its frequency.  When
    rounding leaves the quotas short, the pool is topped up with the original
    data plus a random draw from it before the final draw without replacement.
    """
    levels = list(set(data))
    frequency = Counter(data)
    quotas = [round(frequency[level] * no_new_data / len(data)) for level in levels]

    pool = [level for level, quota in zip(levels, quotas) for _ in range(quota)]
    if not pool:
        pool = data + pool

    print(no_new_data, sum(quotas))
    if no_new_data > sum(quotas):
        pool = data + pool
        top_up = np.random.choice(pool, int(no_new_data - sum(quotas)), replace=False)
        pool = pool + list(top_up)

    picked = list(np.random.choice(pool, int(no_new_data), replace=False))
    result = data + picked if preserve else picked

    # Report the number of values actually generated, also when preserve is False.
    print('\nNew data generated:', len(picked), '\nNew data:', len(result), '\n')
    return result


def _sample_continuous(data, no_new_data, preserve):
    """Generate *no_new_data* values from repeated histograms of *data*.

    The first histogram picks its bin count with numpy's 'doane' estimator;
    every further pass doubles the bin count and includes the values generated
    so far.  Passes continue until at least 70% of the requested amount has
    been produced; any remaining shortfall is filled by the discrete sampler.
    """
    if not data and no_new_data > 0:
        # With nothing to histogram no candidate is ever accepted and the bin
        # count would double forever.
        raise ValueError("cannot generate new samples from empty data")

    start_time = time.time()
    print("Existing data:", len(data))
    print("New data to be produced:", no_new_data)

    data_gen = data
    generated = 0
    iter_count = 0
    bins = 'doane'

    while generated < 0.7 * no_new_data:
        iter_count += 1
        print("Number of iterations ---> niter_count=", iter_count)

        batch, n_bins = _histogram_candidates(data_gen, bins)

        data_gen = data_gen + batch
        bins = n_bins * 2  # refine the histogram on the next pass
        generated += len(batch)
        print(generated)
        print("--- %s seconds ---" % (time.time() - start_time))

    n_generated = len(data_gen) - len(data)
    print(n_generated, no_new_data)

    if n_generated >= no_new_data:
        # Enough candidates: keep a random subset of exactly the requested size.
        chosen = list(np.random.choice(data_gen[len(data):], no_new_data, replace=False))
        print('\nNew data generated:', len(chosen), '\nNew data:', len(data) + len(chosen), '\n')
        return data + chosen if preserve else chosen

    # Not enough candidates: fill the gap by resampling the values gathered so far.
    print('to discrete...', no_new_data - n_generated)
    samples = _sample_discrete(data_gen, no_new_data - n_generated, True)
    return samples if preserve else samples[len(data):]


def histogram_sampler(data, no_new_data, data_feat, preserve):
    """Generate new values for one feature from the histogram of its data.

    Args:
        data: the original values of the feature.
        no_new_data: how many new values to generate.
        data_feat: 'c' for a continuous feature, 'd' for a discrete feature.
        preserve: if True the returned list is the original values followed by
            the new ones; if False only the new values are returned.

    Returns:
        A list of values, or None (after printing 'NA') when *data_feat* is
        neither 'c' nor 'd'.

    Raises:
        ValueError: if new continuous values are requested from empty data.
    """
    data = list(data)

    if data_feat == 'c':
        return _sample_continuous(data, no_new_data, preserve)
    if data_feat == 'd':
        return _sample_discrete(data, no_new_data, preserve)

    print('NA')
    return None


In [12]:
def label_split(df_class, diff, label_name, cd, label_column, preserve):
    """Augment every feature column of one class and re-attach its label.

    Args:
        df_class: DataFrame holding the rows of a single class, label column
            included.  It is not modified.
        diff: number of new values to generate for each feature column.
        label_name: label value written into the label column of the result.
        cd: per-feature type codes, in column order: 'c' for continuous and
            'd' for discrete.  It needs one entry per feature column.
        label_column: name of the label column in *df_class*.
        preserve: if True the original values are kept in front of the
            generated ones; otherwise only generated values are returned.

    Returns:
        A new DataFrame with the sampled feature columns plus the label column.
    """
    # Drop the label on a copy so the caller's frame keeps its label column.
    features = df_class.drop(columns=[label_column])

    df_ = pd.DataFrame(columns=[])
    for position, (column_name, column_data) in enumerate(features.items()):
        print(column_name)
        df_[column_name] = histogram_sampler(
            list(column_data.values), diff, cd[position], preserve
        )

    df_[label_column] = label_name
    print(df_)

    return df_




In [14]:
def _proportional_quotas(labels, augment):
    """Split *augment* new rows across labels in proportion to their counts.

    Rounded shares rarely add up exactly, so single rows are added to or
    removed from randomly chosen labels until the total equals *augment*.
    """
    label_count = Counter(labels)
    total = sum(label_count.values())
    keys = list(label_count)
    quotas = [round(label_count[key] * augment / total) for key in keys]

    while sum(quotas) != augment:
        position = int(np.random.rand() * len(quotas))
        if sum(quotas) > augment:
            if quotas[position] > 0:
                quotas[position] -= 1
        else:
            quotas[position] += 1

    return dict(zip(keys, quotas))


def class_balance(data, label_column, cd, augment = False, preserve = True):
    """Augment *data* class by class with label_split.

    Args:
        data: DataFrame containing the feature columns and the label column.
        label_column: name of the label column.
        cd: per-feature type codes ('c' or 'd') passed on to label_split.
        augment: how many rows to generate per class:
            * False (or 0): bring every class up to the size of the largest;
            * dict: maps each label to its number of new rows;
            * int: total number of new rows, shared between the classes in
              proportion to their current sizes.
        preserve: if True the original rows' values are kept alongside the
            generated ones; otherwise only generated values are returned.

    Returns:
        A DataFrame with the per-class results stacked on top of each other.

    Raises:
        TypeError: if *augment* is not False, a dict or an int.
        ValueError: if *augment* is a negative int, or *data* has no rows.
    """
    # observed=True skips unused categories of a categorical label (which
    # would otherwise produce empty groups).
    split_list = [
        df_label for _, df_label in data.groupby(label_column, observed=True)
    ]
    # Read the labels up front, before the groups are handed to label_split.
    label_names = [df_label[label_column].iloc[0] for df_label in split_list]

    maxLength = max(len(df_label) for df_label in split_list)

    if isinstance(augment, dict):
        diffs = [augment[label_name] for label_name in label_names]
    elif augment is False or augment == 0:
        # Balance mode: 0 is accepted as well because it compares equal to False.
        diffs = [maxLength - len(df_label) for df_label in split_list]
    elif isinstance(augment, int) and not isinstance(augment, bool):
        if augment < 0:
            raise ValueError("augment must not be negative")
        new_count = _proportional_quotas(data[label_column], augment)
        diffs = [new_count[label_name] for label_name in label_names]
    else:
        raise TypeError("augment must be False, a dict or an int")

    augmented_list = [
        label_split(df_label, diff, label_name, cd, label_column, preserve)
        for df_label, diff, label_name in zip(split_list, diffs, label_names)
    ]

    # One concat over all classes; no empty seed frame is needed.
    return pd.concat(augmented_list, axis=0)
     

# This is where you begin coding!
- Below: only change the upper bound in --> for i in range(0, 24):
- 24 is the number of feature columns in your dataset (every column except the label column)

# Run HAT Augmentation

In [16]:
# Start timing the execution (do not change this); the elapsed time is printed
# at the end of this cell.
start_time = time.time()

# Build cd, the list of per-feature type codes handed to class_balance:
# 'c' = continuous, 'd' = discrete, one entry per feature column.
# Every feature is marked continuous here.
# NOTE(review): the uniform dataset cell converts columns 2, 7, 10 and 15 to
# category; with 'c' they are still sampled as continuous - confirm intended.
cd = []
for i in range(0, 24):  # set the upper bound to your number of feature columns
    cd.append('c')
print(cd)

# train_combined is the training DataFrame built in the dataset-loading cell
label_column_index = 24  # Position of the label column
label_column_name = train_combined.columns[label_column_index]

# Append randomly drawn rows of the class-balanced data to a dataframe
def augment_data(a_df, label_column, cd, num_new_rows):
    """Return *a_df* followed by *num_new_rows* rows drawn from its balanced version.

    class_balance is run with augment=False and preserve=True, then
    num_new_rows rows are sampled from its result with replacement and
    appended to a_df under a fresh index.
    """
    balanced = class_balance(a_df, label_column=label_column, cd=cd, augment=False, preserve=True)
    extra_rows = balanced.sample(n=num_new_rows, replace=True)
    return pd.concat([a_df, extra_rows], ignore_index=True)

# Ask how many rows should be appended to the training data
num_new_rows = int(input("Enter the number of new rows to augment: "))

# a_df = train_combined followed by num_new_rows sampled rows
a_df = augment_data(train_combined, label_column=label_column_name, cd=cd, num_new_rows=num_new_rows)

# Print the time elapsed since start_time was set at the top of this cell
print("\n\n\n>>>>>>>>> %s seconds " % (time.time() - start_time))

# Print new augmented dataframe
print("----------------------------- Augmented DataFrame -------------------------\n")
print(a_df)


['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c']


Enter the number of new rows to augment:  343


0
Existing data: 138
New data to be produced: 0
0 0

New data generated: 0 
New data: 138 

1
Existing data: 138
New data to be produced: 0
0 0

New data generated: 0 
New data: 138 

2
Existing data: 138
New data to be produced: 0
0 0

New data generated: 0 
New data: 138 

3
Existing data: 138
New data to be produced: 0
0 0

New data generated: 0 
New data: 138 

4
Existing data: 138
New data to be produced: 0
0 0

New data generated: 0 
New data: 138 

5
Existing data: 138
New data to be produced: 0
0 0

New data generated: 0 
New data: 138 

6
Existing data: 138
New data to be produced: 0
0 0

New data generated: 0 
New data: 138 

7
Existing data: 138
New data to be produced: 0
0 0

New data generated: 0 
New data: 138 

8
Existing data: 138
New data to be produced: 0
0 0

New data generated: 0 
New data: 138 

9
Existing data: 138
New data to be produced: 0
0 0

New data generated: 0 
New data: 138 

10
Existing data: 138
New data to be produced: 0
0 0

New data generated: 0 
New

  for label, df_label in data.groupby(label_column):


In [18]:
# Subset the augmented data into a new dataframe. The appended rows sit at the
# end of 'a_df', so slice from the tail (adjust the count as needed).
# Select the last 172 rows of 'a_df'
new_df = a_df.iloc[-172:]

# Save the new DataFrame 'new_df' as a CSV file (without the index column)
new_df.to_csv('uniform_HAT.csv', index=False)


In [20]:
# Display the subset that was written to the CSV file
new_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
445,0.647997,0.171927,0.0,0.579001,0.367139,0.045263,0.269086,1.0,0.066456,0.977853,...,1.0,0.544970,0.139710,0.798742,0.370587,0.922807,0.292901,0.083943,0.283849,0.0
446,0.096340,0.454912,1.0,0.913147,0.792143,0.135138,0.365667,1.0,0.424225,0.317371,...,1.0,0.721881,0.217003,0.063891,0.012188,0.688933,0.846744,0.393455,0.581004,0.0
447,0.322312,0.043735,0.0,0.202719,0.484040,0.009892,0.312036,1.0,0.124079,0.046886,...,0.0,0.244599,0.593536,0.202589,0.574037,0.144693,0.721238,0.704056,0.108447,0.0
448,0.093631,0.360886,0.0,0.713265,0.007734,0.383885,0.516563,1.0,0.851740,0.254615,...,1.0,0.840684,0.216404,0.103304,0.347502,0.790289,0.130810,0.489675,0.122571,0.0
449,0.000782,0.055668,1.0,0.671831,0.732918,0.319376,0.982384,1.0,0.131994,0.000773,...,0.0,0.791995,0.652979,0.576618,0.709723,0.818492,0.122903,0.320141,0.793839,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.745409,0.394048,1.0,0.872813,0.878936,0.458882,0.551148,1.0,0.577807,0.908525,...,1.0,0.221164,0.394317,0.393648,0.306864,0.869927,0.721142,0.876365,0.787747,1.0
613,1.090795,1.096865,1.0,0.418429,0.380409,0.245692,0.790996,0.0,0.902590,0.575708,...,1.0,0.615547,0.699440,1.140873,0.630426,0.664648,0.407087,0.411530,0.912365,1.0
614,0.674339,1.012849,1.0,1.063909,0.340770,0.841638,0.599698,1.0,0.314111,1.161395,...,0.0,0.922132,1.037371,0.687103,0.854592,0.488219,0.813583,0.318446,0.756748,1.0
615,0.530878,0.771605,0.0,0.118251,0.196508,0.859618,0.486844,0.0,0.241833,0.376559,...,0.0,0.418512,0.042595,0.440660,0.669904,0.597713,0.522333,0.824154,0.712669,0.0
