In [10]:
import pandas as pd
import numpy as np
from itertools import islice

# Loading Data

In [11]:
def readFile(filename: str):
    '''
    Loads a csv file into a Pandas dataframe
    
    The dataset is recognised by a tag appearing anywhere in the filename.
    Recognised datasets get their column headers assigned, and some of them
    also have their non-feature columns removed. When no tag matches, 
    names=None is handed to pandas.
    
    @param filename: name of the csv file to be converted to dataframe
    '''
    
    #Column headers per dataset tag. Order matters: the first tag found in the filename wins
    headers_by_tag = {
        'breast-cancer-wisconsin': [
            'Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
            'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size',
            'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class',
        ],
        'car': [
            'buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'acceptability',
        ],
        'house-votes-84': [
            'Class Name', 'handicapped-infants', 'water-project-cost-sharing',
            'adoption-of-the-budget-resolution', 'physician-fee-freeze', 'el-salvador-aid',
            'religious-groups-in-schools', 'anti-satellite-test-ban',
            'aid-to-nicaraguan-contras', 'mx-missile', 'immigration',
            'synfuels-corporation-cutback', 'education-spending', 'superfund-right-to-sue',
            'crime', 'duty-free-exports', 'export-administration-act-south-africa',
        ],
        'abalone': [
            'Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
            'Viscera weight', 'Shell weight', 'Rings',
        ],
        'machine': [
            'vendor name', 'Model', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX',
            'PRP', 'ERP',
        ],
    }
    
    #Identifier-style columns that are not features, per dataset tag
    non_feature_columns_by_tag = {
        'breast-cancer-wisconsin': ['Sample code number'],
        'machine': ['vendor name', 'Model'],
    }
    
    #Picks the headers of the first matching tag, or None when nothing matches
    header_columns = next(
        (names for tag, names in headers_by_tag.items() if tag in filename),
        None,
    )
    
    #Reads csv file and converts to dataframe
    file_data = pd.read_csv(filename, index_col=False, names=header_columns)
    
    #Each drop rule is checked on its own, independently of which headers were chosen
    for tag, unwanted in non_feature_columns_by_tag.items():
        if tag in filename:
            file_data = file_data.drop(columns=unwanted)
    
    return file_data

# Handling Missing Values

In [12]:
def ImputeMissingValues(dataset, false_missing_values: bool, missing_attribute_value = '?', missing_attribute_replacement = None, contains_missing_values = False):
    '''
    Substitutes the missing-value marker in every column where it occurs
    
    The dataframe passed in is modified column by column and also returned.
    
    @param dataset: dataframe to handle missing data for
    @param false_missing_values: when True, the marker is a legitimate value rather than missing data and is 
                                    swapped for missing_attribute_replacement; otherwise the marker is swapped 
                                    for the column mean (truncated to an integer)
    @param missing_attribute_value: the representation of the missing value in the dataset. Default set to '?'
    @param missing_attribute_replacement: value written in place of the marker (only used when 
                                        false_missing_values is True)
    @param contains_missing_values: must be True for any work to be done; otherwise the dataset is 
                                    returned untouched
    '''
    #Nothing to do unless the caller flagged the dataset as containing the marker
    if not (contains_missing_values == True):
        return dataset
    
    marker_is_real_value = (false_missing_values == True)
    
    for name in dataset:
        #Columns without the marker are left alone
        if missing_attribute_value not in dataset[name].values:
            continue
        
        if marker_is_real_value:
            #The marker is not missing data, so the caller-supplied value takes its place
            fill_value = missing_attribute_replacement
        else:
            #Non-numeric entries (the marker included) are coerced to NaN and so ignored by the mean
            numeric_view = pd.to_numeric(dataset[name], errors='coerce')
            fill_value = int(numeric_view.mean())
        
        dataset[name] = dataset[name].replace(missing_attribute_value, fill_value)
    
    return dataset
    
    

# Handling Categorical Data

In [13]:
def EncodeOrdinalData(dataset, ordinal_mapping = None):
    '''
    Encodes ordinal data using integer mapping 
    
    The listed columns of the dataframe passed in are overwritten, and the 
    same dataframe is returned.
    
    @param dataset: dataframe to be encoded
    @param ordinal_mapping: a nested dictionary {column name: {original value: encoded value}}.
                            (When None, no ordinal data is assumed and the dataset is returned unchanged.)
    '''
    
    #No mapping supplied means there is nothing to encode
    if ordinal_mapping is None:
        return dataset
    
    for attribute, value_map in ordinal_mapping.items():
        #An empty mapping leaves the column untouched
        if not value_map:
            continue
        #All pairs are applied in one replace call so that an encoded value which happens to 
        #equal another key is not re-encoded (replacing key by key would chain the mappings)
        dataset[attribute] = dataset[attribute].replace(dict(value_map))
    return dataset
    
            
    

In [14]:
def EncodeNominalData(dataset, nominal_column_names=None):
    '''
    Encodes nominal data using one-hot encoding
    
    For every listed column, one 0/1 column is added per unique value, named 
    after that value. The indicator columns are added to the dataframe passed in; 
    the returned dataframe is a new one with the original nominal columns dropped.
    
    @param dataset: dataframe to be encoded
    @param nominal_column_names: a list of column names containing nominal data. When None, 
                                 no nominal data is assumed and the dataset is returned unchanged.
    '''
    if nominal_column_names is None:
        return dataset
    
    for column in nominal_column_names:
        #One indicator column per distinct value, in order of first appearance
        for value in dataset[column].unique():
            #Assigning the whole column directly (rather than masking a selected column in place) 
            #keeps the write effective when pandas Copy-on-Write is active
            #NOTE(review): indicator columns are named by the bare value, so a value shared by two 
            #nominal columns (or equal to an existing column name) overwrites the earlier column
            dataset[value] = (dataset[column] == value).astype('int64')
    
    #Drops the original nominal columns
    return dataset.drop(columns=nominal_column_names)

# Discretization

In [15]:
def Discretize(dataset, num_bins, discretization_type):
    '''
    Discretizes the data into groups
    
    'frequency': every column is split, in its existing row order, into num_bins 
    near-equal parts and the mean of each part is stored. Returns a dataframe with 
    one row per bin and one column per input column.
    
    'width': returns a dictionary mapping the string form of each bin's [start, end] 
    range to the list of column values found in that range.
    NOTE(review): in this mode the function returns after the first column, bins 
    start at 0 with integer-truncated edges, and only values equal to an integer 
    in [start, end) are collected - confirm whether this is intended.
    
    @param dataset: dataframe to be discretized
    @param num_bins: number of groups to separate the data into
    @param: discretization_type: either "frequency" or "width". Indicates whether to perform equal-width 
                                discretization or equal-frequency discretization
    @raises ValueError: when discretization_type is neither "frequency" nor "width"
    '''
    #Rejects an unknown mode up front instead of failing later on an unassigned variable
    if discretization_type not in ('width', 'frequency'):
        raise ValueError(
            "discretization_type must be 'width' or 'frequency', got %r" % (discretization_type,)
        )
    
    #Initializing the dataframe which will be used to store and return the final results
    discretized_df = pd.DataFrame()
    
    #Loops through each column to perform discretization on each feature
    for column in dataset:
        
        #Equal-width discretization
        if discretization_type == 'width':
            #Calculates the size of each bin by using range of data divided by number of bins
            max_val = dataset[column].max()
            min_val = dataset[column].min()
            bin_size = (max_val - min_val)/num_bins
            
            #Maps the string form of each bin range to the values collected for it
            bin_ranges = {}
            count = 0
            
            for _ in range(num_bins):
                #Bin edges are truncated to integers; each bin begins where the previous one ended
                current_range = [int(count), int(count + bin_size)]
                members = range(current_range[0], current_range[1])
                bin_ranges[str(current_range)] = [
                    value for value in dataset[column] if value in members
                ]
                count = current_range[1]
            return bin_ranges
        
        #Equal-frequency discretization
        #Splits the data evenly into as many parts as the specified number of bins
        bins = np.array_split(dataset[column].to_numpy(), num_bins)
        #Stores the average of each bin as the new values of the feature
        discretized_df[column] = [part.mean() for part in bins]
    return discretized_df

# Standardization

In [16]:
def Standardize(train_data,test_data):
    '''
    Standardize training data using z-score standardization and apply to test data
    
    The mean and standard deviation are computed on the training data only and 
    then used to transform both sets. Both dataframes are modified in place and 
    also returned.
    
    @param train_data: training data
    @param test_data: test data; must contain every column of train_data
    @return: tuple (train_data, test_data) holding the standardized values
    '''
    for feature in train_data:
        #Statistics come from the training data only so no test information leaks in
        avg = train_data[feature].mean()
        std = train_data[feature].std()
        
        #A constant feature has std 0 and a single training row has an undefined std; 
        #dividing by 1 instead keeps the result finite (the feature is only centred)
        if pd.isna(std) or std == 0:
            std = 1.0
        
        #Change each column to standardized data
        train_data[feature] = (train_data[feature] - avg)/std
        test_data[feature] = (test_data[feature] - avg)/std
        
    return train_data,test_data