In [1]:
%run DataPreProcessing.ipynb

# Stratification

In [2]:
def Stratify(dataset, class_label, num_folds):
    '''
    Splits a classification dataset into folds that keep the class ratios of the original data.
    
    The rows of every class are shuffled and dealt out across the folds so that each fold receives
    either floor(n/num_folds) or ceil(n/num_folds) rows of that class. Every row of the dataset ends up
    in exactly one fold; leftover rows are handed out round-robin across classes so that the total
    fold sizes differ by at most one row.
    
    @param dataset: dataframe
    @param class_label: name of class column
    @param num_folds: number of folds
    
    @return: dict mapping fold number (1..num_folds) to the dataframe of rows assigned to that fold
    
    '''
    #Pieces collected for every fold, one piece per class
    fold_parts = {fold: [] for fold in range(1, num_folds + 1)}
    
    #Gets all the unique values of the class column, in order of first appearance
    classes = dataset[class_label].unique()
    
    #Grouping is done once up front instead of once per class value
    grouped = dataset.groupby(dataset[class_label])
    
    #Zero-based position of the fold that receives the next leftover row
    next_extra = 0
    
    for key in classes:
        #Shuffles the rows of the current class so that contiguous slices are random samples
        shuffled = grouped.get_group(key).sample(frac=1)
        
        #Every fold gets base_size rows; 'remainder' folds get one additional row
        base_size, remainder = divmod(len(shuffled), num_folds)
        extra_folds = {(next_extra + offset) % num_folds for offset in range(remainder)}
        next_extra = (next_extra + remainder) % num_folds
        
        start = 0
        for position in range(num_folds):
            size = base_size + (1 if position in extra_folds else 0)
            fold_parts[position + 1].append(shuffled.iloc[start:start + size])
            start += size
            
    #The per-class pieces are merged into one dataframe per fold.
    #pd.concat is used because DataFrame.append no longer exists in pandas 2.x,
    #and it cannot take an empty list (dataset without any class value).
    final_folds = {}
    for fold, parts in fold_parts.items():
        final_folds[fold] = pd.concat(parts) if parts else pd.DataFrame()
    return final_folds

# K Fold Cross Validation

In [3]:
def k_fold_cross_validation(num_folds: int, dataset, classification=False,class_label=None, tuning=False):
    '''
    Builds the train/test splits for k fold cross validation.
    
    For non-classification tasks the data is randomized and cut into num_folds consecutive test
    slices of int(len(data)/num_folds) rows each; the rows outside the test slice of a fold form its
    training data. Rows left over after the last slice are never used for testing, they are part of
    the training data of every fold.
    
    For classification tasks the folds come from Stratify, each fold is used once as test data and
    the remaining folds are merged into the training data.
    
    @param num_folds: number of folds
    @param dataset: dataframe
    @param classification: boolean value that indicates the task type is classification if True
    @param class_label: name of class column (only used for classification)
    @param tuning: if True, a random 20% of the data is removed before the folds are built.
                   Only applied for non-classification tasks; the removed rows are not returned.
    
    @return: dict mapping fold number (1..num_folds) to {'test': dataframe, 'train': dataframe}
    
    '''
    folds_dict = {}
    
    if classification != True:
        #Separates 20% of data when tuning; the separated rows are dropped from the folds
        if tuning == True:
            tuning_data = dataset.sample(frac=.2)
            dataset = dataset.drop(tuning_data.index)
            
        #randomizes dataset
        randomized_data = dataset.sample(frac=1)
        test_size = len(randomized_data) // num_folds
        
        #Iterates through each fold number, slicing out the test rows and using the remaining data for training
        for fold in range(1, num_folds + 1):
            #iloc is not inclusive of the second index, so 'end' is the start of the next fold
            start = test_size * (fold - 1)
            end = test_size * fold
            test_data = randomized_data.iloc[start:end]
            train_data = pd.concat([randomized_data.iloc[:start], randomized_data.iloc[end:]])
            folds_dict[fold] = {'test' : test_data, 'train' : train_data}
            
    #For classification tasks
    else:
        #Gets stratified data from Stratify function
        folds = Stratify(dataset, class_label, num_folds)
        
        #Each fold is the test data once, all other folds are merged into the training data
        for fold in range(1, num_folds + 1):
            other_folds = [folds[num] for num in range(1, num_folds + 1) if num != fold]
            #pd.concat replaces DataFrame.append (removed in pandas 2.x) and rejects an empty list,
            #which happens when there is only one fold
            train = pd.concat(other_folds) if other_folds else pd.DataFrame()
            folds_dict[fold] = {'test' : folds[fold], 'train' : train}
            
    return folds_dict
    
        
    

# K x 2 Cross Validation

In [4]:
def k_x_2_cross_validation(dataset, k, classification = False, class_label = None):
    '''
    K x 2 cross validation: repeats a 2 fold cross validation k times on freshly randomized data.
    In every repetition the data is split in half, each half is used once as test data while the
    other half is the training data.
    
    @param dataset: dataframe
    @param k: number of repetitions of the 2 fold split
    @param classification: boolean value that indicates the task type is classification if True
    @param class_label: name of class column
    
    @return: dict mapping repetition number (1..k) to the folds dict returned by
             k_fold_cross_validation for that repetition
    
    '''
    repetitions = {}
    completed = 0
    
    #Runs until k repetitions have been stored
    while completed < k:
        completed += 1
        
        #Every repetition starts from a new random ordering of the data
        shuffled = dataset.sample(frac=1)
        
        #The 2 fold split itself is delegated to the k-fold cross validation function
        if classification != True:
            two_fold_split = k_fold_cross_validation(2, shuffled)
        else:
            two_fold_split = k_fold_cross_validation(2, shuffled, classification = True, class_label=class_label)
            
        #Stores the test and train data of this repetition under its number
        repetitions[completed] = two_fold_split
        
    return repetitions