## Reduce Multi-class Dataset to Binary Dataset  
This notebook splits our original multi-class dataset into 2-class datasets, one for each pair of classes.  
Doing so reduces our multi-class classification problem to a set of binary classification problems.

In [62]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import time
from pathlib import Path
import os

### Set Up ```sys``` Path to Enable ```.py``` Imports

In [63]:
# Treat the parent of the current directory as the project root and register it
# on sys.path (at position 1) so that project-level .py modules can be imported.
path = Path.cwd()
path_to_project_directory = path.parent
project_directory_str = str(path_to_project_directory)
sys.path.insert(1, project_directory_str)
print(f"The working directory has been set to: {project_directory_str}")

The working directory has been set to: /Users/nelsonfarrell/Documents/Northeastern/5220/final_project


### Import ```.py``` Modules

In [None]:
# None

### Helper Functions

In [29]:
def get_all_possible_binary_datasets(original_df:pd.DataFrame, label:str) -> dict:
    """
    Builds every pairwise (2-class) subset of a multi-class data frame.

    For each unordered pair of distinct, non-missing values found in the label
    column, the rows whose label equals either value are collected into one
    data frame.

    Args:
        * original_df: (pd.DataFrame) - The original multiclass data frame.
        * label: (str) - The column name of the label column.

    Returns:
        * dict: Maps "filtered_df_<label_1>_<label_2>" to the data frame holding
          only the rows of those two classes. Within a key, labels are ordered by
          first appearance in ``original_df``. Fewer than two distinct labels
          yields an empty dict.
    """
    label_values = original_df[label]
    # Missing labels never compare equal to anything, so they cannot form a class
    # of their own; dropping them avoids producing one-class "pairs".
    segmentation_values = list(label_values.dropna().unique())
    results = {}
    for i, label_1 in enumerate(segmentation_values):
        # Only pair with later values so each unordered pair is produced once.
        for label_2 in segmentation_values[i + 1:]:
            in_pair = (label_values == label_1) | (label_values == label_2)
            results[f"filtered_df_{label_1}_{label_2}"] = original_df[in_pair]
    return results

### Params

In [60]:
# Input: raw multi-class data, relative to the current working directory.
path_to_original_data = "../data/raw_from_kaggle/Train.csv"
label_col_name = "Segmentation"

# Output: folder (relative to the project directory) that receives the binary datasets.
new_data_folder_name = "data/filtered_datasets"

# Output file names, one per pair of classes.
data_set_D_A = "data_set_D_A.csv"
data_set_D_B = "data_set_D_B.csv"
data_set_D_C = "data_set_D_C.csv"
data_set_A_B = "data_set_A_B.csv"
data_set_A_C = "data_set_A_C.csv"
data_set_B_C = "data_set_B_C.csv"


### Read in Original Data

In [61]:
# Load the raw multi-class data set from the configured CSV path.
original_df = pd.read_csv(filepath_or_buffer=path_to_original_data)

### Generate All Possible Binary Datasets

In [32]:
# One data frame per pair of classes, keyed as "filtered_df_<label_1>_<label_2>".
results_dict = get_all_possible_binary_datasets(
    original_df=original_df,
    label=label_col_name,
)

### Create Destination Folder If It Does Not Exist

In [41]:
# Create the output folder (and any missing parents) under the project directory;
# an already existing folder is left untouched.
destination_folder = os.path.join(str(path_to_project_directory), new_data_folder_name)
os.makedirs(destination_folder, exist_ok=True)

### Save Binary Datasets as ```csv``` Files  
There will be 6 binary datasets, one for each pair of the 4 segments (A, B, C, and D).

In [43]:
filtdered_df_list = [data_set_D_A, data_set_D_B, data_set_D_C, data_set_A_B, data_set_A_C, data_set_B_C]

# Index the generated datasets by the pair of classes they actually contain, so
# that each file name is matched on content rather than on the dictionary's
# insertion order (which depends on the row order of the raw CSV).
datasets_by_classes = {
    frozenset(binary_df[label_col_name].unique()): binary_df
    for binary_df in results_dict.values()
}

for dataset in filtdered_df_list:
    # "data_set_D_A.csv" -> {"D", "A"}; assumes the class labels are the last two
    # underscore-separated tokens of the file name and are stored as strings.
    classes = frozenset(Path(dataset).stem.split("_")[-2:])
    if classes not in datasets_by_classes:
        raise KeyError(f"No binary dataset with classes {sorted(classes)} found for file '{dataset}'")
    df = datasets_by_classes[classes]
    # Note: the data frame index is written as the first CSV column (pandas default).
    df.to_csv(os.path.join(path_to_project_directory, new_data_folder_name, dataset))