# **Data Preprocessing: Categorical Dataset**

Different machine learning methods require the data to be formatted in different ways.

Decision Trees work best with categorical data, while Neural Networks work best with data normalised to the range [0, 1].

This file takes the categorical dataset for lung cancer and creates several modified versions of the dataset to be used for model training and experimenting.

The choice of modifications is based on the analysis, the needs of different models and for experimental reasons

## Necessary Imports

In [25]:
import sys
assert sys.version_info >= (3, 5)

import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.preprocessing import MinMaxScaler

import numpy as np
import os
#import tarfile
#import urllib
import pandas as pd
#import urllib.request

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

## Loading in original dataset

In [10]:
# Folder that holds the raw (unprocessed) categorical lung-cancer CSV
LUNG_CATEGORICAL_PATH = os.path.join("..", "datasets", "2_categorical", "unprocessed")

def load_lung_categorical(lung_binary_path=LUNG_CATEGORICAL_PATH):
    """Read ``2_categorical.csv`` from a folder into a DataFrame.

    Parameters
    ----------
    lung_binary_path : str
        Directory containing ``2_categorical.csv``. Defaults to
        ``LUNG_CATEGORICAL_PATH``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents as parsed by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(os.path.join(lung_binary_path, "2_categorical.csv"))

# Load the raw CSV from the default location (LUNG_CATEGORICAL_PATH).
# Later cells modify and copy this frame, so it must be run first.
categorical_og = load_lung_categorical() 

categorical_og.head() #Display first five rows of the frame

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,Occupational Hazards,Genetic Risk,Chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


## Modifying dataset and saving as a file

#### Dropping unnecessary columns

In [11]:
# Drop the 'Patient Id' and 'index' columns (identifiers, treated as unnecessary here).
# The frame is reassigned instead of using inplace=True, and errors='ignore' makes a
# second execution of this cell a no-op rather than a KeyError on the missing columns.
categorical_og = categorical_og.drop(columns=['Patient Id', 'index'], errors='ignore')

# Verify if the columns were dropped
print("Columns after dropping 'Patient Id' and 'index':", categorical_og.columns)

Columns after dropping 'Patient Id' and 'index': Index(['Age', 'Gender', 'Air Pollution', 'Alcohol use', 'Dust Allergy',
       'Occupational Hazards', 'Genetic Risk', 'Chronic Lung Disease',
       'Balanced Diet', 'Obesity', 'Smoking', 'Passive Smoker', 'Chest Pain',
       'Coughing of Blood', 'Fatigue', 'Weight Loss', 'Shortness of Breath',
       'Wheezing', 'Swallowing Difficulty', 'Clubbing of Finger Nails',
       'Frequent Cold', 'Dry Cough', 'Snoring', 'Level'],
      dtype='object')


#### 1 - Original dataset with all numerical attributes

In [None]:
# Map the 'Level' column from strings to numerical values
level_mapping = {'Low': 1, 'Medium': 2, 'High': 3}

# Series.map turns every value that is not a key of the dict into NaN, so mapping an
# already-numeric column (i.e. re-running this cell) would wipe it out. Only map
# while the column still holds the string labels.
if not pd.api.types.is_numeric_dtype(categorical_og['Level']):
    categorical_og['Level'] = categorical_og['Level'].map(level_mapping)

# Fail loudly if a label was not covered by the mapping instead of saving NaNs later
assert categorical_og['Level'].notna().all(), "Unmapped values found in 'Level'"

# Display the first few rows to confirm the change
categorical_og.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,Occupational Hazards,Genetic Risk,Chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,1
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,2
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,3
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,3
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,3


Other attributes are already in numerical format

Save dataframe to file

In [24]:
# Specify the folder path
SAVE_PATH = os.path.join("..", "datasets", "2_categorical", "processed")

# Make sure the output folder exists; DataFrame.to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs(SAVE_PATH, exist_ok=True)

# Specify file names (to be saved)
og_file_path = os.path.join(SAVE_PATH, "2_og.csv")

# Check if the file already exists.
# An existing file is never overwritten: delete it manually to regenerate it
# after changing the preprocessing above.
if not os.path.exists(og_file_path):
    # Save the DataFrame if it doesn't exist
    categorical_og.to_csv(og_file_path, index=False)
    print(f"File saved at {og_file_path}")
else:
    print(f"File already exists at {og_file_path}")

File saved at ../datasets/2_categorical/processed/2_og.csv


#### 2 - Original dataset with features converted to string values
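Ordinal features are converted from integer codes to descriptive labels, e.g. for 8-level features [1,2,3,4,5,6,7,8] to ["None","Low","Medium Low","Moderate","Moderately High","High", "Very High", "Severe"] (7- and 9-level features use analogous scales), and 'Age' is binned into ['YOUNG ADULT','ADULT','OLDER ADULT','ELDERLY']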

In [None]:
# Copy the original dataframe so the numeric version stays untouched
categorical_string = categorical_og.copy()
# NOTE(review): 'Level' is copied as-is from categorical_og, where an earlier cell
# converts it to 1/2/3, so it stays numeric in this version - confirm that is intended.

# Define mappings for features with 9 discrete values
nine_value_mapping = {
    1: "None",
    2: "Very Low",
    3: "Low",
    4: "Medium Low",
    5: "Moderate",
    6: "Moderately High",
    7: "High",
    8: "Very High",
    9: "Severe"
}

# Define mappings for features with 8 discrete values
eight_value_mapping = {
    1: "None",
    2: "Low",
    3: "Medium Low",
    4: "Moderate",
    5: "Moderately High",
    6: "High",
    7: "Very High",
    8: "Severe"
}

# Define mappings for features with 7 discrete values
seven_value_mapping = {
    1: "None",
    2: "Low",
    3: "Medium Low",
    4: "Moderate",
    5: "Moderately High",
    6: "High",
    7: "Severe"
}

# Columns grouped by the number of discrete values they take
nine_value_columns = ['Chest Pain', 'Coughing of Blood', 'Fatigue', 'Shortness of Breath','Clubbing of Finger Nails']
eight_value_columns = ['Air Pollution', 'Alcohol use', 'Dust Allergy', 'Occupational Hazards', 'Smoking', 'Passive Smoker', 'Wheezing','Swallowing Difficulty','Weight Loss']
seven_value_columns = ['Genetic Risk', 'Chronic Lung Disease', 'Balanced Diet', 'Obesity', 'Frequent Cold', 'Dry Cough', 'Snoring']

# Apply each mapping to its column group.
# Series.map is used column by column instead of DataFrame.applymap, which is
# deprecated since pandas 2.1; Series.map works on old and new pandas alike.
for columns, mapping in (
    (nine_value_columns, nine_value_mapping),
    (eight_value_columns, eight_value_mapping),
    (seven_value_columns, seven_value_mapping),
):
    categorical_string[columns] = categorical_string[columns].apply(
        lambda column, mapping=mapping: column.map(mapping)
    )
    # A code outside the mapping would silently become a missing value; surface it here
    assert categorical_string[columns].notna().all().all(), (
        f"Unmapped values found in columns: {columns}"
    )

# Bin 'Age' into 4 categories: ['YOUNG ADULT', 'ADULT', 'OLDER ADULT', 'ELDERLY']
# With an integer `bins`, pd.cut splits the observed age range into equal-width
# intervals, so the category boundaries depend on the min/max age in the data.
age_bins = pd.cut(categorical_string['Age'], bins=4, labels=['YOUNG ADULT', 'ADULT', 'OLDER ADULT', 'ELDERLY'])
categorical_string['Age'] = age_bins

# Map 'Gender' column: 1 to 'M' and 2 to 'F'. This is a guess, as the number for gender is not specified in the dataset
categorical_string['Gender'] = categorical_string['Gender'].map({1: 'M', 2: 'F'})

# Set display option to show all columns (global setting: affects every later cell)
pd.set_option('display.max_columns', None)

# Display the first few rows of the transformed dataset
categorical_string.head()


Save this version to file

In [None]:
# Destination of the string-valued version of the dataset
str_file_path = os.path.join(SAVE_PATH, "2_str.csv")

# An existing export is left untouched and only reported; otherwise write it
if os.path.exists(str_file_path):
    print(f"File already exists at {str_file_path}")
else:
    categorical_string.to_csv(str_file_path, index=False)
    print(f"File saved at {str_file_path}")


File saved at ../datasets/2_categorical/processed/2_str.csv


#### 3 - dataset with values normalized to between 0 and 1

In [27]:
# Initialize MinMaxScaler to scale features to a range of [0, 1]
scaler = MinMaxScaler()

# Fit and transform all columns (this includes the 'Level' target column).
# The scaled values are placed in a brand-new DataFrame instead of being written into
# an integer-typed copy with `df[:] = ...`: assigning floats into int64 columns is
# deprecated since pandas 2.1 and left columns such as 'Gender' with an integer dtype.
# Every column of the result is float64; categorical_og itself is not modified.
categorical_nrml = pd.DataFrame(
    scaler.fit_transform(categorical_og),
    columns=categorical_og.columns,
    index=categorical_og.index,
)

# Display the first few rows of the normalized dataset to confirm
categorical_nrml.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,Occupational Hazards,Genetic Risk,Chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0.322034,0,0.142857,0.428571,0.571429,0.428571,0.333333,0.166667,0.166667,0.5,...,0.25,0.428571,0.125,0.142857,0.285714,0.0,0.166667,0.333333,0.5,0.0
1,0.050847,0,0.285714,0.0,0.571429,0.285714,0.5,0.166667,0.166667,0.166667,...,0.0,0.285714,0.75,1.0,0.714286,0.125,0.0,1.0,0.166667,0.5
2,0.355932,0,0.428571,0.571429,0.714286,0.571429,0.666667,0.5,0.833333,1.0,...,0.875,0.857143,1.0,0.142857,0.0,0.375,0.833333,1.0,0.166667,1.0
3,0.389831,0,0.857143,0.857143,0.857143,0.857143,0.833333,1.0,1.0,1.0,...,0.375,0.142857,0.25,0.0,0.428571,0.5,0.833333,1.0,0.666667,1.0
4,0.542373,0,0.714286,1.0,0.857143,0.857143,1.0,0.833333,1.0,1.0,...,0.25,0.142857,0.375,0.0,0.428571,0.125,0.5,0.166667,0.333333,1.0


Save this version to file

In [29]:
# Destination of the min-max normalised version of the dataset
nrml_file_path = os.path.join(SAVE_PATH, "2_nrml.csv")

# An existing export is left untouched and only reported; otherwise write it
if os.path.exists(nrml_file_path):
    print(f"File already exists at {nrml_file_path}")
else:
    categorical_nrml.to_csv(nrml_file_path, index=False)
    print(f"File saved at {nrml_file_path}")

File saved at ../datasets/2_categorical/processed/2_nrml.csv
