# Wisconsin BCa Preprocess

Preprocessing of the Wisconsin BCa data, based on [this Kaggle notebook](https://www.kaggle.com/code/merturper/breast-cancer-outliers-pca-nca#Correlation-Analysis).

## Read and Check Data

In [None]:
import pandas as pd

# Location of the raw CSV, relative to this notebook's working directory.
data_path = '../../data/wisconsin.csv'
# Load the CSV into the DataFrame that the rest of the notebook works on.
df = pd.read_csv(data_path)
# Preview the first rows as the cell output.
df.head()

In [None]:
# Print a concise summary of the DataFrame.
df.info()

In [None]:
# `diagnosis_01` is the target used downstream, so the `diagnosis` column is
# discarded together with the `id` column.
df.drop(columns=['id', 'diagnosis'], inplace=True)

In [None]:
# Descriptive statistics, transposed so that each column of df becomes a row.
df.describe().T


In [None]:
# count the missing values in each column
df.isna().sum()

## Distribution of Features

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Plot the distribution of the columns of df on a 10 x 3 grid of subplots.
#
# Warnings are silenced only inside the `with` block: `catch_warnings()`
# restores the previous filter configuration on exit, whereas a trailing
# `warnings.filterwarnings('default')` installs a new blanket filter instead
# of undoing the temporary 'ignore'.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases;
# consider migrating to `sns.histplot(..., kde=True)` once the target seaborn
# version is confirmed.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    plt.figure(figsize=(14, 20))

    # The grid has 30 slots, so at most the first 30 columns are drawn.
    for plotnumber, column in enumerate(df.columns[:30], start=1):
        plt.subplot(10, 3, plotnumber)
        sns.distplot(df[column])
        plt.xlabel(column)

    plt.tight_layout()
    plt.show()

## Correlation Analysis

"Feature diversity is important in model training. However, as you can see in the heatmap, there are columns that are highly correlated with many features other than the target variable. We can remove these features."

In [None]:
# Let's check the correlation between the variables 
import numpy as np

# Pairwise correlations of all columns; the upper triangle (diagonal included)
# is masked out because it mirrors the lower one.
corr = df.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(20, 12))
sns.heatmap(
    corr,
    mask=mask,
    linewidths=1,
    annot=True,
    fmt=".2f",
    cmap='YlGnBu',
)
plt.show()

In [None]:
# Columns whose absolute correlation with at least one later column exceeds
# the 0.96 threshold.
corr_matrix = df.corr().abs()

# Hide the upper triangle and the diagonal so that every pair is tested once
# and a column is never compared with itself.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
tri_df = corr_matrix.mask(mask)

exceeds_threshold = (tri_df > 0.96).any(axis=0)
high_corr = exceeds_threshold[exceeds_threshold].index.tolist()
high_corr

In [None]:
# Four features, picked by comparing `high_corr` with the correlation heatmap,
# are removed from the frame.
redundant_features = ["perimeter_mean", "area_mean", "radius_mean", "radius_worst"]
df = df.drop(columns=redundant_features)

In [None]:
# Same heatmap as before, recomputed now that the selected features are gone.
corr = df.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(20, 12))
sns.heatmap(
    corr,
    mask=mask,
    linewidths=1,
    annot=True,
    fmt=".2f",
    cmap='YlGnBu',
)
plt.show()

In [None]:
# Bar chart of every feature's correlation with the `diagnosis_01` target.
# The trailing semicolon keeps the notebook from echoing the Axes object.
features_only = df.drop(columns='diagnosis_01')
target_corr = features_only.corrwith(df['diagnosis_01'])
target_corr.plot(kind='bar', grid=True, figsize=(12, 10), title="Correlation with target");

## Outlier-1 (boxplot method)

Drop outliers

In [None]:
# standardize (scale) the features so their outliers can be observed in a single graph
from sklearn.preprocessing import StandardScaler

# Create the scaler with default settings; the bare name on the last line
# displays it as the cell output.
scaler = StandardScaler()
scaler

In [None]:
# Fit the scaler on every column of df except the first one and scale them.
# NOTE(review): `iloc[:, 1:]` assumes `diagnosis_01` is the first column of df
# — confirm; otherwise the target is scaled and the first feature is left out.
scaled_array = scaler.fit_transform(df.iloc[:,1:])
scaled_array

In [None]:
# Wrap the scaled array in a DataFrame labelled with the columns that were
# scaled (every column of df except the first).
# Reusing df's index keeps the rows aligned by label when `diagnosis_01` is
# concatenated back along axis 1, even if df does not carry a default
# 0..n-1 index.
df1 = pd.DataFrame(scaled_array, columns=df.columns[1:], index=df.index)

In [None]:
# Append the untouched `diagnosis_01` column to the scaled features and
# preview the first three rows.
target = df["diagnosis_01"]
df1 = pd.concat([df1, target], axis="columns")
df1.head(3)

In [None]:
from collections import Counter 

def detect_outliers(df, features, iqr_multiplier=1.5, outlier_count_threshold=9):
    """Return the index labels of rows that are IQR outliers in many features.

    A value counts as an outlier for a feature when it lies more than
    ``iqr_multiplier * IQR`` below the first quartile or above the third
    quartile of that feature.

    Args:
        df: DataFrame holding the data to inspect.
        features: Iterable of column names of ``df`` to check.
        iqr_multiplier: Width of the outlier fence in IQR units. The default
            of 1.5 is the usual box-plot whisker rule.
        outlier_count_threshold: A row is reported only when it is an outlier
            in strictly more than this many features.

    Returns:
        list: Index labels of the reported rows, ordered by first detection.
    """
    # Maps an index label to the number of features in which it is an outlier.
    flagged_counts = Counter()

    for feature in features:
        values = df[feature]
        # 1st and 3rd quartiles of the feature
        q1, q3 = np.percentile(values, [25, 75])
        # Distance beyond the quartiles at which a value becomes an outlier
        outlier_step = (q3 - q1) * iqr_multiplier
        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        # Record the indices of the outlying rows for this feature
        flagged_counts.update(values.index[is_outlier])

    # Keep only the rows flagged in more than `outlier_count_threshold` features
    return [
        label
        for label, count in flagged_counts.items()
        if count > outlier_count_threshold
    ]

In [None]:
# distribution before removing outliers
#
# Warnings are silenced only inside the `with` block: `catch_warnings()`
# restores the previous filter configuration on exit, whereas a trailing
# `warnings.filterwarnings('default')` installs a new blanket filter instead
# of undoing the temporary 'ignore'.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # Long format: one row per (sample, feature) pair, so that all features
    # can share a single box-plot axis.
    data_melted = pd.melt(df1, id_vars="diagnosis_01", var_name="features", value_name="value")

    plt.figure(figsize=(14, 8))
    sns.boxplot(x="features", y="value", hue="diagnosis_01", data=data_melted)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Show the rows that `detect_outliers` flags.

# Every column except the diagnosis target is inspected.
not_diagnosis_cols = df1.columns[df1.columns != 'diagnosis_01'].tolist()
outlier_rows = detect_outliers(df1, not_diagnosis_cols)
df1.loc[outlier_rows]

In [None]:
# Remove the flagged rows, renumber the index from zero and report the new shape.
rows_to_drop = detect_outliers(df1, not_diagnosis_cols)
df1 = df1.drop(index=rows_to_drop).reset_index(drop=True)
df1.shape

In [None]:
# distribution after removing outliers
# Long format: one row per (sample, feature) pair for a shared box-plot axis.
data_melted = df1.melt(id_vars="diagnosis_01", var_name="features", value_name="value")

plt.figure(figsize=(14, 8))
sns.boxplot(data=data_melted, x="features", y="value", hue="diagnosis_01")
plt.xticks(rotation=90)
plt.show()

## Upload `df1` as a new version

Upload as a new version of the dataset ready for training the models

Source: https://learn.microsoft.com/en-us/azure/machine-learning/tutorial-explore-data?view=azureml-api-2#create-a-new-version-of-the-data-asset

In [None]:
# save df1 to file system
import os

# Make sure the output directory exists, then write the box-plot-cleaned frame.
boxplot_parquet_path = "./data/cleaned-wisconsin-boxplot.parquet"
os.makedirs('./data', exist_ok=True)
df1.to_parquet(boxplot_parquet_path)


## Outlier-2 (LOF Method)

In [None]:
# Split df into the feature matrix X and the `diagnosis_01` target y,
# and remember the feature names.
X = df.drop(columns="diagnosis_01")
y = df["diagnosis_01"]
columns = list(X.columns)

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Fit a Local Outlier Factor model with default settings on X
# (the features taken from df, i.e. not the scaled df1).
clf = LocalOutlierFactor()
y_pred = clf.fit_predict(X)
# Per-row scores of the fitted model; a later cell thresholds them at -2.
X_score = clf.negative_outlier_factor_

In [None]:
# One LOF score per row of X. The frame shares X's index so that labels taken
# from it can be passed straight to `X.drop(...)` / `y.drop(...)`; a default
# 0..n-1 index would only be correct while X itself is numbered that way.
outlier_score = pd.DataFrame({"score": X_score}, index=X.index)

In [None]:
# Rows whose score is below the threshold are treated as outliers.
threshold = -2
filtre = outlier_score["score"].lt(threshold)
outlier_index = filtre[filtre].index.tolist()
outlier_index

In [None]:
# Remove the LOF outliers from the features and from the target
# (the target is kept as a plain array afterwards).
X = X.drop(index=outlier_index)
y = y.drop(index=outlier_index).values

In [None]:
# Re-attach the target to the filtered features, matching rows by index label.
dfY = df["diagnosis_01"].drop(index=outlier_index)
df2 = X.merge(dfY, left_index=True, right_index=True)

## Upload df2 as a new version

In [None]:
# save df2 to file system
import os

# Make sure the output directory exists, then write the LOF-cleaned frame.
lof_parquet_path = "./data/cleaned-wisconsin-lof.parquet"
os.makedirs('./data', exist_ok=True)
df2.to_parquet(lof_parquet_path)

In [None]:
# Preview the first rows of the LOF-cleaned frame.
df2.head()