# Anomaly Detection Notebook: Chapter 1
## Date Started:   8 August 2022
## Latest Update: 13 June 2024

In [23]:
# Shell command: list the installed packages whose name contains "python"
! pip list | grep python

ipython                   8.12.3
python-dateutil           2.9.0.post0
python-json-logger        2.0.7
types-python-dateutil     2.9.0.20240316


## 0. Define Metadata / Main Variables

In [24]:
# --- Data locations (directory strings carry their trailing slash) ---
dataDirectory = "../01-Data/"
rawdataDirectory = "../01-Data/Raw/"
dataFile = "thyroid.mat"

# --- Target column and class naming ---
# Column label of the target class, defined once so later cells do not retype it
target_class_name = 6
# Display names for class 0 and class 1, in that order
labels = ['inliers', 'outliers']

# --- Export tag ---
# Date stamp intended for any exported artifacts
export_date = '202406'

### Key Variables
#### thyroidX - Individual test results
#### thyroidY - Final diagnostic result - Presence or absence of cancer 

## 1. Download data to directory - Done

## 2. Import relevant packages

In [25]:
import os
from collections import Counter

import dask
import matplotlib
import numpy as np
import pandas as pd
import scipy
import scipy.io as sio
import sklearn
from scipy import io
from scipy.io.arff import loadarff
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix 
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, average_precision_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import OneClassSVM

%matplotlib inline



## 3. Load Dataset

In [26]:
# Prefix the file name with the raw-data directory.
# The guard keeps this cell idempotent: re-running it must not prepend the
# directory a second time (which would produce a path that does not exist).
if not dataFile.startswith(rawdataDirectory):
    dataFile = rawdataDirectory + dataFile
print(dataFile)

../01-Data/Raw/thyroid.mat


In [27]:
# Read the .mat file; loadmat returns a dictionary keyed by variable name
thyroidD = sio.loadmat(dataFile)

## 4. Extract X and Y and Concatenate

### 4.1 Explore Dataset as Dictionary

In [28]:
# How many key-value pairs does the loaded dictionary hold?
len(thyroidD.keys())

5

In [29]:
# "Iterating over a dictionary produces its' keys"
# The numeric data live under the keys "X" and "y":
#   X - one row of diagnostic measurements per record
#   y - one single-element row per record holding the final diagnosis
list(thyroidD)

['__header__', '__version__', '__globals__', 'X', 'y']

In [33]:
# Walk the dictionary with items() and print every key next to its value
for entry_name, entry_value in thyroidD.items():
    print(entry_name, entry_value)

__header__ b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-05 13:11:25 UTC'
__version__ 1.0
__globals__ []
X [[7.74193548e-01 1.13207547e-03 1.37571157e-01 2.75700935e-01
  2.95774648e-01 2.36065574e-01]
 [2.47311828e-01 4.71698113e-04 2.79886148e-01 3.29439252e-01
  5.35211268e-01 1.73770492e-01]
 [4.94623656e-01 3.58490566e-03 2.22960152e-01 2.33644860e-01
  5.25821596e-01 1.24590164e-01]
 ...
 [9.35483871e-01 2.45283019e-02 1.60341556e-01 2.82710280e-01
  3.75586854e-01 2.00000000e-01]
 [6.77419355e-01 1.47169811e-03 1.90702087e-01 2.42990654e-01
  3.23943662e-01 1.95081967e-01]
 [4.83870968e-01 3.56603774e-03 1.90702087e-01 2.12616822e-01
  3.38028169e-01 1.63934426e-01]]
y [[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


In [34]:
# Pull the three MATLAB metadata entries out of the loaded dictionary in one step
thyroidHdr, thyroidVrsn, thyroidGlobals = (
    thyroidD[meta_key] for meta_key in ("__header__", "__version__", "__globals__")
)

# Show each metadata entry on its own line
print("thyroidHdr = ", thyroidHdr,"\n",
      "thyroidVrsn = ", thyroidVrsn,"\n",
      "thyroidGlobals = ", thyroidGlobals,"\n")

thyroidHdr =  b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-05 13:11:25 UTC' 
 thyroidVrsn =  1.0 
 thyroidGlobals =  [] 



In [35]:
# Materialise the (key, value) pairs as a list, then print each pair with labels
items = list(thyroidD.items())
for pair in items:
    key, value = pair
    print(f"Key: {key}, Value: {value}")

Key: __header__, Value: b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-05 13:11:25 UTC'
Key: __version__, Value: 1.0
Key: __globals__, Value: []
Key: X, Value: [[7.74193548e-01 1.13207547e-03 1.37571157e-01 2.75700935e-01
  2.95774648e-01 2.36065574e-01]
 [2.47311828e-01 4.71698113e-04 2.79886148e-01 3.29439252e-01
  5.35211268e-01 1.73770492e-01]
 [4.94623656e-01 3.58490566e-03 2.22960152e-01 2.33644860e-01
  5.25821596e-01 1.24590164e-01]
 ...
 [9.35483871e-01 2.45283019e-02 1.60341556e-01 2.82710280e-01
  3.75586854e-01 2.00000000e-01]
 [6.77419355e-01 1.47169811e-03 1.90702087e-01 2.42990654e-01
  3.23943662e-01 1.95081967e-01]
 [4.83870968e-01 3.56603774e-03 1.90702087e-01 2.12616822e-01
  3.38028169e-01 1.63934426e-01]]
Key: y, Value: [[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


### 4.2 Extract X and y

In [36]:
#thyroid_X = pd.DataFrame(thyroidD["X"])
#print(type(thyroid_X))
#thyroid_X

In [43]:
# Keep the feature matrix and the target vector (both numpy arrays) in their
# own variables for easy retrieval
thyroid_X, thyroid_y = thyroidD['X'], thyroidD['y']

# Display both shapes as the cell output
(thyroid_X.shape, thyroid_y.shape)

((3772, 6), (3772, 1))

### 4.3 Concatenate the numpy arrays and Inspect

In [42]:
# Stack the features and the target side by side (column-wise) and wrap the
# result in a DataFrame; columns get the default integer labels
thyroidDF = pd.DataFrame(np.hstack((thyroid_X, thyroid_y)))

# Column dtypes, non-null counts and memory usage
thyroidDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       3772 non-null   float64
 1   1       3772 non-null   float64
 2   2       3772 non-null   float64
 3   3       3772 non-null   float64
 4   4       3772 non-null   float64
 5   5       3772 non-null   float64
 6   6       3772 non-null   float64
dtypes: float64(7)
memory usage: 206.4 KB


#### Inspect rows

In [44]:
# Random sample of 5 records.
# random_state pins the draw so the same rows appear on every Restart & Run All.
thyroidDF.sample(5, random_state=42)

Unnamed: 0,0,1,2,3,4,5,6
2973,0.44086,0.002453,0.156546,0.191589,0.206573,0.214754,0.0
148,0.516129,0.001887,0.147059,0.212617,0.352113,0.159016,0.0
707,0.397849,0.000377,0.137571,0.107477,0.244131,0.106557,0.0
2199,0.333333,0.0,0.270398,0.317757,0.450704,0.193443,0.0
745,0.193548,0.034717,0.185009,0.240654,0.380282,0.168672,0.0


Let's check the head and tail to make sure nothing unexpected was loaded in the first or last rows.

In [45]:
# First three rows
thyroidDF.head(n=3)

Unnamed: 0,0,1,2,3,4,5,6
0,0.774194,0.001132,0.137571,0.275701,0.295775,0.236066,0.0
1,0.247312,0.000472,0.279886,0.329439,0.535211,0.17377,0.0
2,0.494624,0.003585,0.22296,0.233645,0.525822,0.12459,0.0


In [46]:
# Last three rows
thyroidDF.tail(n=3)

Unnamed: 0,0,1,2,3,4,5,6
3769,0.935484,0.024528,0.160342,0.28271,0.375587,0.2,0.0
3770,0.677419,0.001472,0.190702,0.242991,0.323944,0.195082,0.0
3771,0.483871,0.003566,0.190702,0.212617,0.338028,0.163934,0.0


#### No trouble with loading the data - both head and tail are clean

### Summary Stats

In [19]:
thyroidDF.describe()
# The min and max rows show that every column's values lie between 0 and 1

Unnamed: 0,0,1,2,3,4,5,0.1
count,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0,3772.0
mean,0.543121,0.008983,0.186826,0.248332,0.376941,0.177301,0.024655
std,0.20379,0.043978,0.070405,0.080579,0.087382,0.054907,0.155093
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.376344,0.001132,0.156546,0.203271,0.328638,0.14918,0.0
50%,0.569892,0.003019,0.190702,0.241822,0.375587,0.17377,0.0
75%,0.709677,0.004528,0.213472,0.28271,0.413146,0.196721,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Unique Value Checking

In [48]:
# Print the number of distinct values in each column.
# This loop failed earlier when two columns shared the label "0"; building the
# frame with np.concatenate (rather than a pandas concat) avoids the duplicate label.
for column_label in thyroidDF.columns:
    distinct_count = thyroidDF[column_label].nunique(dropna=False)
    print(column_label, distinct_count)

0 93
1 280
2 72
3 243
4 141
5 324
6 2


#### Each feature column has between 72 and 324 distinct values. The last column has only 2, as expected: it is the final diagnosis and should always be 0 or 1

#### Identifying Bad Columns

In [49]:
def find_bad_columns_function(dataframe, std_threshold=0.01, mode_fraction_threshold=0.99):
    '''
    Flag columns that may need to be fixed or dropped before modelling.

    Args:
        dataframe: pandas DataFrame to inspect. It is not modified.
        std_threshold: a numeric column is reported as quasi-constant when its
            standard deviation is strictly below this value. The comparison is
            made on the raw scale of the data, so it is only meaningful when
            the features are on a comparable scale.
        mode_fraction_threshold: a non-numeric column is reported as
            quasi-constant when its most frequent value fills strictly more
            than this fraction of all rows.

    Logic: four independent checks, each printing a one-line summary
        1. columns with at least one null
        2. object columns holding at least one value equal to a single space ' '
        3. numeric columns whose standard deviation is below std_threshold
        4. non-numeric columns dominated by their most frequent value

    Returns: 4 lists of column labels, each in dataframe column order:
        null_col_list, blank_space_col_list,
        constant_numeric_col_list, constant_non_numeric_col_list
    '''

    ###### Finding Null Values
    null_col_list = dataframe.columns[dataframe.isna().any()].tolist()

    print('Identified {} features with atleast one null'.format(
        len(null_col_list)))

    ###### Finding Blank Spaces in the object columns
    # Non-obvious nulls: entries that are exactly one space character
    object_columns = dataframe.select_dtypes(include=['object']).columns
    blank_space_col_list = [
        col for col in object_columns if dataframe[col].eq(' ').any()
    ]

    print('Identified {} features with at least one blank space'.format(
        len(blank_space_col_list)))

    ###### Finding Quasi Constant/Constant Value in numerical columns
    # Public replacement for the private DataFrame._get_numeric_data();
    # bool columns are selected as well, which is intended to match that call.
    numeric_columns = dataframe.select_dtypes(include=['number', 'bool']).columns
    # A NaN standard deviation (e.g. fewer than two non-null values) compares
    # False here, so such a column is not reported.
    constant_numeric_col_list = [
        col for col in numeric_columns if dataframe[col].std() < std_threshold
    ]

    print('Identified {} numeric features that have quasi-constant values'.format(
        len(constant_numeric_col_list)))

    ###### Finding Quasi Constant/Constant non_numeric value
    # A standard-deviation test does not apply to non-numeric data, so these
    # columns are judged by how much of the column their most frequent value fills.
    numeric_labels = set(numeric_columns)
    # Walk the columns in their original order so the result is deterministic
    # (iterating over a set difference is not).
    non_numeric_columns = [col for col in dataframe.columns if col not in numeric_labels]

    row_count = len(dataframe)
    constant_non_numeric_col_list = []
    for col in non_numeric_columns:
        modes = dataframe[col].mode()
        if modes.empty:
            # The column has no non-null value, so there is no mode to test.
            # It is already reported through null_col_list.
            continue
        fractional_presence = dataframe[col].eq(modes.iloc[0]).sum() / row_count

        if fractional_presence > mode_fraction_threshold:
            constant_non_numeric_col_list.append(col)

    print('Identified {} non-numeric features that have quasi-constant values'.format(
        len(constant_non_numeric_col_list)))

    return null_col_list, blank_space_col_list, constant_numeric_col_list, constant_non_numeric_col_list

In [50]:
# Run the bad-column checks on the thyroid DataFrame and keep each returned list
(
    null_col_list,
    blank_space_col_list,
    constant_numeric_col_list,
    constant_non_numeric_col_list,
) = find_bad_columns_function(thyroidDF)

Identified 0 features with atleast one null
Identified 0 features with at least one blank space
Identified 0 numeric features that have quasi-constant values
Identified 0 non-numeric features that have quasi-constant values


### 4.4 Data Cleansing
#### No bad columns
#### No null values
#### No categorical values to encode (one-hot for stat analysis)

### 4.5 Export DataFrame as csv

In [51]:
# Write the combined DataFrame to the processed-data folder.
# os.path.join inserts the path separator that plain string concatenation
# dropped (the old code wrote to "../01-Data/Processedthyroid.csv").
dataDirectory = "../01-Data/Processed"
csvFile = "thyroid.csv"
csvdataFile = os.path.join(dataDirectory, csvFile)
os.makedirs(dataDirectory, exist_ok=True)  # make sure the target folder exists before writing
# The default index=True is kept, so the row index is written as the first CSV column.
thyroidDF.to_csv(csvdataFile)