# Project Preliminary Analysis
### Authors: Matthew Falcione

In [1]:
import requests
import zipfile
import pandas as pd
import numpy as np

### Download and extract dataset

In [3]:
# locate the downloaded archive and unpack it next to itself
download_dir = 'C:\\Users\\matth\\OneDrive - Drexel University\\Other\\Documents\\temp\\'
cervical_cancer_dataset_path = f'{download_dir}cervial_cancer_dataset.zip'
# ZipFile opens read-only by default; the context manager closes the archive
with zipfile.ZipFile(cervical_cancer_dataset_path) as dataset_archive:
    dataset_archive.extractall(path=download_dir)

In [4]:
# load the cervical cancer dataset; the file marks unknown values with '?',
# so swap those placeholders for NaN right after reading
df = (
    pd.read_csv(download_dir + 'kag_risk_factors_cervical_cancer.csv')
    .replace('?', np.nan)
)

### Clean data and replace missing values

Convert all the columns that should be float or bool from string to their respective data type. Replace missing float column values with their median value, and replace the missing values in bool columns with False (assuming that if a value was left blank, it was False).

In [5]:
# update datatypes
float_columns = ["Number of sexual partners", "First sexual intercourse",
                 "Num of pregnancies", "Smokes (years)", "Smokes (packs/year)",
                 "Hormonal Contraceptives (years)", "IUD (years)", "STDs (number)", "STDs: Number of diagnosis",
                 "STDs: Time since first diagnosis", "STDs: Time since last diagnosis"]

# bool columns have binary values (1 or 0) so dtype set to bool
bool_columns = ['IUD', 'STDs', 'STDs:condylomatosis','STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis',
                'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis', 'STDs:pelvic inflammatory disease',
                'STDs:genital herpes','STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV','STDs:Hepatitis B',
                'STDs:HPV','Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller', 'Citology', 'Biopsy',
                'Hormonal Contraceptives', "Smokes"]
bool_type_dict = {col: bool for col in bool_columns}

# set all values to float to handle numbers with str formatting
df = df.astype(float)

# replace missing float column values with their medians
for col in float_columns:
    df[col] = df[col].fillna(df[col].median())

# replace missing bool values with False BEFORE casting to bool.
# NaN is truthy, so casting first would silently turn every missing value
# into True and leave nothing for fillna to replace afterwards.
df[bool_columns] = df[bool_columns].fillna(0.0)

# set boolean column values to bool (no NaN remains in these columns now)
df = df.astype(bool_type_dict)

In [6]:
# after investigation, no information on 'Dx' could be found so it was dropped
# errors='ignore' keeps this cell safe to re-run once 'Dx' is already gone
df = df.drop(columns='Dx', errors='ignore')

### Generate attribute summary table and correlation to `Biopsy` column

Calculate correlation of each column to `Biopsy` target value and display top 10 values that are most correlated.

In [7]:
# correlation of every attribute with the 'Biopsy' target (goal value)
correlation_dict = {col: df['Biopsy'].corr(df[col]) for col in df.columns}

# one row per attribute, indexed by attribute name; the trivial correlation of
# 'Biopsy' with itself is removed and the rest is sorted strongest-first
df_correlation = (
    pd.DataFrame(list(correlation_dict.items()),
                 columns=['Attribute', 'Correlation to Biopsy'])
    .set_index('Attribute')
    .drop(index='Biopsy')
    .sort_values(by='Correlation to Biopsy', ascending=False)
)

# index labels of the ten attributes most correlated with biopsy
df_correlation_top_ten_index = df_correlation.head(10).index

Boolean columns will have NaN values for min, max, mean, and std in the table.

In [8]:
# describe() gives min, max, mean, and std per attribute; transpose so each
# attribute is a row, and leave out the 'Biopsy' target itself
df_describe = df.describe(include='all').T.drop(index='Biopsy')

# attach each attribute's correlation to biopsy (aligned on the attribute index)
df_describe['Correlation to Biopsy'] = df_correlation['Correlation to Biopsy']

# show the summary for the ten most correlated attributes
summary_columns = ['min', 'max', 'mean', 'std', 'Correlation to Biopsy']
df_describe.loc[df_correlation_top_ten_index, summary_columns]

Unnamed: 0_level_0,min,max,mean,std,Correlation to Biopsy
Attribute,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Schiller,,,,,0.733204
Hinselmann,,,,,0.547417
Citology,,,,,0.327466
Dx:Cancer,,,,,0.160905
Dx:HPV,,,,,0.160905
Dx:CIN,,,,,0.113172
STDs (number),0.0,4.0,0.155012,0.529617,0.103153
STDs: Number of diagnosis,0.0,3.0,0.087413,0.302545,0.097449
Hormonal Contraceptives (years),0.0,30.0,2.035331,3.56704,0.094164
Smokes (years),0.0,37.0,1.201241,4.060623,0.061204


Check to see how many positive cancer biopsies there were (55 positive subjects).

In [9]:
# class balance of the target: number of negative vs. positive biopsies
biopsy_counts = df['Biopsy'].value_counts()
biopsy_counts

False    803
True      55
Name: Biopsy, dtype: int64

For each highly-correlated boolean column, show the count of the true and false values since they do not have a good measure of min, max, mean, or std.

In [10]:
# tally the True/False occurrences of each highly-correlated boolean column
boolean_columns_highest_correlation = ['Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Hinselmann', 'Schiller', 'Citology']
value_counts_series_list = [
    df[col].value_counts() for col in boolean_columns_highest_correlation
]

In [11]:
# stack the per-column counts under their column name as the outer key, then
# pivot the inner level so 'False' and 'True' become the table's columns
combined_value_counts_series = (
    pd.concat(value_counts_series_list, keys=boolean_columns_highest_correlation)
    .unstack(level=1)
)
combined_value_counts_series

Unnamed: 0,False,True
Dx:Cancer,840,18
Dx:CIN,849,9
Dx:HPV,840,18
Hinselmann,823,35
Schiller,784,74
Citology,814,44


---

### Feature Selection

#### Genetic Algorithm Approach

#### Embedded
* Tree-based (Random Forest Importance Approach)
* Lasso (L1)

### Dimensionality Reduction / Visualization

#### Linear
* PCA
* LDA
* SVD

#### Non-linear
* t-SNE / Uniform Manifold Approximation and Projection (UMAP)
* Kernel-PCA
* Autoencoder

#### Classification Evaluation Metrics
* <https://towardsdatascience.com/the-ultimate-guide-to-binary-classification-metrics-c25c3627dd0a>
* <https://towardsdatascience.com/the-5-classification-evaluation-metrics-you-must-know-aa97784ff226>

Since the classes are imbalanced, don't use accuracy. Use F1, AUROC, or MCC (Matthews correlation coefficient) instead.

False negatives are worse than false positives here, since a missed patient would not get screened further (be cautious).

#### Classification
* XGBoost/LightGBM

- Microsoft NNI / fastai