In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas

# Show every row when a frame/series is displayed (None disables the row limit).
pd.set_option("display.max_rows", None)

# Heart Disease Dataset

In [None]:
# Load the heart-statlog dataset and print a quick overview of it.
heart_df = pd.read_csv('./data/heart-statlog/data/heart-statlog_csv.csv')
print("COLUMNS", heart_df.columns)
# NOTE: DataFrame.size is the total number of cells (rows * columns), not the row count.
print("SIZE",heart_df.size)
# describe() has to be *called*: printing the bound method only shows its repr
# (which embeds the whole frame) instead of the summary statistics.
print(heart_df.describe())

In [None]:
# Show the dtype pandas assigned to each column of the heart dataset.
heart_df.dtypes

In [None]:
# Count missing values per column (0 means the column has no missing data).
# Author's observation: there is no missing data in the heart dataset.
heart_df.isna().sum()

In [None]:
# Report the youngest and oldest patient age found in the heart dataset.
ages = heart_df['age']
youngest, oldest = ages.min(), ages.max()
print("Min age: {}".format(youngest))
print("Max age: {}".format(oldest))

In [None]:
sns.set(style="whitegrid")
set1_palette = sns.color_palette("Set1", n_colors=5)
sns.set_palette(set1_palette)

# Bar chart of how many patients of each age fall into each `class` value.
age_by_class = pd.crosstab(heart_df['age'], heart_df['class'])
ax = age_by_class.plot(kind="bar", figsize=(20, 6))
ax.set_title('Heart Disease Frequency for Ages')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
sns.set(style="whitegrid")
sns.set_palette(sns.color_palette("Set2", n_colors=5))

# A 1x2 grid is enough for the two panels; a 2x2 grid would leave the
# bottom row of the figure empty.
fig, (class_ax, sex_ax) = plt.subplots(1, 2, figsize=(15, 6))
fig.subplots_adjust(wspace=0.4)

# Left panel: number of patients per target class.
sns.countplot(x="class", data=heart_df, ax=class_ax)
class_ax.set_title("Distribution of diseased and not diseased patients.")
# NOTE(review): the label assumes `class` is encoded as 0/1 — confirm against the CSV.
class_ax.set_xlabel("Target (0 = not diseased, 1= diseased)")

# Right panel: number of patients per sex.
sns.countplot(x="sex", data=heart_df, ax=sex_ax)
sex_ax.set_title("Distribution of male and female patients")
# NOTE(review): the label assumes `sex` is encoded as 0 = female, 1 = male — confirm.
sex_ax.set_xlabel("Gender (0 = female, 1= male)")
plt.show()

In [None]:
# Correlation heatmap - highly correlated features can be removed.
# Author's observation: with a threshold of 80%, no highly correlated features.
plt.figure(figsize=(12,12))
# Select numeric/bool columns explicitly: since pandas 2.0, DataFrame.corr()
# raises on non-numeric columns instead of silently dropping them.
heart_corr = heart_df.select_dtypes(include=["number", "bool"]).corr()
ax = sns.heatmap(heart_corr, annot=True, cmap='YlGnBu', annot_kws={"size": 10})
plt.yticks(rotation = 0)
# Extend the y-limits by half a cell at the bottom and at the top.
# NOTE(review): presumably a workaround for matplotlib releases that clip the
# first and last heatmap rows — confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

# Cervical Cancer Dataset

In [None]:
# Load the cervical-cancer dataset and print a quick overview of it.
print("Cervical Cancer Dataset")
cervical_df = pd.read_csv('./data/cervical-cancer/data/cervical-cancer_csv.csv')
print("COLUMNS", cervical_df.columns)
# NOTE: DataFrame.size is the total number of cells (rows * columns), not the row count.
print("SIZE",cervical_df.size)
# describe() has to be *called*: printing the bound method only shows its repr
# (which embeds the whole frame) instead of the summary statistics.
print(cervical_df.describe())

In [None]:
# Count missing values per column (0 means the column has no missing data).
# Author's observation: the cervical dataset has a lot of missing data.
cervical_df.isna().sum()

In [None]:
sns.set(style="whitegrid")
set1_palette = sns.color_palette("Set1", n_colors=5)
sns.set_palette(set1_palette)

# Bar chart of how many patients of each age fall into each `Biopsy` value.
age_by_biopsy = pd.crosstab(cervical_df['Age'], cervical_df['Biopsy'])
ax = age_by_biopsy.plot(kind="bar", figsize=(20, 6))
ax.set_title('Cervical Cancer Frequency for Ages')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
sns.set(style="whitegrid")
sns.set_palette(sns.color_palette("Set2", n_colors=5))

# A single axes is enough here; a 2x2 grid would leave three quarters of the
# figure empty.
fig, ax = plt.subplots(figsize=(7.5, 6))

# Number of patients per `Biopsy` value.
sns.countplot(x="Biopsy", data=cervical_df, ax=ax)
ax.set_title("Distribution of diseased and not diseased patients.")
# NOTE(review): the label assumes `Biopsy` is encoded as 0/1 — confirm against the CSV.
ax.set_xlabel("Target (0 = no cervical cancer, 1= cervical cancer)")
# Render explicitly, like the other plotting cells, so the cell does not echo
# the repr of the last expression as its output.
plt.show()

In [None]:
# Correlation heatmap - highly correlated features can be removed.
# NOTE(review): the original note claimed that, with a threshold of 80%, there are
# no highly correlated features — re-verify against the rendered heatmap.
plt.figure(figsize=(15,15))
# Select numeric/bool columns explicitly: since pandas 2.0, DataFrame.corr()
# raises on non-numeric columns instead of silently dropping them.
cervical_corr = cervical_df.select_dtypes(include=["number", "bool"]).corr()
ax = sns.heatmap(cervical_corr, annot=True, cmap='YlGnBu', annot_kws={"size": 6}, fmt='.2f')
plt.yticks(rotation = 0)
# Extend the y-limits by half a cell at the bottom and at the top.
# NOTE(review): presumably a workaround for matplotlib releases that clip the
# first and last heatmap rows — confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

## Cleaning dataset (handling missing values)

In [None]:
# Columns removed from the raw frame:
#   - too many missing values ("STDs: Time since first/last diagnosis"),
#   - information duplicated by another column (e.g. "IUD" and "IUD (years)"),
#   - only negative events in the dataset ("STDs:cervical condylomatosis" and "STDs:AIDS").
dropped_columns = [
    "STDs: Time since first diagnosis",
    "STDs: Time since last diagnosis",
    "STDs",
    "IUD",
    "Smokes",
    "Hormonal Contraceptives",
    "STDs:AIDS",
    "STDs:cervical condylomatosis",
]

# Columns whose missing values are replaced by 0. (occurrence lower than 2%).
zero_filled_columns = [
    "STDs:condylomatosis",
    "STDs:vaginal condylomatosis",
    "STDs:vulvo-perineal condylomatosis",
    "STDs:syphilis",
    "STDs:pelvic inflammatory disease",
    "STDs:genital herpes",
    "STDs:molluscum contagiosum",
    "STDs:HIV",
    "STDs:Hepatitis B",
    "STDs:HPV",
]

# Columns whose missing values are replaced by the column median.
median_filled_columns = [
    "Number of sexual partners",
    "First sexual intercourse",
    "Num of pregnancies",
    "Smokes (years)",
    "Smokes (packs/year)",
    "Hormonal Contraceptives (years)",
    "IUD (years)",
    "STDs (number)",
]

clean_cervical_df = cervical_df.drop(columns=dropped_columns)

# fillna with a {column: value} mapping only touches the listed columns.
clean_cervical_df = clean_cervical_df.fillna(dict.fromkeys(zero_filled_columns, 0.))

# Medians are computed once, from the non-missing values of each column
# (median() skips NaN by default), then used as the per-column fill value.
column_medians = clean_cervical_df[median_filled_columns].median().to_dict()
clean_cervical_df = clean_cervical_df.fillna(column_medians)


clean_cervical_df

In [None]:
# Re-count missing values per column on the cleaned frame (0 means no missing data);
# the columns imputed during cleaning are expected to report 0 now.
clean_cervical_df.isna().sum()

In [None]:
# Correlation heatmap of the cleaned frame - highly correlated features can be removed.
# NOTE(review): the original note claimed that, with a threshold of 80%, there are
# no highly correlated features — re-verify against the rendered heatmap.
plt.figure(figsize=(15,15))
# Select numeric/bool columns explicitly: since pandas 2.0, DataFrame.corr()
# raises on non-numeric columns instead of silently dropping them.
clean_cervical_corr = clean_cervical_df.select_dtypes(include=["number", "bool"]).corr()
ax = sns.heatmap(clean_cervical_corr, annot=True, cmap='YlGnBu', annot_kws={"size": 6}, fmt='.2f')
plt.yticks(rotation = 0)
# Extend the y-limits by half a cell at the bottom and at the top.
# NOTE(review): presumably a workaround for matplotlib releases that clip the
# first and last heatmap rows — confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

## Save cleaned data

In [None]:
# Write the cleaned frame to CSV without the row index.
cleaned_csv_path = './data/cervical-cancer/data/clean_cervical-cancer_csv.csv'
clean_cervical_df.to_csv(cleaned_csv_path, index=False)