# Data exploration

## Imports

In [None]:
# Here are all of our imports used for data cleaning and manipulation

import pandas as pd
import numpy as np
import os
import shutil
import seaborn as sns
import matplotlib.pyplot as plt

## General exploration and some cleaning

In [None]:
# Load the metadata CSV (the version with corrected lesion_id values).
# The first CSV column is used as the DataFrame index.
rel_path = "../data/metadata/fixed_metadata.csv"
df_pad_ufes = pd.read_csv(filepath_or_buffer=rel_path, index_col=0)

In [None]:
# Order the rows by lesion_id so the renamed ids are easy to inspect
df_pad_ufes = df_pad_ufes.sort_values(by="lesion_id")

# Diagnostic codes grouped by whether they are labelled as skin cancer
cancer_conditions = ["BCC", "MEL", "SCC"]
no_cancer_conditions = ["ACK", "NEV", "SEK"]


def label_diagnostic(diagnostic):
    """Map a diagnostic code to its coarse label.

    Returns 'Skin cancer' for codes in ``cancer_conditions``,
    'Skin disease' for codes in ``no_cancer_conditions``, and None for
    anything else (including missing values).
    """
    if diagnostic in cancer_conditions:
        return "Skin cancer"
    if diagnostic in no_cancer_conditions:
        return "Skin disease"
    return None

# Derive the "cancer_label" column by running label_diagnostic over every "diagnostic" value,
# then display the resulting frame
df_pad_ufes["cancer_label"] = df_pad_ufes["diagnostic"].apply(label_diagnostic)
df_pad_ufes

In [None]:
# Keep only the rows that have an entry in the "gender" column, as that is what we are focusing on
df_pad_ufes = df_pad_ufes[df_pad_ufes["gender"].notna()]

# Drop rows with a repeated lesion_id, keeping the first occurrence of each
df_pad_ufes = df_pad_ufes.drop_duplicates(subset="lesion_id", keep="first")

# Count the rows per (gender, diagnostic) pair, then pivot so that diagnoses are rows
# and genders are columns, which is easier to read
gender_diagnostic_counts = (
    df_pad_ufes.groupby(["gender", "diagnostic"])
    .size()
    .reset_index(name="count")
)
gender_diagnostic_pivot = gender_diagnostic_counts.pivot(
    index="diagnostic", columns="gender", values="count"
)

print(gender_diagnostic_pivot)
print(df_pad_ufes.shape)

In [None]:
# Custom order for the x-axis: the three "Skin disease" diagnoses first, then the three "Skin cancer" ones
custom_order = [
    "ACK (Skin disease)", "NEV (Skin disease)", "SEK (Skin disease)",
    "BCC (Skin cancer)", "MEL (Skin cancer)", "SCC (Skin cancer)",
]

# Combine each diagnostic code with its cancer label so the x-axis ticks show both,
# then recount the rows per (gender, combined label) pair
df_pad_ufes["diagnostic_info"] = df_pad_ufes["diagnostic"] + " (" + df_pad_ufes["cancer_label"] + ")"
gender_diagnostic_counts = df_pad_ufes.groupby(["gender", "diagnostic_info"]).size().reset_index(name="count")

# The style must be set BEFORE the figure and axes are created: set_style only updates
# matplotlib's rcParams, which are read when the figure/axes are built. Calling it after
# plotting (as before) left this plot unstyled and only affected later plots.
sns.set_style("whitegrid")

# Grouped bar chart: one bar per gender for every diagnosis
plt.figure(figsize=(10, 6))
plots = sns.barplot(
    data=gender_diagnostic_counts,
    x="diagnostic_info",
    y="count",
    hue="gender",
    palette="flare",
    order=custom_order,
)
plt.title("Distribution of the sexes and their diagnoses", fontsize=15)
plt.xlabel("Diagnoses", fontsize=13)
plt.ylabel("Frequency", fontsize=13)
plt.xticks(rotation=45)
legend = plt.legend(title="Sexes")
plt.tight_layout()

# Save before show(), so the figure is written to disk while it is still the current figure
plt.savefig("../analysis/plots/diagnoses_distribution.png", dpi=300, format="png", bbox_inches="tight")
plt.show()

### Checking for the instances where the lesion_id is shared between multiple patients
\
This should now be 0: we fixed the issue by replacing the wrong values in `df_pad_ufes.csv`, so this check can be skipped.

In [None]:
# For every lesion_id, count its patient_id entries and collect the distinct patient_ids.
# The aggregations are passed as a list (not a set literal): a set is unordered, so the
# resulting column order was not deterministic, and a list is the documented form.
# NOTE(review): an earlier cell already drops duplicate lesion_ids, so "count" > 1 cannot
# occur on that de-duplicated frame; run this on the frame before de-duplication if the
# check is meant to find anything.
duplicate_counts = df_pad_ufes.groupby("lesion_id")["patient_id"].agg(["count", "unique"])

# Keep the lesion_ids that appear more than once AND map to more than one distinct patient_id
duplicates_with_different_patient_id = duplicate_counts[
    (duplicate_counts["count"] > 1) & (duplicate_counts["unique"].apply(len) > 1)
]

total_instances = duplicates_with_different_patient_id.shape[0]

print("total instances of duplicates with different patient_id's:", total_instances)
for lesion_id, data in duplicates_with_different_patient_id.iterrows():
    # Report each lesion_id that is shared between different patients
    print("lesion_id:", lesion_id)
    print("different patient_id's", data["unique"])

In [None]:
# Diagnostic codes per category and the two gender values used to split the data
cancer_conditions = ["BCC", "MEL", "SCC"]
no_cancer_conditions = ["ACK", "NEV", "SEK"]
genders = ["FEMALE", "MALE"]

# Creating the categories that correspond to the Eike Petersen paper:
# one independent copy of the rows for each (gender, cancer / no cancer) combination.
female_cancer = df_pad_ufes[(df_pad_ufes["diagnostic"].isin(cancer_conditions)) & (df_pad_ufes["gender"] == genders[0])].copy()
male_cancer = df_pad_ufes[(df_pad_ufes["diagnostic"].isin(cancer_conditions)) & (df_pad_ufes["gender"] == genders[1])].copy()

female_no_cancer = df_pad_ufes[(df_pad_ufes["diagnostic"].isin(no_cancer_conditions)) & (df_pad_ufes["gender"] == genders[0])].copy()
male_no_cancer = df_pad_ufes[(df_pad_ufes["diagnostic"].isin(no_cancer_conditions)) & (df_pad_ufes["gender"] == genders[1])].copy()

# patient_id columns of the four groups
female_cancer_id = female_cancer["patient_id"]
male_cancer_id = male_cancer["patient_id"]
female_no_cancer_id = female_no_cancer["patient_id"]
male_no_cancer_id = male_no_cancer["patient_id"]

# Row counts are summed explicitly. The previous `len(df_a + df_b)` performed an
# index-aligned element-wise addition of the DataFrames, which only equals the combined
# row count when the index labels are unique and disjoint, and does needless work.
total_women = len(female_cancer) + len(female_no_cancer)
total_men = len(male_cancer) + len(male_no_cancer)

print("Total of women in the dataset (after cleaning the data):", total_women)
print("Total of men in the dataset (after cleaning the data):", total_men)
print("Total:", total_women + total_men)

In [None]:
# Number of rows flagged as biopsied, then the number flagged as not biopsied
print(df_pad_ufes.loc[df_pad_ufes["biopsed"] == True].shape[0])
print(df_pad_ufes.loc[df_pad_ufes["biopsed"] == False].shape[0])

# Preview the first rows that were not biopsied
df_pad_ufes.loc[df_pad_ufes["biopsed"] == False].head()

In [None]:
# Print the number of rows for each Fitzpatrick scale value, 1 through 6 (one count per line)
for skin_type in (1.0, 2.0, 3.0, 4.0, 5.0, 6.0):
    print(len(df_pad_ufes[df_pad_ufes["fitspatrick"] == skin_type]))

In [None]:
# Rows whose Fitzpatrick value is 1 or 2; the shape is displayed as the cell output
pad1_and_2 = df_pad_ufes[df_pad_ufes["fitspatrick"].isin([1.0, 2.0])]
pad1_and_2.shape

In [None]:
# Rows whose Fitzpatrick value is 3, 4, 5 or 6; the shape is displayed as the cell output
pad_rest = df_pad_ufes[df_pad_ufes["fitspatrick"].isin([3.0, 4.0, 5.0, 6.0])]
pad_rest.shape

In [None]:
# For each skin-type group (1-2 and 3-6), print how many rows are labelled
# "Skin cancer" and how many are labelled "Skin disease"
for description, subset, label in (
    ("Fitzpatrick 1 and 2 with cancer", pad1_and_2, "Skin cancer"),
    ("Fitzpatrick 1 and 2 without cancer", pad1_and_2, "Skin disease"),
    ("Fitzpatrick 3, 4, 5, 6 with cancer", pad_rest, "Skin cancer"),
    ("Fitzpatrick 3, 4, 5, 6 without cancer", pad_rest, "Skin disease"),
):
    print(description, len(subset[subset["cancer_label"] == label]))