In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: ignored

In [None]:
# Human-readable lesion names keyed by the short diagnosis codes stored in the
# 'dx' column of the metadata. The values are displayed verbatim in chart
# legends and titles, so they must not carry stray whitespace.
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

# Malignancy flag per diagnosis code: True only for the codes listed in
# _CANCEROUS_CODES, False for every other known code.
# NOTE(review): 'akiec' is flagged as non-cancerous here - confirm that this is
# the intended classification for that category.
_CANCEROUS_CODES = frozenset({'mel', 'bcc'})

is_cancerous = {
    code: code in _CANCEROUS_CODES
    for code in ('nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df')
}

def verify_col(df: pd.DataFrame, col_name: str) -> bool:
    """
    Verify that a column is present in a dataframe.

    The check is a membership test against ``df.columns``, so no column data
    is retrieved. When the column is missing, a message naming it is printed.

    Parameters:
    - df: pd.DataFrame, the dataframe with the column.
    - col_name: str, the name of the column to check.

    Returns:
    - bool: True if the column is present, False if not.
    """
    if col_name in df.columns:
        return True

    # The missing name is shown once in repr form and once quoted, matching the
    # wording produced when the lookup error itself was interpolated.
    print(f"Error: {col_name!r}. The specified column '{col_name}' does not exist in the DataFrame.")
    return False

def reformat_col_name(col_name: str):
    """
    Reformat a column name for better readability.

    The first character is upper-cased, the rest lower-cased, and underscores
    are replaced with spaces (e.g. 'cell_type' -> 'Cell type').

    Parameters:
    - col_name: str, the name of the column to reformat.

    Returns:
    - str: The reformatted column name. If `col_name` is not a string, a
      message is printed and the value is returned unchanged.
    """
    # Explicit type check instead of a bare `except:`, which would also hide
    # unrelated failures such as KeyboardInterrupt.
    if not isinstance(col_name, str):
        print(f'Couldn\'t reformat: {col_name}')
        return col_name

    return col_name.capitalize().replace('_', ' ')


def create_donut_chart(df: pd.DataFrame, column_name: str, comp_title: str = '', color_dict=None,
                       top_n: int = 4):
    """
    Create and show a donut chart for a specific column in a DataFrame.

    The chart shows the share of each distinct (non-missing) value of the
    column. When the column has more than `top_n` distinct values, only the
    `top_n` most frequent ones are kept and the remainder is grouped into a
    single 'Other' slice.

    Parameters:
    - df: DataFrame
    - column_name: str, the column for which the donut chart will be created
    - comp_title: str, optional first title line; when empty, a default
      'Dataset distribution of ...' title is used
    - color_dict: dict, a dictionary mapping values to colors for consistent coloring
    - top_n: int, number of most frequent values that get their own slice (default 4)

    Returns:
    - None. The figure is displayed with plt.show(). Nothing is drawn if the
      column is missing from `df`.

    Raises:
    - ValueError: if `top_n` is smaller than 1.
    """
    # Nothing to plot if the column is absent (verify_col reports the problem)
    if not verify_col(df, column_name):
        return

    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n}")

    # Occurrences of each unique value, most frequent first (missing values excluded)
    column_counts = df[column_name].value_counts()

    # Keep the top_n values and fold everything else into 'Other'.
    # .iloc keeps the slicing positional whatever the dtype of the index is,
    # and the remainder is summed from the counts instead of re-scanning df.
    if len(column_counts) > top_n:
        other_count = column_counts.iloc[top_n:].sum()
        column_counts = column_counts.iloc[:top_n].copy()
        column_counts['Other'] = other_count

    # Create a figure and axis
    fig, ax = plt.subplots()

    # colors=None lets matplotlib fall back to its default color cycle
    colors = [color_dict.get(val) for val in column_counts.index] if color_dict else None

    # All pie text is drawn in bold white: the percentages sit on the colored
    # ring, while the category names are carried by the legend below.
    wedges, _texts, _autotexts = ax.pie(
        column_counts,
        labels=column_counts.index,
        autopct='%1.1f%%',
        startangle=90,
        wedgeprops=dict(width=0.4),
        pctdistance=0.85,
        labeldistance=1.1,
        textprops=dict(color="w", weight='bold'),
        colors=colors)

    # White disc in the center of the ring, drawn on the axes created above
    # rather than on whatever pyplot considers the current axes
    ax.add_artist(plt.Circle((0, 0), 0.70, fc='white'))

    col_name_reformatted = reformat_col_name(column_name)

    # Equal aspect ratio ensures that the pie is drawn as a circle.
    ax.axis('equal')
    ax.legend(wedges, column_counts.index,
              title=col_name_reformatted,
              loc="center left",
              bbox_to_anchor=(1, 0, 0.5, 1))

    # Title: dataset-wide wording by default, comparison wording otherwise
    if not comp_title:
        title = f'Dataset distribution of {col_name_reformatted}'
    else:
        title = f'{comp_title}\nDistribution of {col_name_reformatted}'

    ax.set_title(title)

    # Display the chart
    plt.show()

def comparison_donuts(df: pd.DataFrame, donut_column_name: str, comparison_column_name: str):
    """
    Create multiple donut charts for a specified donut_column, grouped by the values in comparison_column.

    One chart is drawn per distinct non-missing value of the comparison
    column. Every chart uses the same value-to-color mapping so the slices
    can be compared across charts.

    Parameters:
    - df: DataFrame
    - donut_column_name: str, the column for which the donut charts will be created
    - comparison_column_name: str, the column based on which the donut charts will be grouped

    Returns:
    - None. Nothing is drawn if either column is missing from `df`.
    """
    # Check both columns before touching the data. Both checks always run so
    # that every missing column gets reported, and BOTH must succeed.
    has_donut_col = verify_col(df, donut_column_name)
    has_comp_col = verify_col(df, comparison_column_name)
    if not (has_donut_col and has_comp_col):
        return

    # Create a color dictionary for consistent colors based on donut_column values.
    # 'Other' is included because create_donut_chart may add such a slice.
    palette = plt.cm.Paired
    unique_values = list(df[donut_column_name].unique())
    unique_values.append('Other')
    # Wrap around the palette so indexes past its size cycle through the
    # colors instead of all collapsing onto the last one
    color_dict = {value: palette(i % palette.N) for i, value in enumerate(unique_values)}

    # Missing values are skipped: `== NaN` never matches, so their subset
    # would be empty and produce an empty chart
    for val in df[comparison_column_name].dropna().unique():
        comp_df = df[df[comparison_column_name] == val]
        # str() so that non-string values (bools, numbers) can be shown in the title
        comp_title = f'Comparing: {reformat_col_name(comparison_column_name)}, value: {reformat_col_name(str(val))}'
        create_donut_chart(comp_df, donut_column_name, comp_title=comp_title, color_dict=color_dict)


In [None]:
# Load the HAM10000 metadata table
meta_data = pd.read_csv("HAM10000_metadata.csv")

# Add the readable lesion name and the malignancy flag, both derived from the
# diagnosis code in the 'dx' column
meta_data['cell_type'] = meta_data['dx'].map(lesion_type_dict.get)
meta_data['is_cancerous'] = meta_data['dx'].map(is_cancerous.get)

# Independent copy of the columns used by the plots, so later changes to
# either frame cannot affect the other
df = meta_data[['cell_type', 'is_cancerous', 'age', 'sex', 'localization']].copy()

# Overall distribution of lesion types in the dataset
create_donut_chart(df, 'cell_type')

In [None]:
# One donut chart of 'cell_type' for each distinct value of 'sex'
comparison_donuts(df, donut_column_name='cell_type', comparison_column_name='sex')

In [None]:
# Set Seaborn style
sns.set_style('whitegrid')

# Create a figure and axes
fig, axes = plt.subplots(figsize=(12, 8))

# Plot the histogram on the axes created above (passed explicitly instead of
# relying on pyplot's current-axes state)
ax = sns.histplot(data=df, x='age', bins=20, color='#3498db', edgecolor='black', ax=axes)

# Customize plot appearance
ax.set_title('Distribution of Ages', fontsize=20, fontweight='bold')
ax.set_xlabel('Age', fontsize=16)
ax.set_ylabel('Frequency', fontsize=16)
ax.tick_params(axis='both', labelsize=16)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Show the plot
plt.show()

In [None]:
# Create a figure and axes
fig, ax = plt.subplots(figsize=(12, 8))

# Get the order of localizations from most to least common
# (value_counts() sorts by count in descending order)
order = df['localization'].value_counts().index

# Reversed "Blues" palette: one shade per localization, darkest first, so the
# most common localization gets the darkest bar
blue_palette = sns.color_palette("Blues_r", n_colors=len(order))

# Plot the bar plot using Seaborn's countplot with specified order
# NOTE(review): newer seaborn releases warn when `palette` is passed without
# `hue` - confirm against the installed version.
sns.countplot(data=df, x='localization', ax=ax, order=order, palette=blue_palette)

# Customize plot appearance
ax.set_title('Distribution of Localization', fontsize=20, fontweight='bold')
ax.set_xlabel('Localization', fontsize=16)
ax.set_ylabel('Frequency', fontsize=16)
ax.tick_params(axis='both', labelsize=16)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Rotate x-axis labels for better readability; setp only restyles the
# existing labels instead of re-assigning their text
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Show the plot
plt.show()


In [None]:
# Kernel density estimate of age, one curve per lesion type
sns.set_style('whitegrid')

fig, axes = plt.subplots(figsize=(12, 8))
ax = sns.kdeplot(data=meta_data, x='age', hue='cell_type', ax=axes)

# Restrict the x axis to the 0-100 range
ax.set_xlim(0, 100)
ax.set_title('Lesion type by age')

plt.show()