In [2]:
import pandas as pd
import matplotlib.pyplot as plt

# Function to plot a donut chart for a given DataFrame and save it
def plot_and_save_donut_chart(df, title, hex_colors, filename):
    """Save a donut chart of unique-value counts grouped by variable type.

    For every column of ``df`` the number of distinct non-null values
    (``nunique``) is taken; these counts are summed per variable type and each
    type's share of the total becomes one slice of the donut.

    NOTE(review): slice sizes are sums of unique-value counts, not the number
    of columns of each type -- confirm this is the intended metric.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to summarise.
    title : str
        Title drawn above the chart.
    hex_colors : sequence of str or None
        Slice colours, forwarded to matplotlib's ``pie`` as ``colors``.
    filename : str or path-like
        Destination image, forwarded to ``Figure.savefig``.

    Returns
    -------
    None
        The chart is written to ``filename``. The figure is always closed,
        including when plotting or saving raises.
    """
    # Display names for the dtypes handled explicitly; every other dtype is
    # shown under its own name (e.g. 'bool', 'datetime64[ns]').
    dtype_labels = {'float64': 'Float', 'int64': 'Integer', 'object': 'Categorical'}

    # Key the lookup on str(dtype) so every group label is a plain string;
    # mixing raw dtype objects with strings makes the grouped index fragile.
    # df.dtypes / df.nunique() also cope with duplicated column names, which
    # a per-column df[column] lookup does not.
    data_info_df = pd.DataFrame({
        'Variable_Type': [dtype_labels.get(str(dtype), str(dtype)) for dtype in df.dtypes],
        'Count': df.nunique().tolist(),
    })

    # Aggregate counts by variable type
    agg_data_info = data_info_df.groupby('Variable_Type')['Count'].sum()

    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        ax.pie(
            agg_data_info,
            labels=agg_data_info.index,
            autopct='%1.1f%%',
            startangle=90,
            colors=hex_colors,
            textprops={'fontsize': 18, 'fontweight': 'bold'},
        )
        ax.set_title(title, fontsize=24, pad=30, fontweight='bold')
        # A white disc over the centre turns the pie into a donut.
        ax.add_artist(plt.Circle((0, 0), 0.70, fc='white'))
        ax.axis('equal')

        # bbox_inches='tight' keeps the outer labels from being cropped.
        fig.savefig(filename, bbox_inches='tight')
    finally:
        # Release the figure even on failure so repeated calls in a
        # long-lived kernel do not accumulate open figures.
        plt.close(fig)

# Locations of the three cleaned CSV exports
file_path_01, file_path_02, file_path_03 = (
    '../cleaned_data/new-cleaned_data-01.csv',
    '../cleaned_data/cleaning_data-02.csv',
    '../cleaned_data/cleaned_data-03.csv',
)

# Load each export into its own data frame, in dataset order
df_01, df_02, df_03 = (
    pd.read_csv(csv_path)
    for csv_path in (file_path_01, file_path_02, file_path_03)
)

# One palette per dataset: the four leading colours are rotated between
# datasets, the remaining six are identical in all three palettes.
_palette_tail = ["#F7D290", "#635B4E", "#B09471", "#AC5448", "#E7E6E6", "#FFFFFF"]
hex_colors_01 = ["#F7D290", "#2D3740", "#FE5C4F", "#E7943F"] + _palette_tail
hex_colors_02 = ["#FE5C4F", "#E7943F", "#F7D290", "#2D3740"] + _palette_tail
hex_colors_03 = ["#E7943F", "#F7D290", "#2D3740", "#FE5C4F"] + _palette_tail

# Render one donut chart per dataset and write each to its own PNG
for frame, chart_title, palette, out_png in (
    (df_01, 'Dataset 01', hex_colors_01, 'data_donut_chart_01.png'),
    (df_02, 'Dataset 02', hex_colors_02, 'data_donut_chart_02.png'),
    (df_03, 'Dataset 03', hex_colors_03, 'data_donut_chart_03.png'),
):
    plot_and_save_donut_chart(frame, chart_title, palette, out_png)


In [3]:
import pandas as pd
import matplotlib.pyplot as plt

# Function to plot a combined donut chart for all datasets and save it
def plot_and_save_combined_donut_chart(df_list, titles, hex_colors_list, filename):
    """Save one donut chart summarising variable types across several datasets.

    For every column of every frame in ``df_list`` the number of distinct
    non-null values (``nunique``) is taken; these counts are summed per
    variable type over all frames and each type's share of the total becomes
    one slice of the donut. The chart is titled 'Combined Datasets'.

    NOTE(review): slice sizes are sums of unique-value counts, not the number
    of columns of each type -- confirm this is the intended metric.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Datasets to summarise together.
    titles : sequence of str
        Kept for interface compatibility; not used for the chart. It no
        longer limits how many datasets are read (previously it was zipped
        with ``df_list``, so a shorter ``titles`` silently dropped datasets).
    hex_colors_list : sequence of sequence of str
        Only the first palette is used for the slices. When the sequence is
        empty, matplotlib's default colours are used.
    filename : str or path-like
        Destination image, forwarded to ``Figure.savefig``.

    Returns
    -------
    None
        The chart is written to ``filename``. The figure is always closed,
        including when plotting or saving raises.
    """
    # Display names for the dtypes handled explicitly; every other dtype is
    # shown under its own name (e.g. 'bool', 'datetime64[ns]').
    dtype_labels = {'float64': 'Float', 'int64': 'Integer', 'object': 'Categorical'}

    variable_types = []
    counts = []
    for df in df_list:
        # str(dtype) keeps every group label a plain string, so raw dtype
        # objects never end up mixed with strings in the grouped index.
        variable_types.extend(dtype_labels.get(str(dtype), str(dtype)) for dtype in df.dtypes)
        counts.extend(df.nunique().tolist())

    data_info_df = pd.DataFrame({'Variable_Type': variable_types, 'Count': counts})

    # Aggregate counts by variable type
    agg_data_info = data_info_df.groupby('Variable_Type')['Count'].sum()

    # Guard the lookup so an empty palette list falls back to the defaults
    # instead of raising IndexError.
    colors = hex_colors_list[0] if hex_colors_list else None

    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        ax.pie(
            agg_data_info,
            labels=agg_data_info.index,
            autopct='%1.1f%%',
            startangle=90,
            colors=colors,
            textprops={'fontsize': 18, 'fontweight': 'bold'},
        )
        ax.set_title('Combined Datasets', fontsize=24, pad=30, fontweight='bold')
        # A white disc over the centre turns the pie into a donut.
        ax.add_artist(plt.Circle((0, 0), 0.70, fc='white'))
        ax.axis('equal')

        # bbox_inches='tight' keeps the outer labels from being cropped.
        fig.savefig(filename, bbox_inches='tight')
    finally:
        # Release the figure even on failure so repeated calls in a
        # long-lived kernel do not accumulate open figures.
        plt.close(fig)

# Locations of the three cleaned CSV exports
file_path_01, file_path_02, file_path_03 = (
    '../cleaned_data/new-cleaned_data-01.csv',
    '../cleaned_data/cleaning_data-02.csv',
    '../cleaned_data/cleaned_data-03.csv',
)

# Load every export, keeping the frames together in dataset order, and also
# expose each frame under its own name
df_list = [
    pd.read_csv(csv_path)
    for csv_path in (file_path_01, file_path_02, file_path_03)
]
df_01, df_02, df_03 = df_list

# One title per dataset, in the same order as df_list
titles = [
    'Dataset 01',
    'Dataset 02',
    'Dataset 03',
]

# List of palettes; it holds a single palette here
hex_colors_list = [
    [
        "#F7D290", "#2D3740", "#FE5C4F", "#E7943F", "#F7D290",
        "#635B4E", "#B09471", "#AC5448", "#E7E6E6", "#FFFFFF",
    ],
]

# Render the combined donut chart and write it to a PNG
plot_and_save_combined_donut_chart(
    df_list=df_list,
    titles=titles,
    hex_colors_list=hex_colors_list,
    filename='combined_data_donut_chart.png',
)
