In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

# Create a directory to save the plots if it doesn't exist
save_dir = "cancer_plots"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    print(f"Created directory: {save_dir}")
else:
    print(f"Directory already exists: {save_dir}")

# Load the data from Excel
xlsx = pd.ExcelFile('FinalDataset COPY.xlsx')
df = pd.read_excel(xlsx, sheet_name='Sheet1')
df.columns = [str(col).strip() for col in df.columns]

def convert_and_plot(rate_col, prefix, title_prefix):
    """
    Convert the given cancer rate and related checking columns to numeric 
    and plot the relationships, saving each plot as an individual file.
    """
    relationship_cols = [ 
        '% ' + prefix + ' Patients Reporting Depression',
        '% ' + prefix + ' Patients With High Blood Pressure',
        '% ' + prefix + ' Patients With High Cholesterol',
        '% ' + prefix + ' Patients With Coronary Heart Disease',
        '% ' + prefix + ' Patients That Had a Medical Checkup between 2022-2023',
        '% ' + prefix + ' Who Currently Smoke',
        '% ' + prefix + ' Patients Who Are Obese (BMI>30)',
        '% ' + prefix + ' Patients Who Report Physical Inactivity',
        '% ' + prefix + ' Patients Who Report Binge Drinking',
        '% ' + prefix + ' Patients Who Report <7Hrs/Night Sleep',
        '% ' + prefix + ' Patients With History of Cancer Diagnosis'
    ]
    
    # Convert columns to numeric
    cols_to_convert = [rate_col] + relationship_cols
    for col in cols_to_convert:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        else:
            print('Missing column: ' + col)
    
    # Create individual plots and save each one
    for y_col in relationship_cols:
        if rate_col in df.columns and y_col in df.columns:
            plt.figure(figsize=(10, 6))
            sns.regplot(x=rate_col, y=y_col, data=df, scatter_kws={'alpha':0.5}, line_kws={'color':'red'})
            plt.title(title_prefix + ': ' + rate_col + ' vs ' + y_col)
            plt.xlabel(rate_col)
            plt.ylabel(y_col)
            plt.tight_layout()
            
            # Create a safe filename by replacing spaces and special characters
            safe_y_col = y_col.replace('%', 'Percent').replace(' ', '_').replace('/', '_')
            safe_rate_col = rate_col.replace(' ', '_')
            filename = f"{save_dir}/{title_prefix.replace(' ', '_')}_{safe_rate_col}_vs_{safe_y_col}.png"
            
            plt.savefig(filename, dpi=300)
            plt.close()  # Close the figure to free memory
            print(f"Saved: {filename}")
    
    print(f"All {title_prefix} plots saved successfully.")

# Breast Cancer
print("Plotting and saving Breast Cancer relationships")
convert_and_plot('Breast Cancer Rate', 'Breast Cancer', 'Breast Cancer')

# Lung, Prostate, and Melanoma
cancer_types = {
    'Lung': {'rate': 'Lung Cancer Rate', 'prefix': 'Lung Cancer'},
    'Prostate': {'rate': 'Prostate Cancer Rate', 'prefix': 'Prostate Cancer'},
    'Melanoma': {'rate': 'Melanoma Rate', 'prefix': 'Melanoma'}
}

# For Melanoma, check if 'Melanoma Rate' exists; if not, try to guess one based on available melanoma columns.
if 'Melanoma Rate' not in df.columns:
    melanoma_columns = [col for col in df.columns if 'Melanoma' in col]
    if melanoma_columns:
        possible_rate = [col for col in melanoma_columns if 'Rate' in col]
        if possible_rate:
            cancer_types['Melanoma']['rate'] = possible_rate[0]
        else:
            cancer_types['Melanoma']['rate'] = melanoma_columns[0]
    else:
        print('No melanoma related columns found.')

# Loop over lung, prostate, and melanoma
for cancer, params in cancer_types.items():
    print(f"Plotting and saving {cancer} cancer relationships")
    convert_and_plot(params['rate'], params['prefix'], cancer + " Cancer")
    
print("All graphs produced and saved successfully.")
print(f"You can find all the plots in the '{save_dir}' directory.")

Directory already exists: cancer_plots
Plotting and saving Breast Cancer relationships
Saved: cancer_plots/Breast_Cancer_Breast_Cancer_Rate_vs_Percent_Breast_Cancer_Patients_Reporting_Depression.png
Saved: cancer_plots/Breast_Cancer_Breast_Cancer_Rate_vs_Percent_Breast_Cancer_Patients_With_High_Blood_Pressure.png
Saved: cancer_plots/Breast_Cancer_Breast_Cancer_Rate_vs_Percent_Breast_Cancer_Patients_With_High_Cholesterol.png
Saved: cancer_plots/Breast_Cancer_Breast_Cancer_Rate_vs_Percent_Breast_Cancer_Patients_With_Coronary_Heart_Disease.png
Saved: cancer_plots/Breast_Cancer_Breast_Cancer_Rate_vs_Percent_Breast_Cancer_Patients_That_Had_a_Medical_Checkup_between_2022-2023.png
Saved: cancer_plots/Breast_Cancer_Breast_Cancer_Rate_vs_Percent_Breast_Cancer_Who_Currently_Smoke.png
Saved: cancer_plots/Breast_Cancer_Breast_Cancer_Rate_vs_Percent_Breast_Cancer_Patients_Who_Are_Obese_(BMI>30).png
Saved: cancer_plots/Breast_Cancer_Breast_Cancer_Rate_vs_Percent_Breast_Cancer_Patients_Who_Report_Phy

In [2]:
import os
print("Your current working directory is:")
print(os.getcwd())
print("\
Contents of the cancer_plots folder:")
print(os.listdir("cancer_plots")[:5])  # Show first 5 files as example
print(f"Total number of files: {len(os.listdir('cancer_plots'))}")

Your current working directory is:
/Users/matthewriedl
Contents of the cancer_plots folder:
['Lung_Cancer_Lung_Cancer_Rate_vs_Percent_Lung_Cancer_Patients_Reporting_Depression.png', 'Melanoma_Cancer_%_Melanoma_Patients_Reporting_Depression_vs_Percent_Melanoma_Patients_Who_Report_Binge_Drinking.png', 'Lung_Cancer_Lung_Cancer_Rate_vs_Percent_Lung_Cancer_Patients_That_Had_a_Medical_Checkup_between_2022-2023.png', 'Melanoma_Cancer_%_Melanoma_Patients_Reporting_Depression_vs_Percent_Melanoma_Patients_With_History_of_Cancer_Diagnosis.png', 'Lung_Cancer_Lung_Cancer_Rate_vs_Percent_Lung_Cancer_Patients_With_Coronary_Heart_Disease.png']
Total number of files: 41
