In [None]:
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.proportion import proportions_ztest
from itertools import combinations
from scipy.stats import chi2_contingency
# Global plot defaults: high-resolution figures, seaborn theme with the "notebook" context.
plt.rcParams.update({'figure.dpi': 400})
sns.set_theme(context="notebook")

In [None]:
# Load the two CSV extracts used throughout the notebook
# (paths are relative to the notebook's working directory).
patients = pd.read_csv("experiment_data/prostate_tumours_patients.csv")
# low_memory=False parses the file in a single pass, which avoids chunk-wise
# mixed dtype inference.
tumours = pd.read_csv("experiment_data/prostate_tumours.csv", low_memory=False)

In [None]:
# List the available columns of both frames before merging them.
for frame in (patients, tumours):
    print(frame.columns)

In [None]:
# Read the 'z_ethnicity' sheet of the lookup workbook and display it for reference.
ethnicities = pd.read_excel("../../data/simulacrum_v2.1.0/Documents/all_z_lookup_tables.xlsx", sheet_name='z_ethnicity')

ethnicities

In [None]:
# (code, description) pairs for the ETHNICITY column.
_ETHNICITY_CODE_PAIRS = [
    ('0', 'White'),
    ('8', 'Other'),
    ('A', 'White British'),
    ('B', 'White Irish'),
    ('C', 'Any other White background'),
    ('CA', 'English'),
    ('CH', 'Turkish'),
    ('CP', 'Polish'),
    ('D', 'White and Black Caribbean'),
    ('E', 'White and Black African'),
    ('F', 'White and Asian'),
    ('G', 'Any other mixed background'),
    ('H', 'Asian Indian'),
    ('J', 'Asian Pakistani'),
    ('K', 'Asian Bangladeshi'),
    ('L', 'Any other Asian background'),
    ('M', 'Black Caribbean'),
    ('N', 'Black African'),
    ('P', 'Any other Black background'),
    ('R', 'Chinese'),
    ('S', 'Any other ethnic group'),
    ('X', 'Not known'),
    ('Z', 'Not specified'),
]

# Lookup used to translate ETHNICITY codes into readable descriptions.
ethnicity_mapper = dict(_ETHNICITY_CODE_PAIRS)

## 1. Distribution of Ethnicities

In [None]:
# Collapse the detailed ethnicity descriptions into a handful of broad groups
def categorize_ethnicity(ethnicity):
    """Return the broad ethnic group for an ethnicity description.

    Parameters
    ----------
    ethnicity : str or missing value
        A description produced by ``ethnicity_mapper`` (e.g. 'White British').

    Returns
    -------
    str
        One of 'White', 'Asian', 'Black', 'Mixed', 'Turkish', 'Other', or
        'Unknown' for anything unrecognised (including missing values and the
        'Not known' / 'Not specified' descriptions).
    """
    group_members = {
        'White': ('White', 'White British', 'White Irish',
                  'Any other White background', 'English', 'Polish'),
        'Asian': ('Asian Indian', 'Asian Pakistani', 'Asian Bangladeshi',
                  'Any other Asian background', 'Chinese'),
        'Black': ('Black Caribbean', 'Black African', 'Any other Black background'),
        'Mixed': ('White and Black Caribbean', 'White and Black African',
                  'White and Asian', 'Any other mixed background'),
        'Turkish': ('Turkish',),
        # 'Other' is the description of code '8' in ethnicity_mapper; without it
        # those patients would be labelled 'Unknown' rather than 'Other'.
        'Other': ('Any other ethnic group', 'Other'),
    }
    for group, members in group_members.items():
        if ethnicity in members:
            return group
    return 'Unknown'

# Map the ethnicity codes to their descriptions, then collapse them into broad groups
patients['ETHNICITY_DESC'] = patients['ETHNICITY'].map(ethnicity_mapper)
patients['ETHNIC_GROUP'] = patients['ETHNICITY_DESC'].apply(categorize_ethnicity)

# Pass x and y explicitly: how seaborn interprets a bare Series depends on the
# installed version, whereas this form always draws one bar per group.
patient_group_counts = patients['ETHNIC_GROUP'].value_counts()
plt.figure()
sns.barplot(x=patient_group_counts.index, y=patient_group_counts.values)
plt.title("Distribution of Ethnic Groups")
plt.xlabel("Ethnic Group")
plt.ylabel("No. of Patients")
plt.show()

In [None]:
# Inner join on PATIENTID: only patients that have at least one tumour record are kept.
combined_data = pd.merge(patients, tumours, how='inner', on='PATIENTID')
# Compare the shapes to see how the join changed the row count.
for frame in (patients, tumours, combined_data):
    print(frame.shape)

In [None]:
# Exclude records whose ethnic group could not be determined. The explicit
# .copy() makes the result an independent frame, so later cells can add
# columns to it without writing into a filtered view.
combined_data = combined_data[combined_data['ETHNIC_GROUP'] != 'Unknown'].copy()
combined_data.columns

In [None]:
# Columns carried into the ethnicity analyses below.
analysis_cols = ['ETHNIC_GROUP', 'AGE', 'GLEASON_PRIMARY', 'GLEASON_SECONDARY', 'GLEASON_TERTIARY', 'GLEASON_COMBINED', 'GRADE', 'STAGE_BEST', 'T_BEST', 'M_BEST', 'N_BEST']
# .copy() yields an independent frame; later cells add derived columns to it.
analysis_data = combined_data[analysis_cols].copy()

# Explicit x/y so the chart does not depend on how seaborn treats a bare Series.
analysis_group_counts = analysis_data['ETHNIC_GROUP'].value_counts()
plt.figure()
sns.barplot(x=analysis_group_counts.index, y=analysis_group_counts.values)
plt.title("Distribution of Ethnic Groups")
plt.xlabel("Ethnic Group")
# NOTE(review): these are rows of the patient/tumour merge — confirm each
# patient appears only once before reading the bars as patient counts.
plt.ylabel("No. of Patients")
plt.show()

## 2. Is there notable variation in age at diagnosis across ethnicities?

In [None]:
# Summary statistics of AGE within each ethnic group.
age_summary_stats = ['count', 'mean', 'std', 'median', 'min', 'max']
analysis_data.groupby('ETHNIC_GROUP')['AGE'].agg(age_summary_stats)

In [None]:
# Age distribution per ethnic group. Only the groups listed in `violin_order`
# are drawn; any other group present in the data (e.g. 'Turkish') is left out.
violin_order = ['Asian', 'Black', 'Mixed', 'Other', 'White']
sns.violinplot(
    data=analysis_data,
    x='ETHNIC_GROUP',
    y='AGE',
    order=violin_order,
    inner='box',
)
plt.title('Distribution of Age at Diagnosis by Ethnic Group')
plt.xlabel('Ethnic Group')
plt.ylabel('Age')
plt.xticks(rotation=45)
plt.tight_layout()
# plt.savefig('age_by_ethnicity_violin.png')
plt.show()

In [None]:
# One-way ANOVA: does mean AGE differ between ethnic groups?
age_formula = 'AGE ~ C(ETHNIC_GROUP)'
model = ols(formula=age_formula, data=analysis_data).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

In [None]:
# Post-hoc pairwise comparison (Tukey HSD) of age between ethnic groups.
# The formula-based model above discards rows with missing values on its own,
# but pairwise_tukeyhsd does not — drop them explicitly so a single missing
# AGE cannot turn the group means into NaN.
age_by_group = analysis_data[['AGE', 'ETHNIC_GROUP']].dropna()
posthoc = pairwise_tukeyhsd(age_by_group['AGE'], age_by_group['ETHNIC_GROUP'], alpha=0.05)

print(posthoc)

At the 5% significance level there is no statistical evidence that ethnicity affects the age at diagnosis, although the result is borderline (p ≈ 0.08).

Therefore, we cannot conclude that men from any particular ethnic group are diagnosed at an earlier or later age.


## 3. How does the Gleason score at diagnosis interact with ethnicity?

In [None]:
# Rows for which no combined Gleason score is recorded.
analysis_data.loc[analysis_data['GLEASON_COMBINED'].isna()]

In [None]:
# Show the four Gleason columns side by side (primary, secondary, tertiary, combined).
analysis_data[['GLEASON_PRIMARY', 'GLEASON_SECONDARY', 'GLEASON_TERTIARY', 'GLEASON_COMBINED']]

In [None]:
def assign_grade_group(row):
    """
    Create Tumour Grade Group based on the GLEASON SCORE as defined here:

     RESOURCE: https://www.cancerresearchuk.org/about-cancer/prostate-cancer/stages/grades

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or dict)
        Must provide 'GLEASON_PRIMARY' and 'GLEASON_SECONDARY'.

    Returns
    -------
    int or float
        Grade group 1-5, or NaN when either component is missing or not
        numeric, or when the sum does not match any rule below.
    """
    # Coerce to numbers first: values read as text (e.g. '3') would otherwise
    # be concatenated by '+' and then fail on the numeric comparisons.
    primary = pd.to_numeric(row['GLEASON_PRIMARY'], errors='coerce')
    secondary = pd.to_numeric(row['GLEASON_SECONDARY'], errors='coerce')
    if pd.isna(primary) or pd.isna(secondary):
        return np.nan

    gleason_sum = primary + secondary
    if gleason_sum <= 6:
        return 1
    if gleason_sum == 7:
        # 3+4 is grade group 2; any other split summing to 7 is grade group 3.
        return 2 if primary == 3 else 3
    if gleason_sum == 8:
        return 4
    if gleason_sum >= 9:
        return 5
    # Only reachable for sums that fall between the cases above (non-integers).
    return np.nan

# Numeric grade group for every row.
analysis_data['GRADE_GROUP'] = analysis_data.apply(assign_grade_group, axis=1)

# Readable label for each grade group (1-5).
grade_labels = dict(zip(
    range(1, 6),
    [
        'Grade 1 (Low)',
        'Grade 2 (Intermediate)',
        'Grade 3 (Intermediate)',
        'Grade 4 (High)',
        'Grade 5 (Very High)',
    ],
))

analysis_data['GRADE_LABEL'] = analysis_data['GRADE_GROUP'].map(grade_labels)

In [None]:
# Share of each grade label within every ethnic group (each row sums to 1).
grade_dist = pd.crosstab(
    index=analysis_data['ETHNIC_GROUP'],
    columns=analysis_data['GRADE_LABEL'],
    normalize='index',
)
# One colour per grade column, applied in column order.
risk_colormap = ['#2ca02c', '#ff7f0e', '#d62728', '#9467bd', '#8c564b']

ax = grade_dist.plot.bar(
    stacked=True,
    figsize=(12, 7),
    color=risk_colormap,
    edgecolor='white',
)
plt.title('Distribution of Prostate Cancer Grade by Ethnicity', fontsize=16)
plt.xlabel('Ethnic Group', fontsize=12)
plt.ylabel('Proportion of Patients', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Grade Group', bbox_to_anchor=(1.02, 1))
plt.tight_layout()
# plt.savefig('cancer_grade_groups_by_ethnicity.png')
plt.show()



In [None]:
# Chi-square test of independence between ethnic group and grade label.
contingency_table = pd.crosstab(
    index=analysis_data['ETHNIC_GROUP'],
    columns=analysis_data['GRADE_LABEL'],
)
chi2, p_value, _, _ = chi2_contingency(contingency_table)
print(f"Chi-Square Test for Grade vs. Ethnicity: p-value = {p_value:.4f}")
grade_conclusion = (
    "Conclusion: There is a statistically significant association between ethnicity and cancer grade."
    if p_value < 0.05
    else "Conclusion: There is no statistically significant association between ethnicity and cancer grade."
)
print(grade_conclusion)

In [None]:
# proportion of high-risk grade (grade group 4 or 5) by ethnic group
analysis_data['HIGH_RISK'] = analysis_data['GRADE_GROUP'].isin([4, 5])
# isin() marks rows without a grade group as False. Keep them out of the
# denominator, otherwise ungraded records are counted as "not high risk" and
# pull every group's proportion down.
graded_rows = analysis_data[analysis_data['GRADE_GROUP'].notna()]
high_risk_by_ethnicity = graded_rows.groupby('ETHNIC_GROUP')['HIGH_RISK'].mean()
plt.figure(figsize=(12, 6))
high_risk_by_ethnicity.plot(kind='bar', color='firebrick')
plt.title('Proportion of High-Risk Cancer Grade by Ethnicity', fontsize=16)
plt.xlabel('Ethnic Group', fontsize=14)
plt.ylabel('Proportion with High risk Gleason', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.3)
plt.tight_layout()
# plt.savefig('high_risk_cancer_grade_by_ethnicity.png')
plt.show()



### 4. How Does Vital Status Interact with Ethnicity?

In [None]:
def categorize_vital_status(code):
    """Translate a VITALSTATUS code into an outcome category.

    Returns 'Dead', 'Alive' or 'Censored (Lost/Embarked)' for the recognised
    codes, and 'Unknown' for every other value.
    """
    if code in ('D', 'D3', 'D4', 'D5'):
        return 'Dead'
    if code == 'A':
        return 'Alive'
    if code in ('X', 'X2', 'X4', 'X5'):
        return 'Censored (Lost/Embarked)'
    return 'Unknown'

# Derive the survival columns on a new frame instead of writing into (and
# in-place dropping rows from) a frame that was produced by filtering.
outcome_group = combined_data['VITALSTATUS'].apply(categorize_vital_status)
diagnosis_date = pd.to_datetime(combined_data['DIAGNOSISDATEBEST'])
vital_status_date = pd.to_datetime(combined_data['VITALSTATUSDATE'])

combined_data = combined_data.assign(
    OUTCOME_GROUP=outcome_group,
    DIAGNOSISDATEBEST=diagnosis_date,
    VITALSTATUSDATE=vital_status_date,
    # Follow-up time in months, approximating a month as 30 days.
    SURVIVAL_MONTHS=(vital_status_date - diagnosis_date).dt.days / 30,
    # EVENT is 1 for a death and 0 for every other outcome.
    EVENT=np.where(outcome_group == 'Dead', 1, 0),
).dropna(subset=['SURVIVAL_MONTHS'])  # rows without a follow-up time are removed from combined_data

survival_data = combined_data[['ETHNIC_GROUP', 'SURVIVAL_MONTHS', 'EVENT', 'OUTCOME_GROUP']].copy()

survival_data.head()



In [None]:
# Number of records in each outcome category.
survival_data['OUTCOME_GROUP'].value_counts()

In [None]:
# Share of each outcome within every ethnic group, ordered by the share of deaths.
outcome_distribution = pd.crosstab(
    index=survival_data['ETHNIC_GROUP'],
    columns=survival_data['OUTCOME_GROUP'],
    normalize='index',
).sort_values(by='Dead', ascending=False)

# stacked bar chart
ax = outcome_distribution.plot.bar(
    stacked=True,
    figsize=(12, 7),
    colormap='viridis',
    edgecolor='white',
)

plt.title('Final Outcome Distribution by Ethnic Group', fontsize=16)
plt.xlabel('Ethnic Group', fontsize=12)
plt.ylabel('Proportion of Patients', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Outcome', bbox_to_anchor=(1.02, 1))
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(16, 9))

# One colour per outcome category. 'Unknown' is included because
# categorize_vital_status can return it; a dict palette that lacks a hue level
# present in the data makes seaborn raise.
outcome_palette = {
    'Dead': '#d62728',
    'Alive': '#2ca02c',
    'Censored (Lost/Embarked)': '#ff7f0e',
    'Unknown': '#7f7f7f',
}

# NOTE(review): split=True with more than two hue levels presumably needs a
# recent seaborn release — confirm against the installed version.
sns.violinplot(
    data=survival_data,
    x='ETHNIC_GROUP',
    y='SURVIVAL_MONTHS',
    hue='OUTCOME_GROUP',
    palette=outcome_palette,
    split=True,
    inner='quartile',
    linewidth=1.5
)

plt.title('Distribution of Survival Months by Outcome and Ethnicity', fontsize=18)
plt.xlabel('Ethnic Group', fontsize=14)
plt.ylabel('Follow-up Time (Months)', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Final Outcome', bbox_to_anchor=(1.02, 1))
# Only the first 100 months are shown; longer follow-up is cut off by the axis limit.
plt.ylim(0, 100)
plt.tight_layout()
plt.show()

## PSA

In [None]:
# RESOURCE: https://www.cancerresearchuk.org/about-cancer/tests-and-scans/prostate-specific-antigen-psa-test

# PSA information was not found in the data






### 5. At what stage are patients from different ethnic groups diagnosed?

In [None]:
# Share of each raw STAGE_BEST code within every ethnic group.
stage_distribution = pd.crosstab(
    index=analysis_data['ETHNIC_GROUP'],
    columns=analysis_data['STAGE_BEST'],
    normalize='index',
)

stage_distribution



In [None]:
# DataFrame.plot opens its own figure, so the size is passed to it directly;
# a preceding plt.figure(...) would only leave an empty extra figure behind.
stage_distribution.plot(kind='bar', stacked=True, colormap='plasma', figsize=(14, 7))
plt.title('Cancer Stage Distribution by Ethnicity', fontsize=16)
plt.xlabel('Ethnic Group', fontsize=14)
plt.ylabel('Proportion', fontsize=14)
plt.legend(title='Cancer Stage', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, linestyle='--', alpha=0.3)
plt.tight_layout()
# plt.savefig('stage_by_ethnicity.png')
plt.show()

In [None]:
# Detailed STAGE_BEST codes, grouped by the broad label each one collapses to.
_STAGE_CODES_BY_LABEL = {
    'Stage 0': ['0', '0A', '0IS'],
    'Stage 1': [
        '1', '1A', '1A1', '1A2', '1A3', '1AE', '1AES', '1AEX', '1AX',
        '1B', '1B1', '1B2', '1BE',
        '1C', '1C1', '1C2', '1C3',
        '1E', '1EX', '1S', '1X',
    ],
    'Stage 2': [
        '2', '2A', '2A1', '2A2', '2AE', '2AEX', '2AS', '2AX',
        '2B', '2BEX', '2BX',
        '2C', '2E', '2S', '2X',
    ],
    'Stage 3': [
        '3', '3A', '3A1', '3A1i', '3A1ii', '3A2', '3AE', '3AES',
        '3Ai', '3Aii', '3AS', '3AX',
        '3B', '3BE', '3Bii', '3BX',
        '3C', '3C1', '3C2',
        '3D', '3E', '3ES', '3S',
    ],
    'Stage 4': [
        '4', '4A', '4AE', '4AES', '4AEXS', '4AS', '4AX',
        '4B', '4BE', '4BES', '4BEX', '4BEXS', '4BS', '4BX', '4BXS',
        '4C', '4E', '4ES', '4S', '4X',
    ],
    'Outdated or invalid code': ['5'],
    'Insufficient information': ['?'],
    'Unstageable': ['U'],
    'Not staged': ['X'],
    'RaiBinet stage': ['A', 'B', 'C'],
    'Unmapped value': ['L1', 'L2', 'M', 'M0', 'M1', 'M2', 'M3', 'MS'],
}

# Flat code -> label lookup, suitable for Series.map.
stage_mapper = {
    code: label
    for label, codes in _STAGE_CODES_BY_LABEL.items()
    for code in codes
}

In [None]:
# Collapse the detailed stage codes into broad labels; codes that are not keys
# of stage_mapper become NaN.
analysis_data['Cancer Stage']=analysis_data['STAGE_BEST'].map(stage_mapper)

In [None]:
# Share of each collapsed cancer stage within every ethnic group
stage_distribution = pd.crosstab(
    index=analysis_data['ETHNIC_GROUP'],
    columns=analysis_data['Cancer Stage'],
    normalize='index',
)

stage_distribution



In [None]:
# DataFrame.plot opens its own figure, so the size is passed to it directly;
# a preceding plt.figure(...) would only leave an empty extra figure behind.
stage_distribution.plot(kind='bar', stacked=True, colormap='plasma', figsize=(14, 7))
plt.title('Cancer Stage Distribution by Ethnicity', fontsize=16)
plt.xlabel('Ethnic Group', fontsize=14)
plt.ylabel('Proportion', fontsize=14)
plt.legend(title='Cancer Stage', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, linestyle='--', alpha=0.3)
plt.tight_layout()
# plt.savefig('stage_by_ethnicity.png')
plt.show()

In [None]:
# Chi-square test of independence between ethnic group and collapsed cancer stage.
stage_contingency = pd.crosstab(
    index=analysis_data['ETHNIC_GROUP'],
    columns=analysis_data['Cancer Stage'],
)
chi2_stage, p_stage, dof_stage, expected_stage = stats.chi2_contingency(stage_contingency)
print("\n\nChi-square test for Cancer Stage by Ethnicity:")
print(f"Chi-square statistic: {chi2_stage:.2f}")
print(f"p-value: {p_stage:.4f}")
stage_conclusion = (
    "There is a significant association between ethnicity and cancer stage at diagnosis."
    if p_stage < 0.05
    else "There is no significant association between ethnicity and cancer stage at diagnosis."
)
print(stage_conclusion)

In [None]:
# analyse the numbered cancer stages only
stage_by_ethnicity = pd.crosstab(
    analysis_data['ETHNIC_GROUP'],
    analysis_data['Cancer Stage'],
    normalize='index'
)

valid_stages = ['Stage 1', 'Stage 2', 'Stage 3', 'Stage 4'] # COULDN'T FIND LITERATURE ON RaiBinet THUS EXCLUDING IT
# Select only the stages that actually occur, so a stage that is absent from
# the data does not raise a KeyError.
critical_stages = [stage for stage in valid_stages if stage in stage_by_ethnicity.columns]
# The proportions were normalised over every stage label, so after this column
# selection a bar reaches 1 only if the group has no records outside Stage 1-4.
stage_by_ethnicity = stage_by_ethnicity[critical_stages]


# DataFrame.plot opens its own figure sized by `figsize`; no plt.figure() is needed.
ax = stage_by_ethnicity.plot(
    kind='bar',
    stacked=True,
    figsize=(14, 8),
    colormap='YlOrRd',  )

plt.title('Cancer Stage Distribution by Ethnicity at Diagnosis', fontsize=18, pad=20)
plt.xlabel('Ethnic Group', fontsize=16)
plt.ylabel('Proportion of Patients', fontsize=16)
plt.legend(title='Cancer Stage',
           title_fontsize=14,
           fontsize=12,
           bbox_to_anchor=(1.05, 1),
           loc='upper left')
plt.xticks(rotation=45, ha='right', fontsize=14)
plt.yticks(fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Print each segment's share as a percentage in the middle of the segment.
for container in ax.containers:
    ax.bar_label(container, labels=['%.2f%%' % (x*100) for x in container.datavalues],
                 label_type='center', fontsize=10)

plt.tight_layout()
# plt.savefig('detailed_stage_by_ethnicity.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# From here on analysis_data holds only the records whose stage is in valid_stages.
has_valid_stage = analysis_data['Cancer Stage'].isin(valid_stages)
analysis_data = analysis_data[has_valid_stage]

# Observed counts of ethnic group x cancer stage.
stage_contingency = pd.crosstab(
    index=analysis_data['ETHNIC_GROUP'],
    columns=analysis_data['Cancer Stage'],
)

def standardized_residuals(observed):
    """Return (observed - expected) / sqrt(expected) for a contingency table.

    The expected counts are the ones reported by ``stats.chi2_contingency``.
    The result has the same shape (and, for a DataFrame, the same labels) as
    ``observed``.
    """
    expected = stats.chi2_contingency(observed)[3]
    deviation = observed - expected
    return deviation / np.sqrt(expected)

# Residual of every ethnic group x stage cell, labelled like the contingency table.
std_residuals = standardized_residuals(stage_contingency)
std_residuals_df = pd.DataFrame(
    data=std_residuals,
    index=stage_contingency.index,
    columns=stage_contingency.columns,
)
std_residuals_df

In [None]:
# Heatmap of the residuals; the diverging colour map is centred on zero so
# over- and under-represented cells get opposite colours.
heatmap_options = dict(annot=True, cmap='RdBu_r', center=0, fmt='.3f')
plt.figure(figsize=(12, 8))
sns.heatmap(std_residuals_df, **heatmap_options)
plt.title('Standardized Residuals: Ethnicity vs. Cancer Stage', fontsize=16)
plt.xlabel('Cancer Stage', fontsize=14)
plt.ylabel('Ethnic Group', fontsize=14)
plt.tight_layout()
# plt.savefig('stage_residuals_heatmap.png', dpi=300)
plt.show()



### Conclusion
- A higher proportion of men in the **Mixed** group are diagnosed at an advanced stage (**Stage 4**) than men in any other group.

- More **Black** men are diagnosed at an advanced stage (**Stage 4**) compared to **White** and **Asian** men.


- More men in the **Black** group are diagnosed at Stage 3 than in any other group, followed by men in the **Other** group.


- There is no statistically significant association between ethnicity and age at diagnosis.

  
- There is no statistically significant association between ethnicity and Gleason score at the time of diagnosis.


## Next Analysis 

- GRADE - tumour grades
- PSA
- HER2 status of the tumour (human epidermal growth factor receptor 2 - this factor facilitates the growth of cancer).
- laterality
- BEHAVIOUR_ICD10_O2 

In [None]:
# Frequency of each raw GRADE code across all tumour records.
tumours['GRADE'].value_counts()

In [None]:
# TODO: which of these grades is worse than the others?
# Lookup from GRADE code to its description.
grades_mapper = dict(
    G="Borderline malignancy",
    G1="Well differentiated",
    G2="Moderately differentiated",
    G3="Poorly differentiated",
    G4="Undifferentiated / Anaplastic",
    GH="High",
    GI="Intermediate",
    GL="Low",
    GX="Grade cannot be assessed",
)



In [None]:
# Translate GRADE codes into descriptions; codes that are not keys of
# grades_mapper become NaN.
combined_data['GRADE_DESC'] = combined_data['GRADE'].map(grades_mapper)

In [None]:
# Frequency of each grade description (missing values are not counted by default).
combined_data['GRADE_DESC'].value_counts()

In [None]:
# Percentage of each grade description within every ethnic group.
grade_percentages = pd.crosstab(
    combined_data['ETHNIC_GROUP'],
    combined_data['GRADE_DESC'],
    normalize='index'
) * 100


# DataFrame.plot opens its own figure, so the size is passed to it directly;
# a preceding plt.figure(...) would only leave an empty extra figure behind.
grade_percentages.plot(
    kind='bar',
    stacked=True,
    colormap='viridis',
    figsize=(12, 8),
)

plt.title('Distribution of Prostate Cancer Grades by Ethnicity')
plt.xlabel('Ethnic Group')
plt.ylabel('Percentage')
plt.legend(title='Grade', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# plt.savefig("distributionOfGradesByEthnicity.png")
plt.show()



In [None]:
# Chi-square test of independence between ethnic group and grade description.
grade_counts = pd.crosstab(
    index=combined_data['ETHNIC_GROUP'],
    columns=combined_data['GRADE_DESC'],
)

chi2, p_value, dof, expected = chi2_contingency(grade_counts)
print(f"\nChi-square test p-value: {p_value:.4f}")

In [None]:
# TODO: which laterality value is a better or worse indicator than the others?
laterality_map = dict(
    B='Bilateral',
    L='Left-Sided',
    R='Right-Sided',
    M='Midline',
)
combined_data['LATERALITY_DESC'] = combined_data['LATERALITY'].map(laterality_map)


# Counts including missing values. A lot of the data is missing — could it be
# that medics are failing to record an important aspect?
combined_data['LATERALITY_DESC'].value_counts(dropna=False)



In [None]:
# Independent copy of the records that have a mapped laterality.
has_laterality = combined_data['LATERALITY_DESC'].notna()
lat_data = combined_data[has_laterality].copy()

lat_data['LATERALITY_DESC'].value_counts()

In [None]:
# Share of each laterality within every ethnic group, shown as stacked bars
dist_plot = pd.crosstab(
    index=lat_data['ETHNIC_GROUP'],
    columns=lat_data['LATERALITY_DESC'],
    normalize='index',
)

ax = dist_plot.plot.bar(
    stacked=True,
    figsize=(12, 7),
    colormap='Accent',
    edgecolor='white',
)

plt.title('Distribution of Tumor Laterality by Ethnicity', fontsize=16)
plt.xlabel('Ethnic Group', fontsize=12)
plt.ylabel('Proportion of Tumors', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Laterality', bbox_to_anchor=(1.02, 1))
plt.tight_layout()
plt.show()

# Chi-square test of independence on the raw counts.
contingency_table = pd.crosstab(
    index=lat_data['ETHNIC_GROUP'],
    columns=lat_data['LATERALITY_DESC'],
)
chi2, p_value, _, _ = chi2_contingency(contingency_table)

print("\n== Chi-Square Test for Overall Laterality vs. Ethnicity ==")
print(f"P-value: {p_value:.4f}")

laterality_conclusion = (
    "Conclusion: There is a statistically significant association between ethnicity and the distribution of tumor laterality."
    if p_value < 0.05
    else "Conclusion: No significant association found."
)
print(laterality_conclusion)

In [None]:

# Two-way split: 'Bilateral' versus every other laterality.
is_bilateral = lat_data['LATERALITY_DESC'] == 'Bilateral'
lat_data['DISEASE_EXTENT'] = np.where(is_bilateral, 'Bilateral', 'Unilateral/Localized')


# Share of bilateral records per ethnic group, largest first.
bilateral_prop = (
    lat_data['DISEASE_EXTENT']
    .eq('Bilateral')
    .groupby(lat_data['ETHNIC_GROUP'])
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(10, 6))
sns.barplot(
    x=bilateral_prop.index,
    y=bilateral_prop.values,
    palette='Reds_r',  # colour scale that implies severity
)
plt.title('Proportion of Patients with Bilateral Prostate Cancer by Ethnicity', fontsize=16)
plt.xlabel('Ethnic Group', fontsize=12)
plt.ylabel('Proportion with Bilateral Disease', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()



In [None]:
# Chi-square test on the bilateral vs. non-bilateral counts per ethnic group.
contingency_table_focused = pd.crosstab(
    index=lat_data['ETHNIC_GROUP'],
    columns=lat_data['DISEASE_EXTENT'],
)
chi2_focused, p_value_focused, _, _ = chi2_contingency(contingency_table_focused)

contingency_table_focused

In [None]:

# Report the outcome of the bilateral-disease test at the 5% level.
bilateral_conclusion = (
    "Conclusion: There is a statistically significant difference in the proportion of bilateral disease among ethnic groups."
    if p_value_focused < 0.05
    else "Conclusion: No significant difference found in the proportion of bilateral disease."
)
print(bilateral_conclusion)