In [22]:
import pandas as pd
import plotly.express as px
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the normalized dataset and build an explorative profiling report for it.
normalized_df = pd.read_csv(filepath_or_buffer="normalized_data.csv")
normalized_df_profile = ProfileReport(
    normalized_df,
    title="Kanser Verisi Profili",
    explorative=True,
)

In [3]:
# Write the profiling report for the normalized data to an HTML file.
normalized_df_profile.to_file(
    output_file="reports/part1/veri_profili.html"
)

100%|██████████| 15/15 [00:00<00:00, 408.37it/s]<00:00, 57.39it/s, Describe variable: rate]        
Summarize dataset: 100%|██████████| 168/168 [00:08<00:00, 18.92it/s, Completed]                                
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.30s/it]
Render HTML: 100%|██████████| 1/1 [00:02<00:00,  2.14s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 138.41it/s]


In [4]:
# Load the saved regression comparison table and print it under a header line.
regression_df = pd.read_csv(filepath_or_buffer="regression_model_comparison.csv")
print(
    "\n📊 Phase 1: Regression Model Comparison (MAE & R²):",
    regression_df,
    sep="\n",
)


📊 Phase 1: Regression Model Comparison (MAE & R²):
                             Model       MAE        R2
0  RandomForestRegressor_Incidence  0.273954  0.996024
1           XGBRegressor_Incidence  0.944223  0.982497
2  RandomForestRegressor_Mortality  0.138801  0.996760
3           XGBRegressor_Mortality  0.508095  0.978162


In [5]:
# Dominant cancer types table; later cells join it on country_name and plot
# its highest_incidence_cancer / highest_mortality_cancer columns.
dom_cancer_df = pd.read_csv(filepath_or_buffer="dominant_cancer_types.csv")

In [6]:
# Interaction feature: element-wise product of air pollution and tobacco use.
normalized_df["pollution_smoking"] = (
    normalized_df["air_pollution"] * normalized_df["tobacco_use"]
)

# Columns that are averaged per country below.
feature_cols = [
    "air_pollution",
    "tobacco_use",
    "alcohol_use",
    "obesity_rate",
    "gdp_per_capita",
    "uhc_index",
    "population",
    "pollution_smoking",
]

# Mean of every feature per country, with country_name restored as a column.
agg_features = (
    normalized_df
    .groupby("country_name")[feature_cols]
    .mean()
    .reset_index()
)

# Left join: keep every country from the feature table, attaching the
# dominant cancer types where a matching country_name exists.
merged = agg_features.merge(dom_cancer_df, on="country_name", how="left")

In [7]:
# Load the filled dominant-cancer-type table and build its profiling report.
# NOTE(review): the cells visible here plot from dom_cancer_df, not from this
# filled table -- confirm that is intended.
dominant_cancer_df = pd.read_csv(
    filepath_or_buffer="dominant_cancer_types_all_filled_rf.csv"
)
dominant_cancer_df_profile = ProfileReport(
    dominant_cancer_df,
    title="Filled Dominant Cancer Types",
    explorative=True,
)

In [8]:
# Write the profiling report for the filled dominant-cancer-type table to HTML.
dominant_cancer_df_profile.to_file(
    output_file="reports/part2/dominant_cancer_types_all_filled_data_profile.html"
)

100%|██████████| 3/3 [00:00<00:00, 57719.78it/s], ?it/s, Describe variable: highest_mortality_cancer]
Summarize dataset: 100%|██████████| 12/12 [00:00<00:00, 84.97it/s, Completed]                        
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.70it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00, 13.51it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 827.28it/s]


In [9]:
# Rebuild the country-level table: aggregated features left-joined with the
# dominant cancer types (same merge as performed right after the aggregation).
merged = agg_features.merge(dom_cancer_df, on="country_name", how="left")

In [10]:
# 3D scatter of three risk factors, one point per row of `merged`, coloured by
# the highest-incidence cancer type; rows without that label are dropped first.
cancer_incidence_3d = px.scatter_3d(
    data_frame=merged.dropna(subset=["highest_incidence_cancer"]),
    title="Highest Cancer Incidence in Countries According to the Risk Factors",
    hover_name="country_name",
    color="highest_incidence_cancer",
    x="air_pollution",
    y="tobacco_use",
    z="obesity_rate",
)

# Save the interactive figure as an HTML file.
cancer_incidence_3d.write_html(file="reports/part2/cancer_incidence_3d.html")

In [11]:
# 3D scatter of three risk factors, one point per row of `merged`, coloured by
# the highest-mortality cancer type; rows without that label are dropped first.
cancer_mortality_3d = px.scatter_3d(
    data_frame=merged.dropna(subset=["highest_mortality_cancer"]),
    title="Highest Cancer Mortality in Countries According to the Risk Factors",
    hover_name="country_name",
    color="highest_mortality_cancer",
    x="air_pollution",
    y="tobacco_use",
    z="obesity_rate",
)

# Save the interactive figure as an HTML file.
cancer_mortality_3d.write_html(file="reports/part2/cancer_mortality_3d.html")

In [12]:
# Relative frequency of each value in highest_incidence_cancer
# (value_counts sorts descending and skips missing values by default).
counts = dom_cancer_df["highest_incidence_cancer"].value_counts(normalize=True)

# Number of individual categories to show; the rest is lumped into "Other".
top_n = 4
top_counts = counts.head(top_n)

# Share left over for "Other". Clamp at zero: when the top_n categories already
# cover every row, floating-point rounding can make the remainder slightly
# negative, and matplotlib's pie() rejects negative wedge sizes.
other_ratio = max(0.0, 1.0 - float(top_counts.sum()))

In [13]:
# Pie inputs: the top categories followed by one trailing "Other" slice.
labels = [*top_counts.index, "Other"]
sizes = [*top_counts.values, other_ratio]

In [14]:
# Draw the top-N + "Other" shares as a pie chart and save it as a PNG.
plt.figure(figsize=(8, 8))
plt.pie(
    x=sizes,
    labels=labels,
    startangle=140,
    autopct='%1.1f%%',
)
plt.title(label=f"Most Common Cancer Types (Top {top_n} + Other)")
plt.savefig(fname="reports/part2/most_common_cancer_types.png")

In [15]:
# Rename countries before they are used as choropleth locations.
country_map = {
    "Türkiye": "Turkey",
}
dom_cancer_df["country_name"] = dom_cancer_df["country_name"].replace(
    to_replace=country_map
)

In [16]:
# World choropleth coloured by each country's highest-mortality cancer type;
# countries are matched by name.
world_map_mortality_cancer = px.choropleth(
    data_frame=dom_cancer_df,
    title="Most Common Cancer Types (Mortality)",
    color="highest_mortality_cancer",
    color_discrete_sequence=px.colors.qualitative.Bold,
    locations="country_name",
    locationmode="country names",
)

In [17]:
# Use the "natural earth" projection and trim the margins (40 px kept on top).
# The chained call still evaluates to the figure, so the cell displays it.
world_map_mortality_cancer.update_geos(
    projection_type="natural earth"
).update_layout(
    margin=dict(r=0, t=40, l=0, b=0)
)

In [18]:
# Save the mortality choropleth as an HTML file.
world_map_mortality_cancer.write_html(
    file="reports/part2/world_map_mortality_cancer.html"
)

In [19]:
# World choropleth coloured by each country's highest-incidence cancer type;
# countries are matched by name.
world_map_incidence_cancer = px.choropleth(
    data_frame=dom_cancer_df,
    title="Most Common Cancer Types (Incidence)",
    color="highest_incidence_cancer",
    color_discrete_sequence=px.colors.qualitative.Bold,
    locations="country_name",
    locationmode="country names",
)

In [20]:
# For a full worldwide view: use the "natural earth" projection and trim the
# margins (40 px kept on top). The chained call still evaluates to the figure,
# so the cell displays it.
world_map_incidence_cancer.update_geos(
    projection_type="natural earth"
).update_layout(
    margin=dict(r=0, t=40, l=0, b=0)
)

In [21]:
# Save the incidence choropleth as an HTML file.
world_map_incidence_cancer.write_html(
    file="reports/part2/world_map_incidence_cancer.html"
)

In [23]:
# Annotated heatmap of the pairwise correlations between the feature columns
# (DataFrame.corr() with its default method).
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_df[feature_cols].corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Between Features")
# Explicit .png extension: a path ending in a bare "." only produces a file
# through matplotlib's default-format fallback, so the resulting name would
# depend on the savefig.format setting.
plt.savefig("reports/part2/correlation_between_features.png")