# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Locations of the two source CSV files.
AIR_POLLUTION_CSV = '/mnt/data/global_air_pollution_dataset.csv'
LUNG_CANCER_CSV = '/mnt/data/lung_cancer_prediction_dataset.csv'

# Read each file into its own DataFrame; every plot below uses one of them.
air_pollution_df = pd.read_csv(AIR_POLLUTION_CSV)
lung_cancer_df = pd.read_csv(LUNG_CANCER_CSV)

# Part 3 - 1
# Box plot of the 'Cancer Diagnosis' column of the lung-cancer dataset.
# NOTE(review): the title mentions "Deaths", the axis label mentions "Cases"
# and the plotted column is 'Cancer Diagnosis' -- confirm which quantity is
# intended and that this column is numeric.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(y=lung_cancer_df['Cancer Diagnosis'], ax=ax)
ax.set_title('Boxplot of Lung Cancer Deaths Distribution')
ax.set_ylabel('Lung Cancer Cases')  # overrides the label seaborn derives from the column
ax.grid(True)
plt.show()

# Part 3 - 2
# Histogram of the 'PM2.5 AQI Value' column of the air-pollution dataset,
# split into 20 bins.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(
    air_pollution_df['PM2.5 AQI Value'],
    bins=20,
    color='purple',
    edgecolor='black',
    alpha=0.7,
)
ax.set(
    xlabel='PM2.5 AQI Value',
    ylabel='Frequency',
    title='Histogram of PM2.5 AQI Values',
)
ax.grid(True)
plt.show()

# Part 3 - 3
# Filled kernel-density estimate of the 'Cancer Diagnosis' column.
# NOTE(review): the title says "Mortality Rate" while the x label says
# "Cases" and the column is 'Cancer Diagnosis' -- confirm the intended
# quantity before publishing this figure.
fig, ax = plt.subplots(figsize=(6, 4))
sns.kdeplot(lung_cancer_df['Cancer Diagnosis'], fill=True, color='red', ax=ax)
ax.set_title('Density Plot of Lung Cancer Mortality Rate')
ax.set_xlabel('Lung Cancer Cases')
ax.set_ylabel('Density')
ax.grid(True)
plt.show()

# Part 3 - 4
# Scatter of 100 draws from a normal distribution against 100 draws from a
# logistic distribution (both with NumPy's default parameters).
np.random.seed(42)  # fixed seed so the same points are drawn on every run
normal_values = np.random.normal(size=100)
logistic_values = np.random.logistic(size=100)

fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(normal_values, logistic_values, color='brown', alpha=0.6)
ax.set(
    title='Scatter Plot of Normal vs Logistic Distributions',
    xlabel='Normal Distribution',
    ylabel='Logistic Distribution',
)
ax.grid(True)
plt.show()

# Part 4 - 1
# Histogram of 'PM2.5 AQI Value' (20 bins) with a KDE curve drawn on top.
# stat='density' normalises the bars so the y-axis is a true density and
# agrees with the 'Density' axis label; histplot's default stat is 'count',
# which made the label incorrect.
plt.figure(figsize=(6, 4))
sns.histplot(
    air_pollution_df['PM2.5 AQI Value'],
    bins=20,
    kde=True,
    stat='density',
    color='blue',
    edgecolor='black',
    alpha=0.7,
)
plt.title('PM2.5 AQI Distribution with Density Overlay')
plt.xlabel('PM2.5 AQI Value')
plt.ylabel('Density')
plt.grid(True)
plt.show()

# Part 4 - 2
# Scatter of PM2.5 AQI against annual lung-cancer deaths, coloured by country.
#
# 'PM2.5 AQI Value' is read from air_pollution_df everywhere else in this
# file, so it is not expected to be a column of lung_cancer_df. When it is
# missing there, build a country-level table instead: the mean PM2.5 AQI per
# country joined with the mean annual deaths per country.
# NOTE(review): the join assumes both datasets spell country names the same
# way and that 'Annual_Lung_Cancer_Deaths' is numeric -- verify against the
# CSVs; countries present in only one dataset are dropped by the inner join.
pm25_col = 'PM2.5 AQI Value'
deaths_col = 'Annual_Lung_Cancer_Deaths'

if pm25_col in lung_cancer_df.columns:
    # Column already available: plot the lung-cancer rows unchanged.
    scatter_df = lung_cancer_df
else:
    country_pm25 = air_pollution_df.groupby('Country', as_index=False)[pm25_col].mean()
    country_deaths = lung_cancer_df.groupby('Country', as_index=False)[deaths_col].mean()
    scatter_df = country_pm25.merge(country_deaths, on='Country', how='inner')

plt.figure(figsize=(6, 4))
sns.scatterplot(data=scatter_df, x=pm25_col, y=deaths_col, hue='Country', palette='tab20')
plt.title('PM2.5 AQI vs. Annual Lung Cancer Deaths')
plt.xlabel('PM2.5 AQI Value')
plt.ylabel('Annual Lung Cancer Deaths')
plt.grid(True)
plt.show()

# Part 4 - 3
# Jittered strip plot of 'Years_of_Smoking' for each 'Gender' value, with one
# fixed colour per gender.
# The title now names the variables actually plotted; it previously referred
# to "Lung Cancer Stage", which this figure does not show.
# NOTE(review): the palette only defines 'Female' and 'Male' -- confirm the
# Gender column contains no other values.
plt.figure(figsize=(6, 4))
sns.stripplot(
    data=lung_cancer_df,
    x='Gender',
    y='Years_of_Smoking',
    jitter=True,
    hue='Gender',
    palette={'Female': '#5469f1', 'Male': '#d554f1'},
    alpha=0.6,
)
plt.title('Gender vs. Smoking Years')
plt.xlabel('Gender')
plt.ylabel('Years of Smoking')
plt.grid(True)
plt.show()

# Part 4 - 4
# Small multiples: one PM2.5 AQI histogram per 'Country' value, laid out
# three panels per row with shared x and y axes.
# NOTE(review): one panel is drawn for every distinct country -- check how
# many there are, as the figure grows with that count.
facet_hist_style = dict(bins=20, color='purple', edgecolor='black', alpha=0.7)

g = sns.FacetGrid(
    air_pollution_df,
    col='Country',
    col_wrap=3,
    sharex=True,
    sharey=True,
)
g.map_dataframe(sns.histplot, x='PM2.5 AQI Value', **facet_hist_style)
g.set_titles('{col_name}').set_axis_labels('PM2.5 AQI Value', 'Frequency')
# y > 1 lifts the overall title above the top row of panel titles.
g.fig.suptitle('PM2.5 AQI Distribution Across Countries', y=1.02)
plt.show()

