# Smoking is bad

In [None]:
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the raw smoking data set from CSV and preview its first rows.
df = pl.read_csv("data/smoking_data.csv")
df.head()

In [None]:
# Dimensions of the raw data as (rows, columns).
df.shape

In [None]:
# Summary statistics for every column of the raw data.
df.describe()  # the string columns do not give us much insight here

In [None]:
# Count the missing values in each column.
df.null_count()  # the salary column has many missing values; maybe we should drop it

In [None]:
# Add 0/1 indicator columns next to the string columns so that group means
# can be read directly as proportions:
#   gender_num  -> 0 for "female", 1 for any other value
#   smoker_num  -> 0 for "No",     1 for any other value
#   outcome_num -> 0 for "Dead",   1 for any other value
df_clean = df.with_columns(
    gender_num=pl.when(pl.col("gender") == "female").then(0).otherwise(1),
    smoker_num=pl.when(pl.col("smoker") == "No").then(0).otherwise(1),
    outcome_num=pl.when(pl.col("outcome") == "Dead").then(0).otherwise(1),
)

df_clean.head()

In [None]:
# Summary statistics restricted to the numeric columns used in the analysis.
df_clean.select("outcome_num", "age", "gender_num", "smoker_num").describe()

## Smoking analysis
Let's first check what the cousin did.

In [None]:
# Share of people still alive within each smoking group.
df_clean.group_by("smoker_num").agg(
    pl.col("outcome_num").mean().alias("prob_alive")
)

Okay, so we get the same results.

In [None]:
# Share of people still alive for every smoker/gender combination,
# ordered by gender so the two smoking groups sit next to each other.
(
    df_clean.group_by(["smoker_num", "gender_num"])
    .agg(pl.col("outcome_num").mean().alias("prob_alive"))
    .sort("gender_num")
)

The same pattern still holds: the probability of being alive given smoking is higher for both males and females.

Of course, we assume that more young people smoke than old people, and that young people are more likely to be alive.
So maybe we should look at the age distribution of the smokers.

In [None]:
# Histogram of the age column.
sns.histplot(data=df_clean, x="age")

In [None]:
# Bucket ages into brackets labelled by their exclusive upper bound
# (30, 40, 50, 60, 70); every age of 70 or more falls into the last
# bracket, which is labelled 100.
df_clean = df_clean.with_columns(
    age_interval=pl.when(pl.col("age") < 30)
    .then(30)
    .when(pl.col("age") < 40)
    .then(40)
    .when(pl.col("age") < 50)
    .then(50)
    .when(pl.col("age") < 60)
    .then(60)
    .when(pl.col("age") < 70)
    .then(70)
    .otherwise(100)
)
df_clean.head()

In [None]:
# Per age bracket: share still alive, share of smokers, and number of rows,
# ordered by bracket so the result can be plotted left to right.
df_grouped = (
    df_clean.group_by("age_interval")
    .agg(
        pl.col("outcome_num").mean().alias("prob_alive"),
        pl.col("smoker_num").mean().alias("prob_smoker"),
        pl.len().alias("group_count"),
    )
    .sort("age_interval")
)

In [None]:
# Draw the survival rate and the smoking rate per age bracket on shared axes.
# Each line gets an explicit label; without one the legend cannot tell the
# two curves apart.
sns.lineplot(data=df_grouped, x="age_interval", y="prob_alive", label="prob_alive")
sns.lineplot(data=df_grouped, x="age_interval", y="prob_smoker", label="prob_smoker")

# Customize the plot
plt.title('Probability of Survival and Smoking by Age Interval')
plt.xlabel('Age Interval')
plt.ylabel('Probability')
plt.xticks(rotation=45)  # Rotate x-labels if needed
# The original passed an undefined name `loc` here, which raised a NameError
# before the figure was shown; let matplotlib choose the placement instead.
plt.legend(loc="best")

# Show the plot
plt.show()

So we see that age affects both staying alive and smoking.

In [None]:
# Share still alive for every (age bracket, smoking status) combination.
df_by_age = df_clean.group_by(["age_interval", "smoker_num"]).agg(
    pl.col("outcome_num").mean().alias("prob_alive")
)
df_by_age

In [None]:
# One survival curve per smoking status across the age brackets.
sns.lineplot(
    data=df_by_age,
    x="age_interval",
    y="prob_alive",
    hue="smoker_num",
)

So it seems that when you are young, your probability of being alive is higher if you do not smoke, and there is a point at which this reverses.

In [None]:
# Second bracketing scheme: round every age to a multiple of ten
# (np.round of age / 10, scaled back up by 10).
df_clean = df_clean.with_columns(
    pl.col("age")
    .map_elements(lambda age: np.round(age / 10) * 10)
    .alias("age_int_2")
)

In [None]:
# Repeat the survival-by-smoking comparison using the rounded age brackets.
df_by_age = df_clean.group_by(["age_int_2", "smoker_num"]).agg(
    pl.col("outcome_num").mean().alias("prob_alive")
)
sns.lineplot(
    data=df_by_age,
    x="age_int_2",
    y="prob_alive",
    hue="smoker_num",
)