Let's practice loading and exploring some data! 🤓

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("https://github.com/mbrudd/csci290/raw/main/data/stroke.csv")

In [None]:
df.columns

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df["stroke"].unique()

In [None]:
df['ever_married'].unique()

In [None]:
df['Residence_type'].unique()

In [None]:
df["stroke"].value_counts()

In [None]:
df["stroke"].value_counts( normalize=True )

In [None]:
df["stroke"].mean()

In [None]:
df.plot(x="age", y="stroke", kind="scatter")

In [None]:
df["age"].describe()

In [None]:
# Bucket ages into four labelled groups. pd.cut uses right-closed intervals by
# default, so the bins are (0, 25], (25, 45], (45, 61] and (61, 100].
df["age_cat"] = pd.cut( df["age"], bins = [0,25,45,61,100], labels = ["Young","Adult","Middle-aged","Old"] )

In [None]:
df["age_cat"].value_counts()

In [None]:
df.groupby("age_cat", observed=True)["stroke"].mean()

In [None]:
df.groupby("heart_disease")["stroke"].mean()

In [None]:
df.groupby(["heart_disease","hypertension", "ever_married"])["stroke"].mean()

# Build and assess a simple classifier

In [None]:
# Naive baseline classifier: flag every row with age above 25 as a predicted
# stroke. The column holds booleans (True = predicted stroke).
df["prediction"] = df["age"] > 25

In [None]:
# import sys
# !{sys.executable} -m pip install scikit-learn

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
cm = confusion_matrix( df["stroke"], df["prediction"])
cm

In [None]:
ConfusionMatrixDisplay( cm, display_labels=["no stroke","stroke"]).plot()

In [None]:
tn, fp, fn, tp = confusion_matrix( df["stroke"], df["prediction"]).ravel()
tn, fp, fn, tp

In [None]:
import sklearn.metrics as metrics

In [None]:
metrics.recall_score( df["stroke"], df["prediction"])

In [None]:
tp / (tp + fn)

In [None]:
metrics.precision_score( df["stroke"], df["prediction"])

In [None]:
tp / (tp + fp)

In [None]:
def precision_vs_recall( threshold ):
    """Score the rule "predict a stroke when age > threshold".

    Args:
        threshold: Age cut-off; rows whose ``df["age"]`` is strictly greater
            than this value are predicted positive.

    Returns:
        A ``(precision, recall)`` tuple computed against ``df["stroke"]``.
    """
    # Keep the predictions local: writing them to df["prediction"] would
    # silently replace the column that other cells read, making their results
    # depend on which cell happened to run last.
    predicted = df["age"] > threshold
    actual = df["stroke"]
    # zero_division=0 keeps the 0.0 that scikit-learn already returns when no
    # row is predicted positive, without emitting a warning on each such call.
    precision = metrics.precision_score( actual, predicted, zero_division=0 )
    recall = metrics.recall_score( actual, predicted )
    return precision, recall

In [None]:
precision_vs_recall( 61 )

In [None]:
# Sweep every integer age cut-off from 0 through 81 and collect the
# (precision, recall) pair produced at each one.
thresholds = range(0,82)
scores = [ precision_vs_recall( cutoff ) for cutoff in thresholds ]
precisions = [ p for p, _ in scores ]
recalls = [ r for _, r in scores ]

In [None]:
pr = pd.DataFrame( {
    "threshold": thresholds,
    "precision": precisions,
    "recall": recalls
})

In [None]:
pr.tail()

In [None]:
pr.plot( x="threshold", y=["precision","recall"]);

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# stroke / no-stroke proportions the same in both splits, and a fixed seed
# makes the split (and every score computed from it) reproducible across runs.
train_set, test_set = train_test_split(df, test_size=0.2, stratify=df["stroke"], random_state=42)

In [None]:
train_set.columns

In [None]:
train_set = train_set[["gender","age","hypertension","heart_disease","smoking_status","stroke"]]

In [None]:
test_set = test_set[["gender","age","hypertension","heart_disease","smoking_status","stroke"]]

In [None]:
X_train = train_set[["gender","age","hypertension","heart_disease","smoking_status"]]
y_train = train_set["stroke"]

In [None]:
X_test = test_set[["gender","age","hypertension","heart_disease","smoking_status"]]
y_test = test_set["stroke"]

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.tree import plot_tree

In [None]:
# Split the feature columns by dtype. Selecting "number" and its complement
# (instead of the exact dtypes 'int64'/'float64'/'object') guarantees every
# column lands in exactly one group -- including text columns that pandas may
# load with a dedicated string dtype rather than 'object'. Columns missing
# from both groups would be dropped by the ColumnTransformer.
num_attributes = X_train.select_dtypes( include = 'number').columns
cat_attributes = X_train.select_dtypes( exclude = 'number').columns

In [None]:
# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones. With handle_unknown='ignore', a category not seen during
# fit is encoded as all zeros at transform time instead of raising.
trf = [ ('num', StandardScaler(), num_attributes),
       ('cat', OneHotEncoder( handle_unknown='ignore'), cat_attributes) ]
col_transform = ColumnTransformer( transformers = trf )

In [None]:
# Chain the preprocessing step ('pre') with a decision tree limited to depth 3
# ('clf') so both are fitted and applied together as one estimator.
pipeline = Pipeline( steps = [('pre', col_transform),
 ('clf', DecisionTreeClassifier(max_depth=3))])

In [None]:
cross_val_score( pipeline, X_train, y_train, cv=5, scoring='accuracy')

In [None]:
model = pipeline.fit( X_train, y_train )
model

In [None]:
predictions = pipeline.predict( X_test )

In [None]:
cm = confusion_matrix( y_test, predictions )

In [None]:
cm

In [None]:
plot_tree( pipeline.named_steps['clf'])