In [16]:
import kagglehub
import pandas as pd

def age_to_agegroup(age):
    if age > 100:
        return 0;
    elif age > 90:
        return 1;
    elif age > 80:
        return 2;
    elif age > 70:
        return 3;    
    elif age > 60:
        return 4;
    elif age > 50:
        return 5;
    elif age > 40:
        return 6; 
    else:
        return 7;

def read_and_normalize_data():
    # Download most recent version of the individual-age-of-death-and-related-factors dataset from kaggle
    path = kagglehub.dataset_download("joannpineda/individual-age-of-death-and-related-factors")
    file = path + "/data.json";

    print("Data set is downloaded from kaggle-hub as:", file) 

    # Read data into panda
    kg = pd.read_json(file);

    # Convert pounds to kg and inches to centimeters
    print("Converting pounds to kg and inches to centimeters..") 
    kg.weight = kg.weight * 0.45359237;
    kg.height = kg.height * 2.54;
    
    # String values like 'f'/'m' (for female/male) and 'y'/'n' (for yes/no) to int8
    print("Converting chars to int..");
    kg.sex = kg.sex.astype('category').cat.codes;
    kg.smoker = kg.smoker.astype('category').cat.codes;
    kg.nic_other = kg.nic_other.astype('category').cat.codes;
    kg.occup_danger = kg.occup_danger.astype('category').cat.codes;
    kg.ls_danger = kg.ls_danger.astype('category').cat.codes;
    kg.cannabis = kg.cannabis.astype('category').cat.codes;
    kg.opioids = kg.opioids.astype('category').cat.codes;
    kg.other_drugs = kg.other_drugs.astype('category').cat.codes;
    kg.addiction = kg.addiction.astype('category').cat.codes;
    kg.diabetes = kg.diabetes.astype('category').cat.codes;
    kg.hds = kg.hds.astype('category').cat.codes;
    kg.asthma = kg.asthma.astype('category').cat.codes;
    kg.immune_defic = kg.immune_defic.astype('category').cat.codes;
    kg.family_cancer = kg.family_cancer.astype('category').cat.codes;
    kg.family_heart_disease = kg.family_heart_disease.astype('category').cat.codes;
    kg.family_cholesterol = kg.family_cholesterol.astype('category').cat.codes;

    # Round age to multiple of 5
    print("Rounding age up to 5 years");
    kg["age"] = kg.age.map(age_to_agegroup);

    return kg

In [17]:
# Read data
df = read_and_normalize_data();
df

Data set is downloaded from kaggle-hub as: /Users/pablomassizzo/.cache/kagglehub/datasets/joannpineda/individual-age-of-death-and-related-factors/versions/2/data.json
Converting pounds to kg and inches to centimeters..
Converting chars to int..
Rounding age up to 5 years


Unnamed: 0,age,weight,sex,height,sys_bp,smoker,nic_other,num_meds,occup_danger,ls_danger,...,addiction,major_surgery_num,diabetes,hds,cholesterol,asthma,immune_defic,family_cancer,family_heart_disease,family_cholesterol
0,1,99.336729,1,187.96,136,0,0,0,0,0,...,0,0,0,1,203,0,0,1,0,1
1,4,109.769354,1,185.42,111,0,0,0,0,0,...,1,0,0,0,228,0,0,0,0,0
2,7,89.357697,0,165.10,112,0,0,7,0,1,...,1,3,0,1,183,0,0,0,0,0
3,6,110.676538,0,175.26,127,0,0,1,1,2,...,0,2,0,1,228,0,0,0,0,0
4,1,83.007404,0,160.02,91,1,0,2,2,2,...,1,2,0,0,169,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,100.697506,0,170.18,120,0,0,0,1,0,...,0,3,0,0,207,0,0,0,0,0
9996,7,128.820233,0,175.26,138,1,0,0,2,1,...,1,3,0,1,270,0,0,0,1,0
9997,6,93.893621,0,152.40,127,1,0,0,2,2,...,1,5,1,1,194,0,0,1,1,1
9998,2,84.368181,0,167.64,137,0,0,4,0,2,...,0,2,0,0,170,0,0,0,0,0


In [18]:
for i, col in enumerate(df.columns):
    print(i, col)

0 age
1 weight
2 sex
3 height
4 sys_bp
5 smoker
6 nic_other
7 num_meds
8 occup_danger
9 ls_danger
10 cannabis
11 opioids
12 other_drugs
13 drinks_aweek
14 addiction
15 major_surgery_num
16 diabetes
17 hds
18 cholesterol
19 asthma
20 immune_defic
21 family_cancer
22 family_heart_disease
23 family_cholesterol


In [19]:
df.describe()

Unnamed: 0,age,weight,sex,height,sys_bp,smoker,nic_other,num_meds,occup_danger,ls_danger,...,addiction,major_surgery_num,diabetes,hds,cholesterol,asthma,immune_defic,family_cancer,family_heart_disease,family_cholesterol
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.0453,97.397848,0.4966,170.782234,126.4824,0.2006,0.2025,4.5905,0.9968,1.0056,...,0.3252,4.1709,0.1072,0.4652,199.7361,0.0782,0.0312,0.3513,0.1269,0.184
std,2.184382,16.159223,0.500013,9.898704,14.279162,0.400469,0.401883,5.512372,0.819425,0.8166,...,0.468473,2.964013,0.309383,0.498812,35.633212,0.268499,0.173867,0.4774,0.332878,0.387504
min,0.0,43.99846,0.0,134.62,67.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,83.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,86.18255,0.0,162.56,117.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,175.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,97.068767,0.0,170.18,126.0,0.0,0.0,3.0,1.0,1.0,...,0.0,4.0,0.0,0.0,199.0,0.0,0.0,0.0,0.0,0.0
75%,6.0,107.954984,1.0,177.8,136.0,0.0,0.0,7.0,2.0,2.0,...,1.0,6.0,0.0,1.0,223.0,0.0,0.0,1.0,0.0,0.0
max,7.0,166.014807,1.0,208.28,180.0,1.0,1.0,53.0,2.0,2.0,...,1.0,16.0,1.0,1.0,351.0,1.0,1.0,1.0,1.0,1.0


In [20]:
# shuffle rows
df = df.sample(frac=1)

# split into train and test (we can split the data the simple way because we know the set is shuffled)
train = df[:8000];
test  = df[8000:];

# use all attributes as input, except age (age has index 0)
train_x = train.iloc[:, 1:23]; 
test_x = test.iloc[:, 1:23];

# use age as target
train_y = train.age;
test_y = test.age;

In [21]:
# show training data
train_x

Unnamed: 0,weight,sex,height,sys_bp,smoker,nic_other,num_meds,occup_danger,ls_danger,cannabis,...,drinks_aweek,addiction,major_surgery_num,diabetes,hds,cholesterol,asthma,immune_defic,family_cancer,family_heart_disease
6211,96.615175,1,182.88,141,0,0,0,1,0,0,...,2,0,0,1,1,198,0,0,0,0
6511,101.151099,0,154.94,120,1,1,1,0,1,0,...,11,0,5,0,0,208,0,0,0,0
8925,88.904105,1,185.42,145,0,0,0,0,0,0,...,7,0,0,1,0,182,0,0,0,0
5958,102.511876,0,162.56,127,0,0,5,1,1,0,...,8,0,3,0,1,211,0,0,0,0
5940,92.532843,0,162.56,141,0,1,19,2,1,0,...,17,1,11,0,0,189,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,109.315761,1,177.80,145,0,0,4,2,1,0,...,16,0,4,0,1,226,0,1,0,0
4152,95.254398,1,167.64,118,0,0,1,0,2,0,...,4,1,8,0,0,195,0,0,1,0
3559,114.758870,1,177.80,142,0,1,10,1,2,0,...,12,1,10,0,1,236,0,0,0,0
590,78.925072,0,175.26,124,0,0,2,0,2,0,...,12,0,4,0,1,158,0,0,1,0


In [22]:
# show training target
train_y

6211    0
6511    2
8925    2
5958    3
5940    7
       ..
1013    6
4152    3
3559    5
590     3
9382    7
Name: age, Length: 8000, dtype: int64

In [23]:
from sklearn.neighbors import KNeighborsClassifier as KNN;
from sklearn.metrics import accuracy_score

# Train K-Nearest-Neighbor
tree = KNN(n_neighbors=50);
tree.fit(train_x, train_y);

# Test accuracy against test-sset
knn_accuracy = accuracy_score(tree.predict(test_x), test_y);
print("Accuracy of KNN with", tree.n_neighbors, "n_neighbours is", knn_accuracy );

Accuracy of KNN with 50 n_neighbours is 0.248


In [24]:
from sklearn.svm import SVC

# Train Support Vector Machine
svc = SVC()
svc.fit(train_x, train_y)

# Test accuracy against test-set
svc_accuracy = accuracy_score(svc.predict(test_x), test_y)
print("Accuracy of SVC is", svc_accuracy );

Accuracy of SVC is 0.228


In [1]:
import matplotlib.pyplot as plt

# Drinking, cholestorol and weigth show a negative correlation with life expectancy
plt.scatter(df.cholesterol, df.age, s=50, alpha=0.005, color='red');
plt.scatter(df.weight*2, df.age, s=50, alpha=0.005, color='blue');
plt.scatter(100 + df.drinks_aweek*10, df.age, s=50, alpha=0.005, color='green');

plt.show()

NameError: name 'df' is not defined

In [13]:
# df.insert(1, "bmi", 703*(df.weight/pow(kg.height,2)), True);
# df.insert(1, "eater", (df.bmi>30).astype(int), True);
# df.insert(1, "drinker", (df.drinks_aweek>20).astype(int), True);
# df.insert(1, "smoker", df.smoker.astype('category').cat.codes, True);
# df.insert(1, "sex", (df.sex == 'f').astype(int), True);
# df.insert(1, "age_group", df.age_group, True);

# cols = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28]
# df.drop(df.columns[cols], axis=1, inplace=True)

In [15]:
from mlxtend.plotting import plot_decision_regions
from sklearn.decomposition import PCA

# Convert to numpy arrays
numpy_x = train_x.to_numpy()
numpy_y = train_y.age.to_numpy()

# Transform into 2 dimentions so we can plot the regions
pca = PCA(n_components = 2)
numpy_x_2d = pca.fit_transform(numpy_x)

# Build and train KNN classifier 
tree = KNN(n_neighbors=2)
tree.fit(numpy_x_2d, numpy_y)

# Plot regions for first 50 
plot_decision_regions(numpy_x_2d[:50], numpy_y[:50], clf=tree);


AttributeError: 'Series' object has no attribute 'age'

In [None]:
import numpy as np
import sklearn 
# %matplotlib inline