In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

## FGNet-LOPO -- Hierarchical Model

In [13]:
df_fgnet = pd.read_csv("data/FGNet-LOPO.csv")
df_fgnet["ageclass"] = df_fgnet.age.apply(
    lambda r: 0 if r < 18 else 1
).astype(int)

print(df_fgnet.columns)
print(f' <18: {df_fgnet[df_fgnet.age < 18].shape[0]}\n>=18: {df_fgnet[df_fgnet.age >= 18].shape[0]}')
print(df_fgnet[["ageclass"]].value_counts())
df_fgnet[["age", "ageclass"]].sample(20)

Index(['b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8', 'b9', 'b10',
       ...
       'b104', 'b105', 'b106', 'b107', 'b108', 'b109', 'age', 'ID',
       'Gender_0M_1F', 'ageclass'],
      dtype='object', length=113)
 <18: 640
>=18: 362
ageclass
0           640
1           362
dtype: int64


Unnamed: 0,age,ageclass
57,30,1
920,1,0
573,13,0
425,14,0
103,1,0
439,15,0
867,8,0
455,19,1
309,34,1
604,12,0


In [46]:
def df_shape_table(*df_dicts):
    title_str = f"{'DataFrame':>15} | {'Shape':15}"
    print(title_str)
    for df_dict in df_dicts:
        print(f"{''.join(['-'] * 25):^33}")
        for name, df in df_dict.items():
            print(f"{name:>15} | { str(df.shape) :<15}")

In [47]:
df_yng = df_fgnet.iloc[df_fgnet[df_fgnet.ageclass == 0].index, :]
df_old = df_fgnet.iloc[df_fgnet[df_fgnet.ageclass == 1].index, :]

X = df_fgnet.drop(["age", "ID", "Gender_0M_1F", "ageclass"], axis=1)
X_yng = df_yng.drop(["age", "ID", "Gender_0M_1F", "ageclass"], axis=1)
X_old = df_old.drop(["age", "ID", "Gender_0M_1F", "ageclass"], axis=1)

y = df_fgnet[["ID", "age", "ageclass", "Gender_0M_1F"]]
y_yng = df_yng[["ID", "age", "ageclass", "Gender_0M_1F"]]
y_old = df_old[["ID", "age", "ageclass", "Gender_0M_1F"]]

df_shape_table({'X': X, 'X Young': X_yng, 'X Old': X_old},
               {'y': y, 'y Young': y_yng, 'y Old': y_old})

      DataFrame | Shape          
    -------------------------    
              X | (1002, 109)    
        X Young | (640, 109)     
          X Old | (362, 109)     
    -------------------------    
              y | (1002, 4)      
        y Young | (640, 4)       
          y Old | (362, 4)       


In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
X_yng_train, X_yng_test, y_yng_train, y_yng_test = train_test_split(X_yng, y_yng, test_size=.2)
X_old_train, X_old_test, y_old_train, y_old_test = train_test_split(X_old, y_old, test_size=.2)

df_shape_table(
    {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test},
    {'X_yng_train': X_yng_train, 'X_yng_test': X_yng_test, 'y_yng_train': y_yng_train, 'y_yng_test': y_yng_test},
    {'X_old_train': X_old_train, 'X_old_test': X_old_test, 'y_old_train': y_old_train, 'y_old_test': y_old_test}
)

      DataFrame | Shape          
    -------------------------    
        X_train | (801, 109)     
         X_test | (201, 109)     
        y_train | (801, 4)       
         y_test | (201, 4)       
    -------------------------    
    X_yng_train | (512, 109)     
     X_yng_test | (128, 109)     
    y_yng_train | (512, 4)       
     y_yng_test | (128, 4)       
    -------------------------    
    X_old_train | (289, 109)     
     X_old_test | (73, 109)      
    y_old_train | (289, 4)       
     y_old_test | (73, 4)        


In [58]:
y_test.head()

Unnamed: 0,ID,age,ageclass,Gender_0M_1F
77,6,69,1,0
647,53,12,0,0
37,3,47,1,1
391,32,35,1,1
973,80,8,0,0


In [62]:
model_metrics = {}

for i in range(10):
    model_metrics[i] = {
        "clf_true": [],
        "clf_pred": [],
        "reg_true": [],
        "reg_pred": [],
        "total": 0,
        "score": 0,
        "error": 0.
    }
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
    X_yng_train, X_yng_test, y_yng_train, y_yng_test = train_test_split(X_yng, y_yng, test_size=.2)
    X_old_train, X_old_test, y_old_train, y_old_test = train_test_split(X_old, y_old, test_size=.2)
    
    svm = SVC(C=1000, gamma=.01, kernel='rbf').fit(X_train, y_train["ageclass"])
    reg_yng = Ridge(alpha=.1).fit(X_yng_train, y_yng_train["age"])
    reg_old = Ridge(alpha=.05).fit(X_old_train, y_old_train["age"])
    
    for idx in X_test.index:
        row = X_test.loc[idx].values.reshape(1, -1)
        
        age_true = y_test.loc[idx, "age"]
        ageclass_true = y_test.loc[idx, "ageclass"]
        ageclass_pred = svm.predict(row)
        
        if ageclass_pred not in [0, 1]:
            raise ValueError("ageclass must be either 0 or 1")
            
        age_pred = reg_yng.predict(row) if ageclass_pred == 0 else reg_old.predict(row)
        
        model_metrics[i]["error"] += abs(age_pred - age_true)
        model_metrics[i]["score"] += 1 if np.round(age_pred) == age_true else 0
        model_metrics[i]["total"] += 1
        model_metrics[i]["clf_true"].append(ageclass_true)
        model_metrics[i]["clf_pred"].append(ageclass_pred)
        model_metrics[i]["reg_true"].append(age_true)
        model_metrics[i]["reg_pred"].append(age_pred)
        
    print(model_metrics[i]["error"] / model_metrics[i]["total"])

[3.47152899]
[3.77524155]
[3.8475331]
[3.81995785]
[3.36518051]
[3.2301345]
[3.81424444]
[3.66751503]
[3.77747745]
[3.79034929]


### Classification/Regression Metrics

In [82]:
from sklearn import metrics as m


def calculate_metrics(model_metrics):
    clf_metrics = {
        'Accuracy': [],
        'Precision': [],
        'Recall': [],
        'F1 Score': [],
        'Jaccard': [],
    }

    reg_metrics = {
        "MAE": [],
        "MSE": [],
        "RMSE": [],
        "R2": [],
        "MAPE": [],
        "Median AE": [],
        "Max Error": [],
    }

    for epoch, metrics in model_metrics.items():
        clf_true, clf_pred = metrics["clf_true"], metrics["clf_pred"]
        reg_true, reg_pred = metrics["reg_true"], metrics["reg_pred"]
        
        clf_metrics["Accuracy"].append(m.accuracy_score(clf_true, clf_pred))
        clf_metrics["Precision"].append(m.precision_score(clf_true, clf_pred))
        clf_metrics["Recall"].append(m.recall_score(clf_true, clf_pred))
        clf_metrics["F1 Score"].append(m.f1_score(clf_true, clf_pred))
        clf_metrics["Jaccard"].append(m.jaccard_score(clf_true, clf_pred))
        
        reg_metrics["MAE"].append(m.mean_absolute_error(reg_true, reg_pred))
        reg_metrics["MSE"].append(m.mean_squared_error(reg_true, reg_pred))
        reg_metrics["RMSE"].append(m.mean_squared_error(reg_true, reg_pred, squared=False))
        reg_metrics["R2"].append(m.r2_score(reg_true, reg_pred))
        reg_metrics["MAPE"].append(m.mean_absolute_percentage_error(reg_true, reg_pred))
        reg_metrics["Median AE"].append(m.median_absolute_error(reg_true, reg_pred))
        reg_metrics["Max Error"].append(m.max_error(reg_true, reg_pred))
        
    clf_metrics = pd.DataFrame(clf_metrics).round(4)
    reg_metrics = pd.DataFrame(reg_metrics).round(4)
    
    return clf_metrics, reg_metrics


clf_metrics, reg_metrics = calculate_metrics(model_metrics)

In [83]:
reg_metrics

Unnamed: 0,MAE,MSE,RMSE,R2,MAPE,Median AE,Max Error
0,3.4715,22.1504,4.7064,0.8765,498908600000000.0,2.505,16.0771
1,3.7752,45.365,6.7354,0.7734,361967700000000.0,2.2793,48.182
2,3.8475,42.1088,6.4891,0.7348,291316200000000.0,2.0616,37.6786
3,3.82,33.9145,5.8236,0.7874,234158900000000.0,2.1198,24.7566
4,3.3652,26.4973,5.1476,0.8341,463566700000000.0,1.7782,22.9836
5,3.2301,24.3302,4.9326,0.8488,257777000000000.0,1.8626,24.0144
6,3.8142,40.7614,6.3845,0.7392,419006400000000.0,2.0389,32.7234
7,3.6675,29.7576,5.4551,0.7927,414551400000000.0,2.2276,27.4794
8,3.7775,40.312,6.3492,0.7865,479141500000000.0,2.1473,43.282
9,3.7903,34.3981,5.865,0.771,443590000000000.0,2.2209,28.1663


In [84]:
clf_metrics

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,Jaccard
0,0.8856,0.875,0.8182,0.8456,0.7326
1,0.8607,0.7821,0.8472,0.8133,0.6854
2,0.8756,0.8667,0.8125,0.8387,0.7222
3,0.8557,0.775,0.8493,0.8105,0.6813
4,0.8905,0.8312,0.8767,0.8533,0.7442
5,0.9154,0.9118,0.8493,0.8794,0.7848
6,0.8706,0.806,0.806,0.806,0.675
7,0.8905,0.8472,0.8472,0.8472,0.7349
8,0.8905,0.8507,0.8261,0.8382,0.7215
9,0.8259,0.7381,0.8267,0.7799,0.6392


## FGNet-LOPO -- Hierarchical Model w/ LOPOCV

In [86]:
from pyfgnet.crossval import LOPOCV

In [None]:
cv = LOPOCV(shuffle=True)
model_metrics = {}

for i in range(10):
    model_metrics[i] = {
        "clf_true": [],
        "clf_pred": [],
        "reg_true": [],
        "reg_pred": [],
        "total": 0,
        "score": 0,
        "error": 0.
    }
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
    X_yng_train, X_yng_test, y_yng_train, y_yng_test = train_test_split(X_yng, y_yng, test_size=.2)
    X_old_train, X_old_test, y_old_train, y_old_test = train_test_split(X_old, y_old, test_size=.2)
    
    svm = SVC(C=1000, gamma=.01, kernel='rbf').fit(X_train, y_train["ageclass"])
    reg_yng = Ridge(alpha=.1).fit(X_yng_train, y_yng_train["age"])
    reg_old = Ridge(alpha=.05).fit(X_old_train, y_old_train["age"])
    
    for idx in X_test.index:
        row = X_test.loc[idx].values.reshape(1, -1)
        
        age_true = y_test.loc[idx, "age"]
        ageclass_true = y_test.loc[idx, "ageclass"]
        ageclass_pred = svm.predict(row)
        
        if ageclass_pred not in [0, 1]:
            raise ValueError("ageclass must be either 0 or 1")
            
        age_pred = reg_yng.predict(row) if ageclass_pred == 0 else reg_old.predict(row)
        
        model_metrics[i]["error"] += abs(age_pred - age_true)
        model_metrics[i]["score"] += 1 if np.round(age_pred) == age_true else 0
        model_metrics[i]["total"] += 1
        model_metrics[i]["clf_true"].append(ageclass_true)
        model_metrics[i]["clf_pred"].append(ageclass_pred)
        model_metrics[i]["reg_true"].append(age_true)
        model_metrics[i]["reg_pred"].append(age_pred)
        
    print(model_metrics[i]["error"] / model_metrics[i]["total"])