In [1]:
import shutil
import os
import numpy as np
import pandas as pd
import json
from collections import OrderedDict
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Base directories used by the notebook.
home_dir, work_dir = '/g100/home/userexternal/mhabibi0/', '/g100_work/IscrC_mental'

# Data folders derived from the base directories.
hdata_dir = os.path.join(home_dir, 'Data')
wdata_dir = os.path.join(work_dir, 'data')
# Same location as os.path.join(wdata_dir, 'user_classification').
uc_dir = os.path.join(work_dir, 'data', 'user_classification')

In [3]:
# Read the test data (pickle: only load files that come from a trusted source).
path = os.path.join(uc_dir, 'data_for_models_test.pkl')

# Age is discretized into four classes using half-open bins:
# [0, 19) -> 0, [19, 30) -> 1, [30, 40) -> 2, [40, 100) -> 3.
age_intervals = [0, 19, 30, 40, 100]
age_labels = [0, 1, 2, 3]

# Build the label columns and keep only the id plus the two ground-truth labels.
df = (
    pd.read_pickle(path)
    .assign(
        male=lambda frame: frame['is_male'].astype(int),
        age_class=lambda frame: pd.cut(frame['age'], bins=age_intervals,
                                       labels=age_labels, right=False),
    )
    .loc[:, ['user_id', 'male', 'age_class']]
)

In [4]:
# M3 results: bio + images
path = os.path.join(uc_dir, 'M3_results_test_bio_image.json')
# JSON is UTF-8 by specification; be explicit instead of relying on the locale default.
with open(path, 'r', encoding='utf-8') as f:
    data_bio_image = json.load(f)

# M3 results: bio only
path = os.path.join(uc_dir, 'M3_results_test_bio_only.json')
with open(path, 'r', encoding='utf-8') as f:
    data_bio_only = json.load(f)

In [5]:
def json_to_df(data_json):
    """Convert an M3 results mapping into a DataFrame of scores and predictions.

    Parameters
    ----------
    data_json : dict
        Mapping ``user_id -> {'gender': {...}, 'age': {...}}``. Missing
        ``'female'`` / ``'male'`` gender scores default to 0.

    Returns
    -------
    pandas.DataFrame
        One row per user with columns ``user_id`` (int), ``score_female``,
        ``score_male``, one ``score_age_cls_<i>`` column per age class,
        ``pred_male`` (1 if ``score_male > 0.5`` else 0) and ``pred_age``
        (index of the highest-scoring age class). An empty input yields an
        empty frame with the non-age columns.
    """
    rows = []

    for user_id, data in data_json.items():
        row = {
            'user_id': user_id,
            'score_female': data['gender'].get('female', 0),
            'score_male': data['gender'].get('male', 0),
        }
        # Age scores are numbered by their position in the dict.
        # NOTE(review): assumes every user's age dict lists the classes in the
        # same order, and that this order matches the ground-truth age classes
        # -- confirm against the M3 output.
        for idx, age_value in enumerate(data['age'].values()):
            row[f'score_age_cls_{idx}'] = age_value

        rows.append(row)

    if not rows:
        # Nothing to score: return an empty frame instead of failing on the
        # missing 'user_id' column.
        return pd.DataFrame(columns=['user_id', 'score_female', 'score_male',
                                     'pred_male', 'pred_age'])

    df = pd.DataFrame(rows)

    df['user_id'] = df['user_id'].astype(int)
    # Strict threshold: a score of exactly 0.5 is predicted as not male (0).
    df['pred_male'] = (df['score_male'] > 0.5).astype(int)

    # Discover the age-score columns from the data rather than assuming a
    # fixed number of classes; order them by their numeric suffix.
    age_class_of = {
        col: int(col.rsplit('_', 1)[1])
        for col in df.columns
        if col.startswith('score_age_cls_')
    }
    age_cols = sorted(age_class_of, key=age_class_of.get)
    df['pred_age'] = df[age_cols].idxmax(axis=1).map(age_class_of).astype(int)

    return df

In [6]:
# Convert both M3 result sets (bio + image, then bio only) into score frames.
df_img_bio, df_bio_only = map(json_to_df, (data_bio_image, data_bio_only))

In [7]:
# Attach the model predictions to the ground-truth labels.
# Inner join on user_id: users absent from either side are dropped, so the two
# result frames may cover different subsets of the test users.
prediction_cols = ['user_id', 'pred_male', 'pred_age']
df_res_img_bio = pd.merge(df, df_img_bio[prediction_cols], how='inner', on='user_id')
df_res_bio_only = pd.merge(df, df_bio_only[prediction_cols], how='inner', on='user_id')

In [8]:
def compute_metrics_gender(ground_labels, pred_labels):
    """Return accuracy and F1 for the gender predictions.

    F1 is computed with ``f1_score``'s default settings. The result is a
    dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    return {
        'accuracy': accuracy_score(ground_labels, pred_labels),
        'f1': f1_score(ground_labels, pred_labels),
    }

In [9]:
def compute_metrics_age(ground_labels, pred_labels):
    """Return accuracy and macro-averaged F1 for the age-class predictions.

    The result is a dict with the keys ``'accuracy'`` and ``'f1'``; the
    ``'f1'`` entry holds the macro-averaged score.
    """
    return dict(
        accuracy=accuracy_score(ground_labels, pred_labels),
        f1=f1_score(ground_labels, pred_labels, average='macro'),
    )

In [10]:
# Display the merged table of ground-truth labels and bio + image predictions.
df_res_img_bio

Unnamed: 0,user_id,male,age_class,pred_male,pred_age
0,7071362,1,2,1,3
1,9420092,0,1,0,2
2,11749412,0,1,1,2
3,14088579,0,2,0,3
4,14281831,1,3,1,1
...,...,...,...,...,...
1102,1492598652535197706,1,1,1,3
1103,1500578379036246016,0,2,0,2
1104,1511727485737648132,1,3,1,3
1105,1578278161598521347,0,0,0,0


In [11]:
# Gender metrics for the bio + image model
ground, preds = (df_res_img_bio[col].values for col in ('male', 'pred_male'))
compute_metrics_gender(ground, preds)

{'accuracy': 0.8446251129177959, 'f1': 0.8900255754475703}

In [12]:
# Age metrics for the bio + image model
ground, preds = (df_res_img_bio[col].values for col in ('age_class', 'pred_age'))
compute_metrics_age(ground, preds)

{'accuracy': 0.4995483288166215, 'f1': 0.3885457107003571}

In [13]:
# Gender metrics for the bio-only model
ground, preds = (df_res_bio_only[col].values for col in ('male', 'pred_male'))
compute_metrics_gender(ground, preds)

{'accuracy': 0.6705202312138728, 'f1': 0.7764705882352941}

In [14]:
# Age metrics for the bio-only model
ground, preds = (df_res_bio_only[col].values for col in ('age_class', 'pred_age'))
compute_metrics_age(ground, preds)

{'accuracy': 0.2504816955684008, 'f1': 0.24911210202130918}

In [15]:
# Persist both M3 score frames as pickles in the user-classification folder.
for frame, filename in (
    (df_img_bio, 'm3_scores_bio_image.pkl'),
    (df_bio_only, 'm3_scores_bio_only.pkl'),
):
    path = os.path.join(uc_dir, filename)
    frame.to_pickle(path)