In [206]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import accuracy_score, f1_score, precision_score
from collections import Counter

In [207]:
#xlm_ita_age_train = pd.DataFrame(np.load('/g100_work/IscrC_mental/data/user_classification/trained_models/age/XLM_probs_age_traind.npy')
#xlm_ita_gender_train = pd.DataFrame(np.load('/g100_work/IscrC_mental/data/user_classification/trained_models/gender/XLM_probs_gender_train.npy')

xlm_ita_gender = pd.DataFrame(np.load('/g100_work/IscrC_mental/data/user_classification/trained_models/gender/XLM_probs_gender_test.npy'))
xlm_ita_gender.columns = ['user_id', 'p_is_female', 'p_is_male']
xlm_ita_gender.user_id = xlm_ita_gender.user_id.astype(float)

xlm_ita_age = pd.DataFrame(np.load('/g100_work/IscrC_mental/data/user_classification/trained_models/age/XLM_probs_age_test.npy'))
xlm_ita_age.columns = ['user_id', 'pred_age_0_19_prob', 'pred_age_20_29_prob', 'pred_age_30_39_prob', 'pred_age_40_100_prob']
xlm_ita_age.user_id = xlm_ita_age.user_id.astype(float)

m3_it = pd.read_pickle('/g100_work/IscrC_mental/data/user_classification/m3_scores_bio_image.pkl')

cv_it = pd.read_pickle('/g100_work/IscrC_mental/data/user_classification/trained_models/cv_models.pkl')
cv_it.user_id = cv_it.user_id.astype(float)

cv_de = pd.read_pickle('/g100_work/IscrC_mental/data/user_classification/trained_models/german_age_and_gender.pkl')

test_set = pd.read_pickle('/g100_work/IscrC_mental/data/user_classification/data_for_models_test.pkl')
test_set.user_id = test_set.user_id.astype(float)


# remove extra instances from datasets
#xlm_ita_gender = test_set.merge(xlm_ita_gender, on='user_id', how='inner')
xlm_ita_age = test_set.merge(xlm_ita_age, on='user_id', how='inner')
m3_it = test_set.merge(m3_it, on='user_id', how='inner')
cv_it = test_set.merge(cv_it, on='user_id', how='inner')
# add missing rows in m3
m3_it = test_set.merge(m3_it, on='user_id', how='outer')
m3_it.loc[m3_it['score_male'].isna(), 'score_male'] = 0.5
m3_it.loc[m3_it['score_female'].isna(), 'score_female'] = 0.5
m3_it.loc[m3_it['score_age_cls_0'].isna(), 'score_age_cls_0'] = 0.25
m3_it.loc[m3_it['score_age_cls_0'].isna(), 'score_age_cls_0'] = 0.25
m3_it.loc[m3_it['score_age_cls_1'].isna(), 'score_age_cls_1'] = 0.25
m3_it.loc[m3_it['score_age_cls_2'].isna(), 'score_age_cls_2'] = 0.25
m3_it.loc[m3_it['score_age_cls_3'].isna(), 'score_age_cls_3'] = 0.25

In [208]:
# Assuming 'test_set' and 'xlm_ita_gender' are your DataFrames
test_set_user_ids = set(test_set['user_id'].astype(float))
xlm_ita_gender_user_ids = set(xlm_ita_gender['user_id'].astype(float))

# Find user IDs unique to 'test_set'
unique_to_test_set = test_set_user_ids.difference(xlm_ita_gender_user_ids)

# Find user IDs unique to 'xlm_ita_gender'
unique_to_xlm_ita_gender = xlm_ita_gender_user_ids.difference(test_set_user_ids)

# Combine the unique user IDs from both datasets
all_unique_user_ids = unique_to_test_set.union(unique_to_xlm_ita_gender)

# Create DataFrames with the unique user IDs
unique_test_set_df = test_set[test_set['user_id'].isin(unique_to_test_set)]
unique_xlm_ita_gender_df = xlm_ita_gender[xlm_ita_gender['user_id'].isin(unique_to_xlm_ita_gender)]

# Print the DataFrames containing unique user IDs
print("Unique User IDs in 'test_set':")
print(len(unique_to_test_set))
print("Unique User IDs in 'xlm_ita_gender':")
print(len(unique_to_xlm_ita_gender))
print("All unique User IDs:", len(all_unique_user_ids))
print("All unique User IDs:", len(set(test_set['user_id'].astype(int))))
print("All unique User IDs:", len(set(xlm_ita_gender['user_id'].astype(int))))

Unique User IDs in 'test_set':
0
Unique User IDs in 'xlm_ita_gender':
0
All unique User IDs: 0
All unique User IDs: 1119
All unique User IDs: 1119


In [209]:
def twitter_features(
    df,
    include_bio=True,
    include_tweets=True,
    label_name='is_male',
    ):
    # check if there are any missing values (shouldn't be the case)
    if df.isnull().values.any():
        raise ValueError('The dataframe contains missing values')
    # Read each bio and tweets concatenation, splitting them by \n and
    # joining by '. ' if sentences don't already end with a dot, else join by ' '
    if include_bio:
        bios = df.masked_bio.apply(lambda x: [text + '.' if not (text.endswith('.') or text.endswith('!') or text.endswith('?') or text.endswith(';')) else text for text in x.split('\n')]).apply(lambda x: ' '.join(x)).apply(lambda x: re.sub('\r', '', x)).tolist()
    if include_tweets:
        tweets = df.long_text.apply(lambda x: [text + '.' if not (text.endswith('.') or text.endswith('!') or text.endswith('?') or text.endswith(';')) else text for text in x.split('\n')]).apply(lambda x: ' '.join(x)).apply(lambda x: re.sub('\r', '', x)).tolist()
    if include_bio and include_tweets:
        # Join each tweet and bio by 'Bio: ' and 'Tweets: '
        input_texts = ['Bio: ' + bio + '\n' + 'Tweets: ' + tweet for bio, tweet in zip(bios, tweets)]
    elif include_bio:
        input_texts = ['Bio: ' + bio for bio in bios]
    elif include_tweets:
        input_texts = ['Tweets: ' + tweet for tweet in tweets]

    # Read the gold labels
    if label_name == 'is_male':
        gold_labels = df[label_name].tolist()
        gold_labels = ['male' if label == True else 'female' for label in gold_labels]
    if label_name == 'age':
        gold_labels = df[label_name].astype(int).tolist()
    if label_name == 'age_interval':
        # define age classes
        age_intervals = [0, 19, 30, 40, 100]
        age_labels = [0, 1, 2, 3]
        # Discretize the 'age' column into four classes
        gold_labels = pd.cut(df['age'], bins=age_intervals, labels=age_labels, right=False).astype('int').tolist()

    return input_texts, gold_labels

# Performance of majority-class dummy

In [210]:
_ , gold_labels = twitter_features(test_set)
print(Counter(gold_labels).most_common())
dum = ~(test_set.is_male*False)
print('Ac dummy:',accuracy_score(test_set.is_male, dum)*100)
print('F1 dummy:',f1_score(test_set.is_male, dum, average='macro')*100)

[('male', 720), ('female', 399)]
Ac dummy: 64.343163538874
F1 dummy: 39.151712887438826


In [211]:
_ , gold_labels = twitter_features(test_set, label_name='age_interval')
print(Counter(gold_labels).most_common())
dum = (np.array(gold_labels)*0)+3
print('Ac dummy:',accuracy_score(gold_labels, dum)*100)
print('F1 dummy:',f1_score(gold_labels, dum, average='macro')*100)

# compute average absolute error
mean_age_group1 = test_set[test_set.age<20].age.mean()
mean_age_group2 = test_set[(20<=test_set.age) & (test_set.age<30)].age.mean()
mean_age_group3 = test_set[(30<=test_set.age) & (test_set.age<40)].age.mean()
mean_age_group4 = test_set[(40<=test_set.age) & (test_set.age<=100)].age.mean()
class_means=[mean_age_group1, mean_age_group2, mean_age_group3, mean_age_group4]

if len(class_means) > 0:
    total_error = 0
    for i in range(len(gold_labels)):
        error = abs(class_means[3] - class_means[gold_labels[i]])
        total_error += error
    mean_error = total_error / len(gold_labels)
    
print('AAE dummy:', mean_error)

[(3, 572), (1, 236), (2, 220), (0, 91)]
Ac dummy: 51.117068811438784
F1 dummy: 16.9130691898285
AAE dummy: 14.79965061891894


In [234]:
def aggregate_performace(p1, p2, gold_labels, class_means=[]):
    
    gold_labels = np.array(gold_labels)

    # compute predictions
    pred1 = p1.argmax(axis=1) 
    pred2 = p2.argmax(axis=1) 
    
    # assign majority class as default label
    majority_class = Counter(gold_labels).most_common()[0][0]
    rows_with_same_p1=np.where(np.all(p1 == 1/len(np.unique(gold_labels)), axis=1))[0]
    rows_with_same_p2=np.where(np.all(p2 == 1/len(np.unique(gold_labels)), axis=1))[0]
    if rows_with_same_p1.size > 0:
        print(f'{rows_with_same_p1.size} system1 predictions substituted with majority_class')
        pred1[rows_with_same_p1] = majority_class
    if rows_with_same_p2.size > 0:
        print(f'{rows_with_same_p2.size} system2 predictions substituted with majority_class')
        pred2[rows_with_same_p2] = majority_class
    
    print('**********')

    # compute acc of models 1 and 2
    acc1 = accuracy_score(gold_labels, pred1)
    acc2 = accuracy_score(gold_labels, pred2)

    # compute f1 of models 1 and 2
    f11 = f1_score(gold_labels, pred1, average=None)
    f12 = f1_score(gold_labels, pred2, average=None)
    
    
    if len(class_means) > 0:
        total_error1 = 0
        total_error2 = 0
        for i in range(len(gold_labels)):
            error1 = abs(class_means[pred1[i]] - class_means[gold_labels[i]])
            error2 = abs(class_means[pred2[i]] - class_means[gold_labels[i]])
            total_error1 += error1
            total_error2 += error2
        mean_error1 = total_error1 / len(gold_labels)
        mean_error2 = total_error2 / len(gold_labels)

    print('System 1')
    print('Ac:', acc1*100)
    print('F1:', f11.mean()*100)
    if len(class_means) > 0:
        print('MAE:', mean_error1)
    print('----------')
    print('System 2')
    print('Ac:', acc2*100)
    print('F1:', f12.mean()*100)
    if len(class_means) > 0:
        print('MAE:', mean_error2)
    print('----------')

    # compute aggregated predictions
    p_agg = p1 + p2
    pred_agg = p_agg.argmax(axis=1)
    acc_agg = accuracy_score(gold_labels, pred_agg)
    f1agg = f1_score(gold_labels, pred_agg, average=None)
    if len(class_means) > 0:
        total_error = 0
        for i in range(len(gold_labels)):
            error = abs(class_means[pred_agg[i]] - class_means[gold_labels[i]])
            total_error += error
        mean_error = total_error / len(gold_labels)
    print('Avg prediction system ')
    print('Ac:', acc_agg*100)
    print('F1:', f1agg.mean()*100)
    if len(class_means) > 0:
        print('MAE:', mean_error)
    print('----------')
    
    p_agg = f11.mean()*p1 + f12.mean()*p2
    pred_agg = p_agg.argmax(axis=1)
    acc_agg = accuracy_score(gold_labels, pred_agg)
    f1agg = f1_score(gold_labels, pred_agg, average=None)
    if len(class_means) > 0:
        total_error = 0
        for i in range(len(gold_labels)):
            error = abs(class_means[pred_agg[i]] - class_means[gold_labels[i]])
            total_error += error
        mean_error = total_error / len(gold_labels)
    print('F1mean-weighted prediction system ')
    print('Ac:', acc_agg*100)
    print('F1:', f1agg.mean()*100)
    if len(class_means) > 0:
        print('MAE:', mean_error)
    print('----------')

#    p_agg = f11*p1 + f12*p2
#    pred_agg = p_agg.argmax(axis=1)
#    acc_agg = accuracy_score(gold_labels, pred_agg)
#    f1agg = f1_score(gold_labels, pred_agg, average=None)
#    if len(class_means) > 0:
#        total_error = 0
#        for i in len(gold_labels):
#            error = abs(class_means[pred_agg[i]] - class_means[gold_labels[i]])
#            total_error += error
#        mean_error = total_error / len(gold_labels)
#    print('F1-weighted prediction system ')
#    print('Ac:', acc_agg*100)
#    print('F1:', f1agg.mean()*100)
#    if len(class_means) > 0:
#        print('MAE:', mean_error)
#    print('----------')

# IT gender XLM+CV

In [235]:
# Assuming xlm_ita_gender is a Pandas Series or NumPy array
p1 = np.column_stack((xlm_ita_gender['p_is_male'], 1 - xlm_ita_gender['p_is_male']))

cv_it['p_is_male'] = cv_it.apply(lambda x: x.pred_gender_prob if bool(x.pred_is_male_label) else (1 - x.pred_gender_prob), axis=1)
p2 = np.column_stack((cv_it['p_is_male'], 1 - cv_it['p_is_male']))

gold_labels = (~test_set.is_male).astype(int)

aggregate_performace(p1, p2, gold_labels)

327 system2 predictions substituted with majority_class
**********
System 1
Ac: 88.11438784629134
F1: 86.84177720486235
----------
System 2
Ac: 75.96067917783735
F1: 70.64982424664947
----------
Avg prediction system 
Ac: 88.82931188561216
F1: 87.40755411713337
----------
F1mean-weighted prediction system 
Ac: 90.88471849865952
F1: 89.88998724489797
----------


# IT age XLM+CV

In [237]:
# Assuming xlm_ita_gender is a Pandas Series or NumPy array
p1 = np.array(xlm_ita_age[['pred_age_0_19_prob', 'pred_age_20_29_prob', 'pred_age_30_39_prob', 'pred_age_40_100_prob']])
p2 = np.array(cv_it[['pred_age_0_19_prob', 'pred_age_20_29_prob', 'pred_age_30_39_prob', 'pred_age_40_100_prob']])

_ , gold_labels = twitter_features(test_set, label_name='age_interval')

aggregate_performace(p1, p2, np.array(gold_labels), class_means=[mean_age_group1, mean_age_group2, mean_age_group3, mean_age_group4])

327 system2 predictions substituted with majority_class
**********
System 1
Ac: 66.39857015192135
F1: 60.15930806816667
MAE: 6.734919643762468
----------
System 2
Ac: 35.210008936550494
F1: 27.721998260576232
MAE: 16.244698119488422
----------
Avg prediction system 
Ac: 53.88739946380697
F1: 45.40926898500874
MAE: 10.076850951809869
----------
F1mean-weighted prediction system 
Ac: 64.96872207327972
F1: 57.39832210362758
MAE: 7.1666584028557185
----------


# IT gender XLM+M3

In [240]:
# Assuming xlm_ita_gender is a Pandas Series or NumPy array
p1 = np.column_stack((xlm_ita_gender['p_is_male'], 1 - xlm_ita_gender['p_is_male']))
p2 = np.array(m3_it[['score_male','score_female']])

gold_labels = (~test_set.is_male).astype(int)

aggregate_performace(p1, p2, gold_labels)

115 system2 predictions substituted with majority_class
**********
System 1
Ac: 88.11438784629134
F1: 86.84177720486235
----------
System 2
Ac: 83.10991957104558
F1: 79.34006527449577
----------
Avg prediction system 
Ac: 92.22520107238606
F1: 91.22374303262394
----------
F1mean-weighted prediction system 
Ac: 91.86773905272565
F1: 90.8202369651584
----------


# IT age XLM+M3

In [241]:
# Assuming xlm_ita_gender is a Pandas Series or NumPy array
p1 = np.array(xlm_ita_age[['pred_age_0_19_prob', 'pred_age_20_29_prob', 'pred_age_30_39_prob', 'pred_age_40_100_prob']])
p2 = np.array(m3_it[['score_age_cls_0','score_age_cls_1','score_age_cls_2','score_age_cls_3']])

_ , gold_labels = twitter_features(test_set, label_name='age_interval')

aggregate_performace(
    p1,
    p2,
    gold_labels,
    class_means=[mean_age_group1, mean_age_group2, mean_age_group3, mean_age_group4]
)

115 system2 predictions substituted with majority_class
**********
System 1
Ac: 66.39857015192135
F1: 60.15930806816667
MAE: 6.734919643762468
----------
System 2
Ac: 53.26184092940125
F1: 39.926094702728356
MAE: 11.866165374882817
----------
Avg prediction system 
Ac: 66.66666666666666
F1: 58.57988091767188
MAE: 7.1738892777204315
----------
F1mean-weighted prediction system 
Ac: 67.20285969615728
F1: 59.98813911025095
MAE: 6.877285973273143
----------
