In [93]:
import sqlite3

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

# Evaluate predictions on test set

In [54]:
# Prediction CSVs for the gender and the age task (the directory names identify the runs).
gender_predictions_csv = '../tmp/twitter-xlm-large-user-gender-it-extra_1/predictions.csv'
age_predictions_csv = '../tmp/twitter-xlm-large-user-age-5g-it-extra/predictions.csv'

df_gen = pd.read_csv(gender_predictions_csv)
df_age = pd.read_csv(age_predictions_csv)

In [61]:
# Labelled test split. NOTE: unpickling can execute arbitrary code, so only
# load this file from a trusted location.
test_set_path = '../data/user_classification/data_for_models_test.pkl'
df_test = pd.read_pickle(test_set_path)

# Cast the gender label to int so it can be compared with the integer predictions.
df_test['is_male'] = df_test['is_male'].astype(int)

In [62]:
# Inner joins on user_id: only test users that also have a prediction row are kept.
df_test_gen = pd.merge(df_test, df_gen, how='inner', on='user_id')
df_test_age = pd.merge(df_test, df_age, how='inner', on='user_id')

In [91]:
# Gender: compare the is_male label with the predicted class.
gender_true = df_test_gen.is_male
gender_pred = df_test_gen.prediction
print("Gen acc:", accuracy_score(gender_true, gender_pred))
print("Gen f11:", f1_score(gender_true, gender_pred, average='binary'))

# Age: bucket the raw age into left-closed intervals [0, 18), [18, 30), [30, 40),
# [40, 60), [60, 100) and label the buckets 0..4 before scoring.
age_intervals = [0, 18, 30, 40, 60, 100]
age_labels = list(range(len(age_intervals) - 1))
age_buckets = pd.cut(df_test_age['age'], bins=age_intervals, labels=age_labels, right=False)
test_age_class = age_buckets.astype(int)

age_pred = df_test_age.prediction
print("Age acc:", accuracy_score(test_age_class, age_pred))
print("Age f11:", f1_score(test_age_class, age_pred, average='macro'))


Gen acc: 0.990530303030303
Gen f11: 0.9926578560939794
Age acc: 0.6325757575757576
Age f11: 0.6490927439696905


# Add predictions to DB

In [126]:
# NOTE(review): this cell reads from /data/mentalism/..., while the cell that
# writes the table back uses /scratch/mentalism/... — confirm both paths refer
# to the same database.
db_path = '/data/mentalism/data/database/MENTALISM.db'

# Connect to the SQLite database
conn = sqlite3.connect(db_path)
try:
    # Query the user_regioncoded table into a pandas DataFrame
    query = "SELECT * FROM user_regioncoded;"
    user_regioncoded_df = pd.read_sql_query(query, conn)
finally:
    # Always release the handle, even when the query raises.
    conn.close()

In [127]:
# Sanity check of the loaded table: (rows, columns) first, then a three-row preview.
table_shape = user_regioncoded_df.shape
print(table_shape)
user_regioncoded_df.head(3)

(462533, 26)


Unnamed: 0,user_id,username,full_name,location,join_year,join_month,join_day,bio,tweets,following,...,region_pos,region,term_for_italy,name_city_engl,condition,city_id,all_cities,city_pos,region_code,male_name
0,9269,simon,simone brunozzi,"Ceres, Solar System, Milky Way",2006,10,17,Tech. Founder. Investor. Trying to become usel...,14357,1599,...,,piemonte,,,0.0,1072,|ceres|,0.0,1.0,b'\x01'
1,11193,robgarofalo,roberto garofalo,"Roma, Lazio",2006,10,31,,1,24,...,5.0,lazio,,,0.0,58091,|roma|,0.0,12.0,b'\x01'
2,11881,Agaponeo,cristian bracci,Rome,2006,11,9,"Sono una persona normale, padre di una belliss...",4150,131,...,,lazio,,rome,0.0,58091,|roma|,0.0,12.0,b'\x01'


In [128]:
# Display the age predictions: one row per user_id with the predicted class
# and the five per-class probabilities (pandas truncates the rendered rows).
df_age

Unnamed: 0,user_id,prediction,prob_class0,prob_class1,prob_class2,prob_class3,prob_class4
0,9269,3,0.003464,0.039950,0.462047,0.484028,0.010512
1,11193,4,0.001903,0.001107,0.003761,0.270335,0.722894
2,11881,3,0.000061,0.000476,0.043501,0.948263,0.007699
3,12243,3,0.003095,0.022662,0.207193,0.641001,0.126049
4,14983,3,0.009165,0.017483,0.116506,0.729665,0.127181
...,...,...,...,...,...,...,...
462528,1618987257653870592,4,0.003286,0.005673,0.013811,0.311168,0.666061
462529,1619403522004377600,3,0.008588,0.046595,0.084914,0.472262,0.387641
462530,1619481480756879361,4,0.016361,0.031280,0.015239,0.159434,0.777687
462531,1619640767785336833,4,0.000415,0.003726,0.013538,0.366528,0.615792


In [151]:
# Attach the model outputs to every user row. The prediction columns are renamed
# first so that the gender and age columns do not collide after the joins; the
# left joins keep users that have no prediction.
gender_column_names = {
    'prediction': 'predicted_gender',
    'prob_class0': 'prob_class0_gender',
    'prob_class1': 'prob_class1_gender',
}
age_column_names = {
    'prediction': 'predicted_age',
    'prob_class0': 'prob_class0_age',
    'prob_class1': 'prob_class1_age',
    'prob_class2': 'prob_class2_age',
    'prob_class3': 'prob_class3_age',
    'prob_class4': 'prob_class4_age',
}

# Step 1: user table + gender predictions, joined on 'user_id'.
merged_df_gen = user_regioncoded_df.merge(
    df_gen.rename(columns=gender_column_names),
    how='left',
    on='user_id',
)
# Step 2: add the age predictions on top of the gender-enriched table.
merged_df_gen_age = merged_df_gen.merge(
    df_age.rename(columns=age_column_names),
    how='left',
    on='user_id',
)

In [152]:
# Display the user table with the gender and age prediction columns appended
# (pandas truncates the rendered rows and columns).
merged_df_gen_age

Unnamed: 0,user_id,username,full_name,location,join_year,join_month,join_day,bio,tweets,following,...,male_name,predicted_gender,prob_class0_gender,prob_class1_gender,predicted_age,prob_class0_age,prob_class1_age,prob_class2_age,prob_class3_age,prob_class4_age
0,9269,simon,simone brunozzi,"Ceres, Solar System, Milky Way",2006,10,17,Tech. Founder. Investor. Trying to become usel...,14357,1599,...,b'\x01',1,5.264990e-08,1.000000,3,0.003464,0.039950,0.462047,0.484028,0.010512
1,11193,robgarofalo,roberto garofalo,"Roma, Lazio",2006,10,31,,1,24,...,b'\x01',1,9.308604e-09,1.000000,4,0.001903,0.001107,0.003761,0.270335,0.722894
2,11881,Agaponeo,cristian bracci,Rome,2006,11,9,"Sono una persona normale, padre di una belliss...",4150,131,...,b'\x01',1,1.241776e-06,0.999999,3,0.000061,0.000476,0.043501,0.948263,0.007699
3,12243,pecus,matteo fogli,"Torino, Italia",2006,11,13,CEO of @madebymodo under disguise. Obssessed w...,5543,1956,...,b'\x01',1,7.798503e-08,1.000000,3,0.003095,0.022662,0.207193,0.641001,0.126049
4,14983,FAFrigenti,francesco a frigenti,"Milan, Lombardy",2006,11,22,,397,180,...,b'\x01',1,5.048017e-08,1.000000,3,0.009165,0.017483,0.116506,0.729665,0.127181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462528,1618987257653870592,enrico_pregl,enrico pregl,"Ledro, Trentino-Alto Adige",2023,1,27,,2,20,...,b'\x01',1,1.939232e-08,1.000000,4,0.003286,0.005673,0.013811,0.311168,0.666061
462529,1619403522004377600,Su_La_Testa_tw,su la testa,Cusano Milanino,2023,1,28,L’Associazione rivolta a chiunque voglia impeg...,57,5,...,,1,3.223819e-03,0.996776,3,0.008588,0.046595,0.084914,0.472262,0.387641
462530,1619481480756879361,FShayagan,fatima shayagan,"Torino, Piemonte",2023,1,28,زنده باد انسانیت,1,17,...,,0,9.999985e-01,0.000002,4,0.016361,0.031280,0.015239,0.159434,0.777687
462531,1619640767785336833,antonio69094920,antonio antonelli,Molise,2023,1,29,"Nell’ etica del rispetto della Giustizia , del...",7,48,...,b'\x01',1,9.365611e-09,1.000000,4,0.000415,0.003726,0.013538,0.366528,0.615792


In [165]:
# Check how a missing male_name is represented: is the last entry the None object?
a = merged_df_gen_age['male_name'].to_numpy()
last_is_none = a[-1] is None
print(last_is_none)

True


In [170]:
# Function to convert bytes to int, handling None values
def convert_to_int(value):
    """Decode a big-endian bytes value into an int.

    Parameters
    ----------
    value : bytes-like, int or None
        Raw cell value to decode.

    Returns
    -------
    int or None
        ``None`` when ``value`` is ``None``; ``value`` itself when it is
        already an ``int``; otherwise the big-endian integer encoded by
        ``value``.
    """
    if value is None:
        return None
    if isinstance(value, int):
        # Already decoded: passing ints through lets the conversion cell be
        # re-run without int.from_bytes raising TypeError.
        return value
    return int.from_bytes(value, byteorder='big')

# Decode the whole male_name column: bytes values become ints, None stays None.
decoded_male_name = list(map(convert_to_int, merged_df_gen_age.male_name.to_numpy()))
merged_df_gen_age.male_name = np.array(decoded_male_name)

In [177]:
# NOTE(review): this cell writes to /scratch/mentalism/..., while the table was
# loaded from /data/mentalism/... earlier in the notebook — confirm both paths
# refer to the same database.
db_path = '/scratch/mentalism/data/database/MENTALISM.db'
conn = sqlite3.connect(db_path)

try:
    # Keep the existing table under a new name before writing the enriched one.
    # Re-running this cell is expected to fail here once user_regioncoded_old
    # exists; drop or rename that table first if a second run is intended.
    cursor = conn.cursor()
    cursor.execute('ALTER TABLE user_regioncoded RENAME TO user_regioncoded_old')

    # Write the full merged frame (user columns + predictions) as the new table.
    merged_df_gen_age.to_sql('user_regioncoded', conn, index=False, if_exists='replace')

    # Commit changes
    conn.commit()
finally:
    # Close the connection even if the rename or the write raises.
    conn.close()

# Evaluate gender predictions against all names

In [193]:
# Score only the users whose male_name label is present.
has_name_label = merged_df_gen_age.male_name.notna()
labelled_users = merged_df_gen_age[has_name_label]
name_gender = labelled_users.male_name.astype(int)
model_gender = labelled_users.predicted_gender

print("Gen acc:", accuracy_score(name_gender, model_gender))
print("Gen f11:", f1_score(name_gender, model_gender))


Gen acc: 0.998520446421666
Gen f11: 0.998875853329518
