## Create datasets for publishing

In [1]:
import hashlib
import hmac
import os

import numpy as np
import pandas as pd

In [2]:
# Base directory of the user-classification data. The cells below read their
# inputs from it and write outputs to it and to its data_for_publishing/ subfolder.
# The trailing slash matters: paths are built by plain string concatenation.
uc_dir = '/g100_work/IscrC_mental/data/user_classification/'
# Parquet table of user profiles; the dataset cells read it and merge a subset
# of its columns into the model data on user_id.
users_data = '/g100_work/IscrC_mental/data/database/user_geocoded.parquet'



In [3]:
# Function to pseudonymize an identifier using SHA-256
def anonymize_user_id(user_id, salt=None):
    """Return a 64-character hex digest that stands in for ``user_id``.

    The identifier is converted with ``str()`` and UTF-8 encoded before
    hashing, so ``123`` and ``"123"`` produce the same digest.

    Parameters
    ----------
    user_id : any
        Identifier to pseudonymize (anything with a stable ``str()`` form).
    salt : str or bytes, optional
        Secret key. When omitted, a plain SHA-256 digest is returned, exactly
        as before. When given, HMAC-SHA256 keyed with ``salt`` is used.

    Returns
    -------
    str
        Lowercase hexadecimal digest.

    Notes
    -----
    A plain (unsalted) hash of an identifier drawn from a small or enumerable
    space can be reversed by hashing every candidate value. Pass a secret
    ``salt`` that is kept out of the published material to prevent that; the
    same salt must be used everywhere the identifiers need to stay joinable.
    """
    payload = str(user_id).encode()
    if salt is None:
        # Default path: identical output to the original implementation.
        return hashlib.sha256(payload).hexdigest()
    key = salt if isinstance(salt, (bytes, bytearray)) else str(salt).encode()
    return hmac.new(key, payload, hashlib.sha256).hexdigest()

## Training and test datasets of users + features

In [15]:
# build training dataset
path = uc_dir + 'data_for_models_train.pkl'
# NOTE(review): read_pickle can execute arbitrary code; only load pickles from a trusted source.
df = pd.read_pickle(path)

# Read only the profile columns that are merged in, instead of loading the
# whole users table and discarding the rest afterwards.
user_columns = ['user_id', 'username', 'full_name', 'location', 'join_year',
                'join_month', 'join_day', 'bio', 'tweets', 'following', 'followers',
                'likes', 'foreign_country', 'region_code']
df_users = pd.read_parquet(users_data, columns=user_columns)

#merge
df_users = df_users[df_users['user_id'].isin(df['user_id'].values)]
# validate= makes a duplicated user_id in the users table fail loudly instead
# of silently duplicating rows of the model data.
df_for_models = df.merge(df_users, on='user_id', how='left', validate='many_to_one')
# Internal copy: still contains the raw user_id, username and full_name.
df_for_models.to_parquet(uc_dir + 'data_for_models_train.parquet')


#anonymize and save for publication
# Rebind instead of inplace=True so the step yields a new frame.
df_for_models = df_for_models.drop(columns=['username', 'full_name'])
df_for_models['user_id'] = df_for_models['user_id'].apply(anonymize_user_id)
# NOTE(review): free-text columns (bio, location, long_text) are written as-is;
# confirm that is acceptable for the public release.
df_for_models.to_parquet(uc_dir + 'data_for_publishing/data_for_models_train.parquet')

print(df.shape)
print(df_users.shape)
print(df_for_models.shape)

df_for_models

(19200, 5)
(19200, 14)
(19200, 16)


Unnamed: 0,user_id,is_male,age,masked_bio,long_text,location,join_year,join_month,join_day,bio,tweets,following,followers,likes,foreign_country,region_code
0,0632dc7a43b1df9806c5a0689b2ea6108debdd99707644...,True,55,"Mac user since 1991, working on ICT, physicist...","Coraggio, fra 7 mesi sarà il 25 aprile.\nIn te...","Bologna, Emilia-Romagna, Italy",2006,11,26,"Mac user since 1991, working on ICT, physicist...",20544,1019,12711,24707,,8.0
1,a2fef7ddbcf10e4da73a2ad5ec18cfa9c3cdf25f29fdd1...,True,49,Partner di distribuzione Reico. Alimenti di qu...,Assistenza Nissan pessima! Mai più Nissan in v...,"Alba Adriatica, Abruzzo",2007,1,4,Partner di distribuzione Reico. Alimenti di qu...,11670,989,3260,4021,,13.0
2,5093106b2c2dda63b4c77a98a03824a7e8785be51d042c...,True,28,,Ho finalmente trovato il tempo di ascoltare l'...,,2007,1,6,Una volta ho vomitato.,58980,776,3799,79,,
3,814de8350c68b4a0f4299841637e3f6e95d1f5f04aaa69...,True,39,Avvocato digitale.\r\nDigital Lawyer.,La foto ritrae ciò che posso mangiare dopo ave...,Taranto,2007,1,8,Avvocato digitale.\r\nDigital Lawyer.,12144,1001,667,2691,,16.0
4,615592a0a79498848ea51557be6be44d9d1f18dfeef6c1...,True,42,"Ingegnere informatico, 20 anni di esperienza i...",Perfetto.\nCompagni atei anticlericali non app...,Benevento,2007,1,8,"Ingegnere informatico, 20 anni di esperienza i...",13134,1269,1055,6671,,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19195,50f30a6f40b4c1ee4959f07f99b2d8ae788a2cfeb87130...,True,22,,Secondo me l'ha capito meglio di tutti.\nLeone...,"Salerno, Campania",2023,1,2,,440,43,16,294,,15.0
19196,ae8c7e447053fd88d33b5cabea37e9cb9bb612be21718c...,False,64,,"Ma che grande stronzata,il vino è teschio ai m...",,2023,1,5,"ho \n64 anni, vedova,con figlio. ...",48,364,70,99,,
19197,9c614370c95e29a0a3db583a414080e047b243521deb82...,True,40,"Odio l'omofobia, i razzisti, i Novax. Non toll...",", grazie per un'altra bellissima puntata.\nSie...",,2023,1,8,"Odio l'omofobia, i razzisti, i Novax. Non toll...",757,110,34,364,,
19198,534d49bb9f300e14af9bbd948b600f3edc9abc669c648e...,True,63,,Ma questa è la vostra idea di opposizione?? Gl...,,2023,1,13,,29,121,10,40,,


In [17]:
# build test dataset
path = uc_dir + 'data_for_models_test.pkl'
# NOTE(review): read_pickle can execute arbitrary code; only load pickles from a trusted source.
df = pd.read_pickle(path)

# Read only the profile columns that are merged in, instead of loading the
# whole users table and discarding the rest afterwards.
user_columns = ['user_id', 'username', 'full_name', 'location', 'join_year',
                'join_month', 'join_day', 'bio', 'tweets', 'following', 'followers',
                'likes', 'foreign_country', 'region_code']
df_users = pd.read_parquet(users_data, columns=user_columns)

#merge
df_users = df_users[df_users['user_id'].isin(df['user_id'].values)]
# validate= makes a duplicated user_id in the users table fail loudly instead
# of silently duplicating rows of the model data.
df_for_models = df.merge(df_users, on='user_id', how='left', validate='many_to_one')
# Internal copy: still contains the raw user_id, username and full_name.
df_for_models.to_parquet(uc_dir + 'data_for_models_test.parquet')


#anonymize and save for publication
# anonymize_user_id is defined once, in an earlier cell; the duplicate
# definition that used to live here was removed so there is a single source of truth.
# Rebind instead of inplace=True so the step yields a new frame.
df_for_models = df_for_models.drop(columns=['username', 'full_name'])
df_for_models['user_id'] = df_for_models['user_id'].apply(anonymize_user_id)
# NOTE(review): free-text columns (bio, location, long_text) are written as-is;
# confirm that is acceptable for the public release.
df_for_models.to_parquet(uc_dir + 'data_for_publishing/data_for_models_test.parquet')

print(df.shape)
print(df_users.shape)
print(df_for_models.shape)

df_for_models

(1119, 5)
(1119, 14)
(1119, 16)


Unnamed: 0,user_id,is_male,age,masked_bio,long_text,location,join_year,join_month,join_day,bio,tweets,following,followers,likes,foreign_country,region_code
0,04ed4cbff32d86966caafd65acbb9258f26d221a9019b2...,True,47,Con le ruote per terra \r\nSentire il mio pass...,Lo prendo come un attestato di stima\nPer la s...,Milano,2007,6,25,Con le ruote per terra \r\nSentire il mio pass...,1846,1606,956,490,,3.0
1,28aa0dfce73b8391bbb19398eee5e9a67218c8100ce31e...,False,31,Quello che gli altri pensano della tua persona...,Lovely zucca\nSta cambiando proprio il tempo\n...,Italy,2007,10,13,Quello che gli altri pensano della tua persona...,3510,275,152,17,,
2,6d148444a9742bf0ea5030df6b7ad62742ea53e8da5a47...,False,33,Love me? Great. Hate me? Even better. Think I'...,Guardi l’ottimo è uno dei motivi per cui ancor...,"Cislago, Lombardia",2008,1,2,Love me? Great. Hate me? Even better. Think I'...,2834,348,244,73,,3.0
3,506c95404e5d10b5d7b51d1143b2e8032904f2b610744f...,False,41,Sii il cambiamento che vuoi vedere nel mondo M...,"Io direi per lui, se generalizziamo ci dividia...",Camponogara,2008,3,6,Sii il cambiamento che vuoi vedere nel mondo M...,1611,335,446,515,,5.0
4,8caebba74ccee420929c55f4abfa77de8cfa080491f250...,True,44,at home I feel like a tourist.,"Ore 6, buio pesto. Ribadiamo con forza #TeamOr...",Aleph-on-Sine,2008,4,2,at home I feel like a tourist.,19556,315,1508,21778,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,02b043d25f94e1ed44181c0ee54b6feeeeb3d09a15c568...,True,19,,Get Fortn1t3 from App Valley!,,2022,2,12,Piacere mi chiamo Maurizio Mennella e ho 19 an...,1,127,10,0,,
1115,573ea7d7b41f33717c00396b3d612e3f17ac7f1a3d77e7...,False,33,,Lei non poteva andare perche e personaggio pub...,,2022,3,6,,102,57,10,14471,,
1116,841b48732002157ede869a20f68c66fe0e0f0b3b566bfd...,True,57,,E tantissimi sono contro la GIÒ MELON DONNA FA...,,2022,4,6,,0,18,2,0,,
1117,690c5641e89994476815e2bbac3f78d65881a158ce47c1...,False,19,𝘕𝘢𝘳𝘤𝘰𝘭𝘦𝘱𝘴𝘺 𝘨𝘰𝘵 𝘮𝘦 𝘧𝘦𝘦𝘭𝘪𝘯𝘨 𝘴𝘵𝘢𝘨𝘦 𝘧𝘳𝘪𝘨𝘩𝘵\n\n✧˚ ༘...,"È stato un brutto sogno e basta, vero?\nÈ sett...",1q84,2022,10,7,𝘕𝘢𝘳𝘤𝘰𝘭𝘦𝘱𝘴𝘺 𝘨𝘰𝘵 𝘮𝘦 𝘧𝘦𝘦𝘭𝘪𝘯𝘨 𝘴𝘵𝘢𝘨𝘦 𝘧𝘳𝘪𝘨𝘩𝘵\n\n✧˚ ༘...,214,29,13,3349,,


## Tweets dataset with tweets' features
Not used in the experiments, but may be useful for people who do not want to work with concatenated tweets.

In [4]:
# Tweet-level training data, reduced to the per-tweet columns.
# NOTE(review): the input read here sits under data_for_publishing/ and still
# holds the raw ids -- make sure it is not shipped with the release.
tweet_columns = ['user_id', 'tweet_id', 'created_at', 'RT',
                 'likes', 'retweets', 'text']

path = uc_dir + 'data_for_publishing/train.parquet'
df = pd.read_parquet(path)
print(df.shape)

# Internal copy with the raw ids.
df = df[tweet_columns]
df.to_parquet(uc_dir + 'train_tweets.parquet')
print(df.shape)

#anonymize and save for publication
# Both id columns go through the same hash helper (defined in an earlier cell).
for id_column in ('user_id', 'tweet_id'):
    df[id_column] = df[id_column].apply(anonymize_user_id)
df.to_parquet(uc_dir + 'data_for_publishing/train_tweets.parquet')
print(df.shape)

df

(27302637, 15)
(27302637, 7)
(27302637, 7)


Unnamed: 0,user_id,tweet_id,created_at,RT,likes,retweets,text
0,0632dc7a43b1df9806c5a0689b2ea6108debdd99707644...,53aa8f3554beb9cd4a24e94d944e8a56476990b13cfef9...,2021-05-18 19:35:20+00:00,False,1,0,Stasera si segue il primo dibattito sulle prim...
1,0632dc7a43b1df9806c5a0689b2ea6108debdd99707644...,99ab12f6c4852c69949409499d5cf497b04d1b5079f6e5...,2021-05-20 19:33:45+00:00,False,2,0,Stasera si segue il secondo dibattito sulle pr...
2,0632dc7a43b1df9806c5a0689b2ea6108debdd99707644...,c8a5331cd9c98d01b650a68e1021cc2fd4a67e29972b6a...,2021-05-22 13:59:11+00:00,False,0,0,La chiamano “rassegna stampa” per tenere profi...
3,0632dc7a43b1df9806c5a0689b2ea6108debdd99707644...,fef1bc330f3a181bb7fa115af592f1dfe7e0c8acf96b5c...,2021-05-22 22:15:53+00:00,False,3,0,Zero punti a UK? #Eurovision #brexit edition!
4,0632dc7a43b1df9806c5a0689b2ea6108debdd99707644...,44199ff108f32d981623f7b3f5a1e4e9611e1b1e0058ae...,2021-05-22 22:35:41+00:00,False,1,0,“This is a GOMBLOT!”
...,...,...,...,...,...,...,...
27302632,592c9d66a9bd3a07adb19e55428c25eb6881efdeb6d417...,ffdb6389894e61f0c1f2e4a7e37369ac69c97cffc72c9e...,2021-12-16 00:19:16+00:00,False,0,0,Sei bellissima
27302633,592c9d66a9bd3a07adb19e55428c25eb6881efdeb6d417...,8f60d107cd7761aa5c03f56e38809886af319557c067cc...,2021-12-16 00:20:54+00:00,False,0,0,Io mi chiamo Alberto e sono italiano di Roma e...
27302634,592c9d66a9bd3a07adb19e55428c25eb6881efdeb6d417...,cd62d827f8664ac4feeea806d7bc1140c4b52f2f57b337...,2021-12-16 01:14:59+00:00,False,0,0,Tanti auguri di Buon Compleanno .Quanti anni fai?
27302635,592c9d66a9bd3a07adb19e55428c25eb6881efdeb6d417...,5f5148f2118e9b54ffa8f9bd86bb0e653313009bc62970...,2021-12-18 00:40:42+00:00,False,0,0,Ciao piacere di conoscerti mi chiamo Alberto s...


In [5]:
# Tweet-level test data, reduced to the per-tweet columns.
# NOTE(review): the input read here sits under data_for_publishing/ and still
# holds the raw ids -- make sure it is not shipped with the release.
tweet_columns = ['user_id', 'tweet_id', 'created_at', 'RT',
                 'likes', 'retweets', 'text']

path = uc_dir + 'data_for_publishing/test.parquet'
df = pd.read_parquet(path)
print(df.shape)

# Internal copy with the raw ids.
df = df[tweet_columns]
df.to_parquet(uc_dir + 'test_tweets.parquet')
print(df.shape)

#anonymize and save for publication
# Both id columns go through the same hash helper (defined in an earlier cell).
for id_column in ('user_id', 'tweet_id'):
    df[id_column] = df[id_column].apply(anonymize_user_id)
df.to_parquet(uc_dir + 'data_for_publishing/test_tweets.parquet')
print(df.shape)

df

(1571178, 15)
(1571178, 7)
(1571178, 7)


Unnamed: 0,user_id,tweet_id,created_at,RT,likes,retweets,text
0,5b5c01c68089b43b76bebfb4715f28fd9c3de023819993...,132b55ee051057bf9345a326b2999cf46cce68874c5eb8...,2021-06-07 17:03:12+00:00,False,29,1,"Meglio fidarsi del proprio istinto, mio padre ..."
1,5b5c01c68089b43b76bebfb4715f28fd9c3de023819993...,c57c3990a0bb53374dc14e0c9e8dd6ebcca32eee70d14b...,2021-07-05 20:48:35+00:00,False,5,0,Mi hai aperto una ferita …avevo accantonato il...
2,5b5c01c68089b43b76bebfb4715f28fd9c3de023819993...,b452e01aa675c4146951ce7b27ce2ba4c00b9d7752a474...,2021-07-25 19:21:41+00:00,False,0,0,O in Sicilia
3,5b5c01c68089b43b76bebfb4715f28fd9c3de023819993...,efbe267b8c041a0039c6daabf53f1ca6b005019b5613ac...,2021-07-29 08:48:28+00:00,False,0,0,"Salve, avrei bisogno di contattare un operator..."
4,5b5c01c68089b43b76bebfb4715f28fd9c3de023819993...,6dc82b4d014005380540f2528ef082fe744b6326a00bb7...,2015-03-20 07:54:48+00:00,True,0,15,L’eclissi accende i prezzi dell’energia
...,...,...,...,...,...,...,...
1571173,bb678485ce6e690a7bf72e86960b88d8b1360086cef3ab...,43b320b461bced5441c0b228de41ac29c8cbc851f09141...,2017-12-25 06:03:19+00:00,False,0,0,Salve signor Silvio Berlusconi io e mia moglie...
1571174,bb678485ce6e690a7bf72e86960b88d8b1360086cef3ab...,59f17601e857292052562643386560288e057ef4533921...,2018-01-11 13:05:32+00:00,False,1,1,Salve signor Silvio Berlusconi mi chiamo Giuse...
1571175,bb678485ce6e690a7bf72e86960b88d8b1360086cef3ab...,56828c2266abf03a8c97983f4a2dce51b85ee83720af1a...,2018-01-11 13:12:55+00:00,False,1,0,Sé c'è qualche persona che mi possono dare una...
1571176,bb678485ce6e690a7bf72e86960b88d8b1360086cef3ab...,2d67435fba6a1aac80f398bde9d897b4b7cbd8f78751c0...,2018-01-11 13:17:26+00:00,False,1,0,Ciao sono sempre in sono su WhatsApp lo stato ...


## Hash profile pics filenames

In [6]:
# Hash the stem of a filename with SHA-256, keeping the extension
def anonymize_filename(filename):
    """Return ``filename`` with its stem replaced by the stem's SHA-256 hex digest.

    The name is split with ``os.path.splitext``; only the part before the
    last extension is hashed (UTF-8 encoded), and the extension, if any, is
    appended unchanged.
    """
    stem, extension = os.path.splitext(filename)
    digest = hashlib.sha256(stem.encode()).hexdigest()
    return digest + extension

# Directory containing the images
image_directory = uc_dir + "data_for_publishing/profile_pics/test/"

# List all files in the directory
files = os.listdir(image_directory)

# Process each file
for filename in files:
    # Get the full path of the current file
    full_path = os.path.join(image_directory, filename)

    # Skip if it's not a file
    if not os.path.isfile(full_path):
        continue

    # Skip names that already look anonymized (stem is a 64-character lowercase
    # hex string, i.e. a SHA-256 digest). Without this guard, re-running the
    # cell would hash the hashes and the names would no longer correspond to
    # the digest of the original stem.
    stem = os.path.splitext(filename)[0]
    if len(stem) == 64 and all(ch in "0123456789abcdef" for ch in stem):
        continue

    # Generate the anonymized filename
    anonymized_name = anonymize_filename(filename)

    # Get the full path for the new filename
    anonymized_path = os.path.join(image_directory, anonymized_name)

    # Rename the file in place (same directory, new name)
    os.rename(full_path, anonymized_path)

print("Filenames have been anonymized.")

Filenames have been anonymized.


In [7]:
# Directory containing the images
image_directory = uc_dir + "data_for_publishing/profile_pics/train/"

# List all files in the directory
files = os.listdir(image_directory)

# Process each file
for filename in files:
    # Get the full path of the current file
    full_path = os.path.join(image_directory, filename)

    # Skip if it's not a file
    if not os.path.isfile(full_path):
        continue

    # Skip names that already look anonymized (stem is a 64-character lowercase
    # hex string, i.e. a SHA-256 digest). Without this guard, re-running the
    # cell would hash the hashes and the names would no longer correspond to
    # the digest of the original stem.
    stem = os.path.splitext(filename)[0]
    if len(stem) == 64 and all(ch in "0123456789abcdef" for ch in stem):
        continue

    # Generate the anonymized filename
    anonymized_name = anonymize_filename(filename)

    # Get the full path for the new filename
    anonymized_path = os.path.join(image_directory, anonymized_name)

    # Rename the file in place (same directory, new name)
    os.rename(full_path, anonymized_path)

print("Filenames have been anonymized.")

Filenames have been anonymized.
