In [1]:
import os 
import pandas as pd
import re

In [2]:
work_dir = '/g100_work/IscrC_mental'
wdata_dir = os.path.join(work_dir, 'data')
uc_dir = os.path.join(wdata_dir, 'user_classification')

### Maskig age/birthdate in bio 

In [3]:
AGE_CHAR_SUFFIX_SHORT = [
    'tredic',
    'quattordic',
    'quindic',
    'sedic',
    'diciasset',
    'diciott',
    'diciannov',
    'vent',
    'trent',
    'quarant',
    'cinquant',
    'sessant',
    'settant',
    'ottant',
    'novant']


In [4]:
def mask_bio(bio,regex_type, age_raw ,mask=' '): 
    if regex_type.endswith('digit'):
        masked_bio = bio.replace(str(age_raw), mask)

    elif regex_type.endswith('year'):
        # Remove the full year
        masked_bio = bio.replace(str(age_raw), mask)
        # Remove the last two digits of the year
        masked_bio = masked_bio.replace(str(age_raw)[-2:], mask)

    elif regex_type.endswith('chars'):
        # Identify the word to be removed based on the age value
        age_value = age_raw
        if 13 <= age_value <= 19:
            age_word = AGE_CHAR_SUFFIX_SHORT[age_value - 13]
        else:
            age_word = AGE_CHAR_SUFFIX_SHORT[(age_value // 10) + 5]

        # Use regex to replace the word in the bio string
        # The pattern ensures we match the word entirely and not just as a substring
        pattern = r'\b{}\w*\b'.format(age_word)
        masked_bio = re.sub(pattern, '', bio).strip()

    else: 
        raise ValueError("Unseen regex type in bio!")
            
    return masked_bio 



def mask_tweet(tweet,regex_type, age_raw ,mask=' '): 
    if regex_type.endswith('digit'):
        masked_tweet = tweet.replace(str(age_raw), mask)

    elif regex_type.endswith('year'):
        # Remove the full year
        masked_tweet = tweet.replace(str(age_raw), mask)
        # Remove the last two digits of the year
        masked_tweet = masked_tweet.replace(str(age_raw)[-2:], mask)

    elif regex_type.endswith('chars'):
        # Identify the word to be removed based on the age value
        age_value = age_raw
        if 13 <= age_value <= 19:
            age_word = AGE_CHAR_SUFFIX_SHORT[age_value - 13]
        else:
            age_word = AGE_CHAR_SUFFIX_SHORT[(age_value // 10) + 5]

        # Use regex to replace the word in the bio string
        # The pattern ensures we match the word entirely and not just as a substring
        pattern = r'\b{}\w*\b'.format(age_word)
        masked_bio = re.sub(pattern, '', tweet).strip()

    else: 
        raise ValueError("Unseen regex type in bio!")
            
    return masked_bio 
            

In [6]:
# read users data
path_train  = os.path.join(uc_dir, 'user_age_gender_location_train_set.pkl')
path_test  = os.path.join(uc_dir, 'user_age_gender_location_test_set.pkl')
df_agl_train = pd.read_pickle(path_train)
df_agl_test = pd.read_pickle(path_test)


In [7]:
for df_agl in (df_agl_train, df_agl_test):
    # Iterate over each row and apply the mask_bio function
    df_agl['masked_bio'] = df_agl['bio']
    df_agl['masked_tweet'] = df_agl['tweet']

    for idx, row in df_agl.iterrows():
        if row['regex_type'].startswith('bio'):
            df_agl.at[idx, 'masked_bio'] = mask_bio(row['bio'], row['regex_type'], row['age_raw'])

        else: 
            df_agl.at[idx, 'masked_tweet'] = mask_bio(row['tweet'], row['regex_type'], row['age_raw'])

In [8]:
# save
path_train  = os.path.join(uc_dir, 'user_age_gender_location_train_set_masked.pkl')
path_test  = os.path.join(uc_dir, 'user_age_gender_location_test_set_masked.pkl')
df_agl_train.to_pickle(path_train)
df_agl_test.to_pickle(path_test)

print(df_agl_train.shape)
print(df_agl_test.shape)

(19341, 39)
(1139, 39)


### Create Training Data

In [9]:
# load tweets
path = os.path.join(uc_dir, 'tweets_by_user_id_clean.pkl')
df_twt = pd.read_pickle(path)

print(df_twt.shape)
df_twt

(29304512, 8)


Unnamed: 0,tweet_id,user_id,created_at,language,likes,retweets,RT,text
0,1.394656e+18,234082042,2021-05-18 14:09:11+00:00,it,7,0,False,Primo giorno di lavoro. La vista non è per nie...
1,1.394656e+18,942291529,2021-05-18 14:09:12+00:00,it,1,0,False,Perché?? Perché soffre di protagonismo
2,1.394656e+18,1202227634,2021-05-18 14:09:34+00:00,it,1,0,False,💕💕💕ciaooo...💕💕💕 #selfie #myselfie #me #beauty ...
3,1.394656e+18,714908458,2021-05-18 14:09:48+00:00,it,0,0,False,giornalista del ......................... ??? ...
4,1.394656e+18,457410400,2021-05-18 14:09:53+00:00,it,1,0,False,Non li sopportava entrambi
...,...,...,...,...,...,...,...,...
31364033,5.025719e+17,1133667980,2014-08-21 21:44:09+00:00,it,0,0,False,Le photo in bianco e nero hanno il suo fascino...
31364034,5.025721e+17,319157858,2014-08-21 21:44:58+00:00,,0,0,False,Appena tornato
31364035,5.025722e+17,449935079,2014-08-21 21:45:35+00:00,it,3,0,False,Che quartetto 👭👫 💕 #loggia #fede #bilu #pippoB...
31364036,5.025723e+17,732762252,2014-08-21 21:45:59+00:00,it,0,0,False,Penso che sei bellissima .ma davvero tanto . \...


In [10]:
# create date column
df_twt['date'] = pd.to_datetime(df_twt['created_at'].str.slice(0,10))

# last_tweet year for each user
df_last = df_twt[['user_id', 'date']].sort_values(by=['user_id', 'date'], ascending=False)
df_last = df_last.drop_duplicates(subset='user_id', keep='first')
df_last.columns = ['user_id', 'last_tweet']
df_last['last_tweet'] = df_last['last_tweet'].dt.year

# discard retweets
df_twt = df_twt[df_twt['RT']==False]
# keep only it tweets:
# df_twt = df_twt[df_twt['language']=='it']


# keep selected columns
df_twt = df_twt[[ 'user_id', 'text', 'tweet_id', 'date', 'language']]

print(df_twt.shape)


(22315330, 5)


In [12]:
# load users masked data
path_train  = os.path.join(uc_dir, 'user_age_gender_location_train_set_masked.pkl')
path_test  = os.path.join(uc_dir, 'user_age_gender_location_test_set_masked.pkl')

df_um_train = pd.read_pickle(path_train)
df_um_test = pd.read_pickle(path_test)

print(df_um_train.shape)
print(df_um_test.shape)

# merge with last tweet data
df_um_train = df_um_train.merge(df_last, on='user_id', how='left')
df_um_test = df_um_test.merge(df_last, on='user_id', how='left')

print(df_um_train.shape)
print(df_um_test.shape)


(19341, 39)
(1139, 39)
(19341, 40)
(1139, 40)


In [20]:
for df_um in (df_um_train,df_um_test):
    # specify age based on matching tweet (when tweeted) or bio (last tweet)
    df_um['age'] = 0

    for idx, row in df_um.iterrows():
        # if bio: age = age at the year of the last tweet
        if row['regex_type'].startswith('bio'):
            if row['regex_type'].endswith('birth_year'):
                df_um.at[idx, 'age'] = row['last_tweet'] - int(row['age_raw'])
            else: 
                df_um.at[idx, 'age'] =  int(row['age_raw'])

        # if tweet; when tweeted   
        else: 
            df_um.at[idx, 'age'] = row['age_when_tweeted']

In [21]:
print(df_um_train.shape)
print(df_um_train['age'].isna().sum())
print(df_um_test['age'].isna().sum())

(19341, 41)
4
1


In [22]:

# merge bio and tweets

# merge selected columns only
columns= ['user_id', 'username' ,'full_name', 'is_male', 'age' , 'masked_bio']


# for train #########################################################
df_um = df_um_train

# merge with tweets
df_ut = df_um[columns].merge(df_twt, on='user_id', how='inner')

# merge with masked tweets
df_ut = df_ut.merge(df_um[df_um['masked_tweet'].notna()][['tweet_id', 'masked_tweet']], 
                    on='tweet_id', how='left')

# flag tweets that are masked
df_ut['has_mask'] = df_ut['masked_tweet'].notna().astype(int)

# sort by date
df_ut = df_ut.sort_values(by=['user_id', 'date'], ascending=False).reset_index(drop=True)
print('Train shape:', df_ut.shape)

# save
path  = os.path.join(uc_dir, 'data_for_models_train.pkl')
df_ut.to_pickle(path)



# for test #########################################################
df_um = df_um_test

# merge with tweets
df_ut = df_um[columns].merge(df_twt, on='user_id', how='inner')

# merge with masked tweets
df_ut = df_ut.merge(df_um[df_um['masked_tweet'].notna()][['tweet_id', 'masked_tweet']], 
                    on='tweet_id', how='left')

# flag tweets that are masked
df_ut['has_mask'] = df_ut['masked_tweet'].notna().astype(int)

# sort by date
df_ut = df_ut.sort_values(by=['user_id', 'date'], ascending=False).reset_index(drop=True)
print('Train shape:', df_ut.shape)

# save
path  = os.path.join(uc_dir, 'data_for_models_test.pkl')
df_ut.to_pickle(path)




Train shape: (20863369, 12)
Train shape: (1281731, 12)


Unnamed: 0,user_id,username,full_name,is_male,age,masked_bio,text,tweet_id,date,language,masked_tweet,has_mask
0,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,#amici22 non mi sono mai dispiaciuta come stas...,1.639739e+18,2023-03-25,it,,0
1,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,#amici22 auguro a Piccolo G una grande carriera,1.639739e+18,2023-03-25,it,,0
2,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,#gfvip quest'anno è la più brutta edizione di ...,1.637951e+18,2023-03-20,it,,0
3,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,Che vergogna continua a dire che non ci crede ...,1.637951e+18,2023-03-20,it,,0
4,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,"#gfvip Giaele, Oriana e Micol",1.637952e+18,2023-03-20,it,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1281726,7071362,carloprevosti,carlo prevosti,True,37.0,Con le ruote per terra \r\nSentire il mio pass...,Pig Island - L’arte e la provocazione di Paul ...,1.562821e+10,2010-06-07,it,,0
1281727,7071362,carloprevosti,carlo prevosti,True,37.0,Con le ruote per terra \r\nSentire il mio pass...,Marina!,1.043299e+10,2010-03-13,tl,,0
1281728,7071362,carloprevosti,carlo prevosti,True,37.0,Con le ruote per terra \r\nSentire il mio pass...,Eccomi!,1.016789e+10,2010-03-08,it,,0
1281729,7071362,carloprevosti,carlo prevosti,True,37.0,Con le ruote per terra \r\nSentire il mio pass...,cerco di capirci qualcosa di Twitter!,5.154961e+09,2009-10-25,it,,0


In [24]:
df_ut.head()

Unnamed: 0,user_id,username,full_name,is_male,age,masked_bio,text,tweet_id,date,language,masked_tweet,has_mask
0,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,#amici22 non mi sono mai dispiaciuta come stas...,1.639739e+18,2023-03-25,it,,0
1,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,#amici22 auguro a Piccolo G una grande carriera,1.639739e+18,2023-03-25,it,,0
2,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,#gfvip quest'anno è la più brutta edizione di ...,1.637951e+18,2023-03-20,it,,0
3,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,Che vergogna continua a dire che non ci crede ...,1.637951e+18,2023-03-20,it,,0
4,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,"#gfvip Giaele, Oriana e Micol",1.637952e+18,2023-03-20,it,,0


# Concat all tweets together

In [1]:
import pandas as pd

#df = pd.read_pickle('/g100_work/IscrC_mental/data/user_classification/data_for_models_train.pkl')
#df = pd.read_pickle('/g100_work/IscrC_mental/data/user_classification/data_for_models_test.pkl')
df = pd.read_pickle('../data/user_classification/data_for_models_test.pkl')


In [2]:
df.head()

Unnamed: 0,user_id,username,full_name,is_male,age,masked_bio,text,tweet_id,date,language,masked_tweet,has_mask
0,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,#amici22 non mi sono mai dispiaciuta come stas...,1.639739e+18,2023-03-25,it,,0
1,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,#amici22 auguro a Piccolo G una grande carriera,1.639739e+18,2023-03-25,it,,0
2,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,#gfvip quest'anno è la più brutta edizione di ...,1.637951e+18,2023-03-20,it,,0
3,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,Che vergogna continua a dire che non ci crede ...,1.637951e+18,2023-03-20,it,,0
4,1604232779406909442,Manu5volley,emanuela albano,False,47.0,,"#gfvip Giaele, Oriana e Micol",1.637952e+18,2023-03-20,it,,0


In [15]:
def transform_df(df, N=100):
    # df bio
    df_bio = df[['user_id', 'masked_bio', 'age', 'is_male']].fillna('').drop_duplicates()
    
    # process tweets
    df_text = df[['user_id', 'text', 'masked_tweet', 'has_mask', 'date', 'age', 'is_male']]
    
    #  if has mask, use the masked_version
    df_text['text_masked'] = df_text.apply(lambda x: x['text'] if x['has_mask'] == 0 else x['masked_tweet'], axis=1)
    
    # keep the N most recent text_masked
    df_text['rank'] = df_text.groupby('user_id')['date'].rank(method='first', ascending=False)
    
    # Filter out entries with rank greater than N
    df_text = df_text[df_text['rank'] <= N]
    
    df_text_grouped = df_text.groupby('user_id')['text_masked'].agg(lambda x: '\n'.join(x)).reset_index()
    df_text_grouped = df_text_grouped.rename(columns={'text_masked': 'long_text'})
    
    # Merge df_bio with df_text_grouped
    result_df = pd.merge(df_bio, df_text_grouped, on='user_id', how='inner')

    return result_df

dft = transform_df(df, N=100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['text_masked'] = df_text.apply(lambda x: x['text'] if x['has_mask'] == 0 else x['masked_tweet'], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['rank'] = df_text.groupby('user_id')['date'].rank(method='first', ascending=False)


In [17]:
# find duplicates in dft
dft[dft.duplicated(subset=['user_id'], keep=False)].sort_values(by=['user_id'])

Unnamed: 0,user_id,masked_bio,age,is_male,long_text
138,930407131521736705,Ciao da Giuseppe Spinoso ho anni Mi piacerebb...,49.0,True,Salve signor Silvio Berlusconi mi chiamo Giuse...
139,930407131521736705,Ciao da Giuseppe Spinoso ho 49anni Mi piacereb...,49.0,True,Salve signor Silvio Berlusconi mi chiamo Giuse...


In [21]:
for i in df[df["user_id"] == 930407131521736705].masked_bio:
    print(i)

Ciao da Giuseppe Spinoso ho  anni Mi piacerebbe molto lavorare e finire di Lavorare alle Spalle ho 33.anni di lavoro sé l'è gette il mio profilo grazie mille
Ciao da Giuseppe Spinoso ho  anni Mi piacerebbe molto lavorare e finire di Lavorare alle Spalle ho 33.anni di lavoro sé l'è gette il mio profilo grazie mille
Ciao da Giuseppe Spinoso ho  anni Mi piacerebbe molto lavorare e finire di Lavorare alle Spalle ho 33.anni di lavoro sé l'è gette il mio profilo grazie mille
Ciao da Giuseppe Spinoso ho 49anni Mi piacerebbe molto lavorare e finire di Lavorare alle Spalle ho 33.anni di lavoro sé l'è gette il mio profilo grazie mille
Ciao da Giuseppe Spinoso ho 49anni Mi piacerebbe molto lavorare e finire di Lavorare alle Spalle ho 33.anni di lavoro sé l'è gette il mio profilo grazie mille
Ciao da Giuseppe Spinoso ho 49anni Mi piacerebbe molto lavorare e finire di Lavorare alle Spalle ho 33.anni di lavoro sé l'è gette il mio profilo grazie mille
Ciao da Giuseppe Spinoso ho  anni Mi piacerebbe m