In [1]:
import os 
import pandas as pd
import re

In [2]:
work_dir = '/g100_work/IscrC_mental'
wdata_dir = os.path.join(work_dir, 'data')
uc_dir = os.path.join(wdata_dir, 'user_classification')

### Maskig age/birthdate in bio 

In [3]:
AGE_CHAR = [
    'novantanove',
    'novantotto',
    'novantasette',
    'novantasei',
    'novantacinque',
    'novantaquattro',
    'novantatre',
    'novantadue',
    'novantuno',
    'novanta',
    'ottantanove',
    'ottantotto',
    'ottantasette',
    'ottantasei',
    'ottantacinque',
    'ottantaquattro',
    'ottantatre',
    'ottantadue',
    'ottantuno',
    'ottanta',
    'settantanove',
    'settantotto',
    'settantasette',
    'settantasei',
    'settantacinque',
    'settantaquattro',
    'settantatre',
    'settantadue',
    'settantuno',
    'settanta',
    'sessantanove',
    'sessantotto',
    'sessantasette',
    'sessantasei',
    'sessantacinque',
    'sessantaquattro',
    'sessantatre',
    'sessantadue',
    'sessantuno',
    'sessanta',
    'cinquantanove',
    'cinquantotto',
    'cinquantasette',
    'cinquantasei',
    'cinquantacinque',
    'cinquantaquattro',
    'cinquantatre',
    'cinquantadue',
    'cinquantuno',
    'cinquanta',
    'quarantanove',
    'quarantotto',
    'quarantasette',
    'quarantasei',
    'quarantacinque',
    'quarantaquattro',
    'quarantatre',
    'quarantadue',
    'quarantuno',
    'quaranta',
    'trentanove',
    'trentotto',
    'trentasette',
    'trentasei',
    'trentacinque',
    'trentaquattro',
    'trentatre',
    'trentadue',
    'trentuno',
    'trenta',
    'ventinove',
    'ventotto',
    'ventisette',
    'ventisei',
    'venticinque',
    'ventiquattro',
    'ventitre',
    'ventidue',
    'ventuno',
    'venti',
    'diciannove',
    'diciotto',
    'diciassette',
    'sedici',
    'quindici',
    'quattordici',
    'tredici'
 ]

# remove last letter of each years_in_words entry, in order to match both
# the noun ("ventiquattro") and the adjective ("ventiquattrenne")
AGE_CHAR_SUFFIX_LONG = [year[:-1] for year in AGE_CHAR]

# keep only the shortest form as a first filter
AGE_CHAR_SUFFIX_SHORT = [
    "tredic",
    "quattordic",
    "quindic",
    "sedic",
    "diciasset",
    "diciott",
    "diciannov",
    "vent",
    "trent",
    "quarant",
    "cinquant",
    "sessant",
    "settant",
    "ottant",
    "novant",
]

AGE_DIGIT = list(range(99,12,-1))

# List of regex patterns for matching Twitter posts mentioning the age of the user
# The patterns are built using the age expressed in digits (e.g. "22" for 22)
AGE_DIGIT_PATTERNS = [
    # Matches phrases like "ho compiuto 22 anni" (I just turned 22)
    # but not "quando ho compiuto 22 anni" (when I turned 22)
    # nor "ho compiuto 22 anni di/de" (I have 22 years of)
    r"(?<!quando\s)(?<!quando)ho\s*compiuto\s*(\d{2})\s*anni(?! su)(?! più)(?! da)(?! de)(?! di)(?!de)(?!di)(?!su)(?!più)(?!da)(?! in più)(?! in meno)",
    r"\bcompio\s*(\d{2})\s*anni(?! su)(?! più)(?! da)(?! de)(?! di)(?!de)(?!di)(?!su)(?!più)(?!da)(?! in più)(?! in meno)",
    # Matches phrases like "ho 22 anni" (I am 22 years old)
    # but not "da quando/non ho 22 anni" (since I am / I am not 22 years old)
    # nor "ho 22 anni di/de" (I have 22 years of)
    # nor "se ho 22 anni" (if I am 22 years old)
    r"(?<!quando\s)(?<!quando)(?<!non\s)(?<!non)(?<!se\s)(?<!se)ho\s*(\d{2})\s*anni(?! su)(?! più)(?! da)(?! de)(?! di)(?!de)(?!di)(?!su)(?!più)(?!da)(?! in più)(?! in meno)",
    # Matches phrases like "faccio 22 anni" (I am turning 22 years old)
    # but not "faccio 22 anni di/de" (I have 22 years of)
    r"\bfaccio\s*(\d{2})\s*anni(?! che)(?! su)(?! più)(?! da)(?! de)(?! di)(?!de)(?!di)(?!su)(?!più)(?!da)(?! in più)(?! in meno)(?!che)",
    # Matches phrases like "spengo 22 candeline" (I am blowing 22 candles)
    r"\bspengo\s*(\d{2})\s*candeline",
    # Matches phrases like "il mio 22^ compleanno" (my 22nd birthday)
    r"il\s*mio\s*(\d{2})\^\s*comple(?:anno)?",
    # Matches phrases like "sono un 22enne" (I am a 22-year-old...)
    r"\bsono\s*una?\s*(\d{2})\s*enne",
    # Matches phrases like "i miei 22 anni" (my 22 years)
    # r"\bmiei\s*(\d{2})\s*anni",
]

YEAR_OF_BIRTH_PATTERNS = [
    # Matches sentences like "sono nato nel 1993/93/'93" (I was born in 1993)
    r"\bsono\s*nato\s*nel\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    r"\bsono\s*nata\s*nel\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    # Matches sentences like "sono del 1993/93/'93" (I am from 1993)
    # r"sono\s*del\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    # Matches sentences like "sono un 1993/93/'93" (I am a 1993)
    # r"sono\s*una?\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    # Matches sentences like "sono della generazione 1993/93/'93" (I am generation 1993)
    r"sono\s*della\s*generazione\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    # Matches sentences like "sono classe 1993/93/'93" (I am class 1993)
    r"sono\s*classe\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    r"sono\s*una?\s*classe\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
]

YEAR_OF_BIRTH_PATTERNS_BIO = [
    # Matches sentences like "sono nato nel 1993/93/'93" (I was born in 1993)
    r"\bsono\s*nato\s*nel\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    r"\bsono\s*nata\s*nel\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    r"\bnato\s*nel\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    r"\bnata\s*nel\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    r"\bborn\s*in\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    # Matches sentences like "sono del 1993/93/'93" (I am from 1993)
    r"sono\s*del\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    # Matches sentences like "sono un 1993/93/'93" (I am a 1993)
    r"sono\s*una?\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    # Matches sentences like "sono della generazione 1993/93/'93" (I am generation 1993)
    r"sono\s*della\s*generazione\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    r"\bgenerazione\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    # Matches sentences like "sono classe 1993/93/'93" (I am class 1993)
    r"sono\s*classe\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    r"sono\s*una?\s*classe\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
    r"\bclasse\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
]

def return_full_age_char_pattern(age_char):
    """
    Returns a list of regex patterns for matching Twitter posts mentioning the age of the user.
    The patterns are built using the age_char parameter, which is a string containing the
    Italian word for the age of the user (e.g. "ventidue" for 22).
    """
    age_char_patterns = [
            # Matches phrases like "ho compiuto ventidue anni" (I just turned twenty-two)
            # but not "quando ho compiuto ventidue anni" (when I turned twenty-two)
            # nor "ho compiuto ventidue anni di/de" (I have twenty-two years of)
            r"(?<!quando\s)(?<!quando)ho\s*compiuto\s*({}).*\s*anni(?! de)(?!de)(?! di)(?!di)(?! in più)(?! in meno)".format(age_char),
            r"\bcompio\s*({}).*\s*anni(?! de)(?! di)(?!de)(?!di)".format(age_char),
            # Matches phrases like "ho ventidue anni" (I am twenty-two years old),
            # but not "a quando/non ho ventidue anni" (since I am / I am not twenty-two years old)
            # nor "ho ventidue anni di/de" (I have twenty-two years of)
            # nor "se ho ventidue anni" (if I am twenty-two years old)
            r"(?<!quando\s)(?<!quando)(?<!non\s)(?<!non)(?<!se\s)(?<!se)ho\s*({}).*\s*anni(?! de)(?! di)(?!de)(?!di)(?! in più)(?! in meno)".format(age_char),
            # Matches phrases like "faccio ventidue anni" (I am turning twenty-two years old)
            r"\bfaccio\s*({}).*\s*anni(?! de)(?! di)(?!de)(?!di)".format(age_char),
            # Matches phrases like "spengo ventidue candeline" (I am blowing twenty-two candles)
            r"\bspengo\s*({})\s*candeline".format(age_char),
            # Matches phrases like "mio ventiduesimo comple/compleanno" (my twenty-second birthday)
            r"il\s*mio\s*{}e?simo\s*comple(?:anno)?".format(age_char),
            # Matches phrases like "sono un ventiduenne" (I am twenty-two-years-old...)
            r"\bsono\s*una?\s*({})\s*e?nne".format(age_char),
            # Matches phrases like "i miei ventidue anni" (my twenty-two years)
            # r"\bmiei\s*({}).*\s*anni".format(age_char),
        ]
    return age_char_patterns

def remove_age_pattern(text):

    # search for year of birth patterns
    for pattern in YEAR_OF_BIRTH_PATTERNS:
        pattern = "(" + pattern + ")"
        newtext = re.sub(pattern, "", text, flags=re.IGNORECASE)
        if newtext != text:
            return newtext

    # search for year of birth patterns
    for pattern in AGE_DIGIT_PATTERNS:
        pattern = "(" + pattern + ")"
        newtext = re.sub(pattern, "", text, flags=re.IGNORECASE)
        if newtext != text:
            return newtext

    # check if the text contains an age expressed in characters
    if re.search(r"{}".format("|".join(AGE_CHAR_SUFFIX_SHORT)), text, flags=re.IGNORECASE):
        # check what age is expressed in the tweet and retrieve its index
        matching_age_char = re.findall(r"{}".format("|".join(AGE_CHAR_SUFFIX_LONG)), text, flags=re.IGNORECASE)[0].lower()
        matching_age_char_index = AGE_CHAR_SUFFIX_LONG.index(matching_age_char)
        # check if the age is not in a quoted text
        if not re.search(r"\".*{}.*\"".format(matching_age_char), text, flags=re.IGNORECASE) \
            and not re.search(r"\“.*{}.*\”".format(matching_age_char), text, flags=re.IGNORECASE) \
            and not re.search(r"\«.*{}.*\»".format(matching_age_char), text, flags=re.IGNORECASE):
            # check if also the full form of the age is present in the text
            if re.search(r"{}".format(AGE_CHAR[matching_age_char_index]), text, flags=re.IGNORECASE):
                patterns = return_full_age_char_pattern(AGE_CHAR[matching_age_char_index])
            else:
                patterns = return_full_age_char_pattern(AGE_CHAR_SUFFIX_LONG[matching_age_char_index])
            # search for age statements and retrieve age
            for pattern in patterns:
                pattern = "(" + pattern + ")"
                newtext = re.sub(pattern, "", text, flags=re.IGNORECASE)
                if newtext != text:
                    return newtext

    # search for year of birth patterns
    for pattern in YEAR_OF_BIRTH_PATTERNS_BIO:
        pattern = "(" + pattern + ")"
        newtext = re.sub(pattern, "", text , flags=re.IGNORECASE)
        if newtext != text:
            return newtext

    return text

In [4]:
# read users data
path_train  = os.path.join(uc_dir, 'user_age_gender_location_train_set.pkl')
path_test  = os.path.join(uc_dir, 'user_age_gender_location_test_set.pkl')
df_agl_train = pd.read_pickle(path_train)
df_agl_test = pd.read_pickle(path_test)


In [5]:
# Create a mask for entries where 'regex_type' starts with 'bio'
df_agl_train['bio_mask'] = df_agl_train['regex_type'].str.startswith('bio').astype(int)
df_agl_test['bio_mask'] = df_agl_test['regex_type'].str.startswith('bio').astype(int)

# Sort by user_id and the bio_mask to ensure bio entries come first
df_agl_train = df_agl_train.sort_values(by=['user_id', 'bio_mask'], ascending=[True, False])
df_agl_test = df_agl_test.sort_values(by=['user_id', 'bio_mask'], ascending=[True, False])

# Drop duplicates based on user_id, keeping only the first occurrence
df_agl_train = df_agl_train.drop_duplicates(subset='user_id', keep='first')
df_agl_test = df_agl_test.drop_duplicates(subset='user_id', keep='first')

# Drop the bio_mask column, as it's no longer needed
df_agl_train.drop('bio_mask', axis=1, inplace=True)
df_agl_test.drop('bio_mask', axis=1, inplace=True)


In [6]:
for df in (df_agl_train, df_agl_test):
    
    df['masked_bio'] = df['bio'].fillna('').apply(lambda x: remove_age_pattern(x) )
    df['masked_tweet'] = df['tweet'].fillna('').apply(lambda x: remove_age_pattern(x))

In [7]:
# for df_agl in (df_agl_train, df_agl_test):
#     # Iterate over each row and apply the mask_bio function
#     df_agl['masked_bio'] = df_agl['bio']
#     df_agl['masked_tweet'] = df_agl['tweet']

#     for idx, row in df_agl.iterrows():
#         if row['regex_type'].startswith('bio'):
#             df_agl.at[idx, 'masked_bio'] = mask_bio(row['bio'], row['regex_type'], row['age_raw'])

#         else: 
#             df_agl.at[idx, 'masked_tweet'] = mask_bio(row['tweet'], row['regex_type'], row['age_raw'])

In [8]:
# save
path_train  = os.path.join(uc_dir, 'user_age_gender_location_train_set_masked.pkl')
path_test  = os.path.join(uc_dir, 'user_age_gender_location_test_set_masked.pkl')
df_agl_train.to_pickle(path_train)
df_agl_test.to_pickle(path_test)

print(df_agl_train.shape)
print(df_agl_test.shape)

(19200, 40)
(1120, 40)


### Create Training Data

In [9]:
# load tweets
path = os.path.join(uc_dir, 'tweets_by_user_id_clean.pkl')
df_twt = pd.read_pickle(path)

print(df_twt.shape)

(29304512, 8)


In [10]:
# create date column
df_twt['date'] = pd.to_datetime(df_twt['created_at'].str.slice(0,10))

# last_tweet year for each user
df_last = df_twt[['user_id', 'date']].sort_values(by=['user_id', 'date'], ascending=False)
df_last = df_last.drop_duplicates(subset='user_id', keep='first')
df_last.columns = ['user_id', 'last_tweet']
df_last['last_tweet'] = df_last['last_tweet'].dt.year

# discard retweets
df_twt = df_twt[df_twt['RT']==False]
# keep only it tweets:
# df_twt = df_twt[df_twt['language']=='it']

# keep selected columns
df_twt = df_twt[[ 'user_id', 'text', 'tweet_id', 'date', 'language']]

print(df_twt.shape)

(22315330, 5)


In [12]:
# load users masked data
path_train  = os.path.join(uc_dir, 'user_age_gender_location_train_set_masked.pkl')
path_test  = os.path.join(uc_dir, 'user_age_gender_location_test_set_masked.pkl')

df_um_train = pd.read_pickle(path_train)
df_um_test = pd.read_pickle(path_test)

print(df_um_train.shape)
print(df_um_test.shape)

# merge with last tweet data
df_um_train = df_um_train.merge(df_last, on='user_id', how='left')
df_um_test = df_um_test.merge(df_last, on='user_id', how='left')

print(df_um_train.shape)
print(df_um_test.shape)


(19200, 40)
(1120, 40)
(19200, 41)
(1120, 41)


In [20]:
for df_um in (df_um_train,df_um_test):
    # specify age based on the date of the last tweet
    df_um['age'] = 0

    for idx, row in df_um.iterrows():
        if row['regex_type'].startswith('bio'):
            if row['regex_type'].endswith('birth_year'): 
                df_um.at[idx, 'age'] = row['last_tweet'] - int(row['age_raw'])
            else: 
                df_um.at[idx, 'age'] =  int(row['age_raw']) # the raw age if mentioned in the bio
                
        # for tweets       
        else:
            df_um.at[idx, 'age'] = row['age_when_tweeted'] + (row['last_tweet']-row['year_tweet'])


In [28]:
df_um_train[~ df_um_train['regex_type'].str.startswith('bio')][['age_raw', 'age', 'last_tweet', 'year_tweet']]

Unnamed: 0,age_raw,age,last_tweet,year_tweet
0,52,55,2022,2019.0
1,45,49,2023,2019.0
2,27,28,2022,2021.0
3,32,39,2022,2015.0
4,40,42,2023,2021.0
...,...,...,...,...
19194,41,41,2023,2023.0
19195,22,22,2023,2023.0
19197,40,40,2023,2023.0
19198,63,63,2023,2023.0


In [24]:
print(df_um_train.shape)
print(df_um_train['age'].isna().sum())
print(df_um_test['age'].isna().sum())

(19200, 41)
0
0


In [29]:
def transform_df(df, N=1000):
    # df bio
    df_bio = df[df['masked_tweet'].notna()]
    df_bio = df_bio[['user_id', 'masked_bio']].fillna('').drop_duplicates()
    
    # process tweets
    df_text = df[['user_id', 'text', 'masked_tweet', 'has_mask', 'date']]
    
    #  if has mask, use the masked_version
    df_text['text_masked'] = df_text.apply(lambda x: x['text'] if x['has_mask'] == 0 else x['masked_tweet'], axis=1)
    
    # keen the N most recent text_masked
    df_text['rank'] = df_text.groupby('user_id')['date'].rank(method='first', ascending=False)
    
    # Filter out entries with rank greater than N
    df_text = df_text[df_text['rank'] <= N]
    
    df_text_grouped = df_text.groupby('user_id')['text_masked'].agg(lambda x: '\n'.join(x)).reset_index()
    df_text_grouped = df_text_grouped.rename(columns={'text_masked': 'long_text'})
    
    # Merge df_bio with df_text_grouped
    result_df = pd.merge(df_bio, df_text_grouped, on='user_id', how='right').fillna('')

    return result_df

In [30]:

# merge bio and tweets

# merge selected columns only
columns= ['user_id', 'username' ,'full_name', 'is_male', 'age' , 'masked_bio']


# for train #########################################################
df_um = df_um_train

# merge with tweets
df_ut = df_um[columns].merge(df_twt, on='user_id', how='inner')

# merge with masked tweets
df_ut = df_ut.merge(df_um[df_um['masked_tweet'].notna()][['tweet_id', 'masked_tweet']], 
                    on='tweet_id', how='left')

# flag tweets that are masked
df_ut['has_mask'] = df_ut['masked_tweet'].notna().astype(int)

# sort by date
df_ut = df_ut.sort_values(by=['user_id', 'date'], ascending=False).reset_index(drop=True)
print('Train shape:', df_ut.shape)


# transform tweets into a long text
df_utt = transform_df(df_ut)

df_uttr = df_um[['user_id', 'is_male', 'age']].merge(df_utt, on='user_id')

# save
path  = os.path.join(uc_dir, 'data_for_models_train.pkl')
df_uttr.to_pickle(path)



# for test #########################################################
df_um = df_um_test

# merge with tweets
df_ut = df_um[columns].merge(df_twt, on='user_id', how='inner')

# merge with masked tweets
df_ut = df_ut.merge(df_um[df_um['masked_tweet'].notna()][['tweet_id', 'masked_tweet']], 
                    on='tweet_id', how='left')

# flag tweets that are masked
df_ut['has_mask'] = df_ut['masked_tweet'].notna().astype(int)

# sort by date
df_ut = df_ut.sort_values(by=['user_id', 'date'], ascending=False).reset_index(drop=True)
print('Train shape:', df_ut.shape)

# transform tweets into a long text
df_utt = transform_df(df_ut)

df_utts = df_um[['user_id', 'is_male', 'age']].merge(df_utt, on='user_id')

# save
path  = os.path.join(uc_dir, 'data_for_models_test.pkl')
df_utts.to_pickle(path)


Train shape: (20784479, 12)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['text_masked'] = df_text.apply(lambda x: x['text'] if x['has_mask'] == 0 else x['masked_tweet'], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['rank'] = df_text.groupby('user_id')['date'].rank(method='first', ascending=False)


Train shape: (1266864, 12)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['text_masked'] = df_text.apply(lambda x: x['text'] if x['has_mask'] == 0 else x['masked_tweet'], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['rank'] = df_text.groupby('user_id')['date'].rank(method='first', ascending=False)
