In [1]:
import os 
import pandas as pd
import sqlite3  as sql
import numpy as np
import re
import json
import time
from tqdm import tqdm
from joblib import Parallel, delayed
import psutil

In [54]:
# Building blocks for the Italian number words of the ages 99 down to 13.
_AGE_TENS = ['novanta', 'ottanta', 'settanta', 'sessanta',
             'cinquanta', 'quaranta', 'trenta', 'venti']
# The empty unit yields the round ten itself (e.g. 'novanta').
_AGE_UNITS = ['nove', 'otto', 'sette', 'sei', 'cinque', 'quattro', 'tre', 'due', 'uno', '']
_AGE_TEENS = ['diciannove', 'diciotto', 'diciassette', 'sedici',
              'quindici', 'quattordici', 'tredici']

# Ages spelled out in words, in descending order (99 -> 13).
# Before 'uno' and 'otto' the ten drops its final vowel
# ('venti' + 'uno' -> 'ventuno', 'trenta' + 'otto' -> 'trentotto').
AGE_CHAR = [
    (tens[:-1] if unit in ('uno', 'otto') else tens) + unit
    for tens in _AGE_TENS
    for unit in _AGE_UNITS
] + _AGE_TEENS

# Drop the last letter of each AGE_CHAR entry, so that one stem matches both
# the noun ('ventiquattro') and the adjective ('ventiquattrenne').
AGE_CHAR_SUFFIX_LONG = [word[:-1] for word in AGE_CHAR]

# Shortest stems only, used as a cheap first filter before the detailed search.
AGE_CHAR_SUFFIX_SHORT = [
    'tredic', 'quattordic', 'quindic', 'sedic',
    'diciasset', 'diciott', 'diciannov',
    'vent', 'trent', 'quarant', 'cinquant',
    'sessant', 'settant', 'ottant', 'novant',
]

# Numeric ages aligned index-by-index with AGE_CHAR (99 -> 13).
AGE_DIGIT = list(reversed(range(13, 100)))
# Regex patterns for texts in which the author states their own age in digits
# (e.g. '22' for 22). Every pattern captures the two-digit age as group 1.
# The patterns are assembled from the shared fragments below.
_AGE_2D = r'(\d{2})'
# Reject "quando ho ..." (when I ...), with or without the separating space.
_NOT_AFTER_QUANDO = r'(?<!quando\s)(?<!quando)'
# Reject "non ho ..." (I am not ...) and "se ho ..." (if I am ...).
_NOT_AFTER_NON_OR_SE = r'(?<!non\s)(?<!non)(?<!se\s)(?<!se)'
# Reject "... anni di/de ..." (years OF something).
_NOT_BEFORE_DE_DI = r'(?! de)(?! di)(?!de)(?!di)'
# Reject "... anni in più / in meno" (years more / less).
_NOT_BEFORE_PIU_MENO = r'(?! in più)(?! in meno)'

AGE_DIGIT_PATTERNS = [
    # 'ho compiuto 22 anni' (I just turned 22)
    _NOT_AFTER_QUANDO + r'ho\s*compiuto\s*' + _AGE_2D + r'\s*anni'
    + _NOT_BEFORE_DE_DI + _NOT_BEFORE_PIU_MENO,
    # 'compio 22 anni' (I turn 22)
    r'\bcompio\s*' + _AGE_2D + r'\s*anni' + _NOT_BEFORE_DE_DI,
    # 'ho 22 anni' (I am 22 years old)
    _NOT_AFTER_QUANDO + _NOT_AFTER_NON_OR_SE + r'ho\s*' + _AGE_2D + r'\s*anni'
    + r'(?! de)(?!de)(?! di)(?!di)' + _NOT_BEFORE_PIU_MENO,
    # 'faccio 22 anni' (I am turning 22)
    r'\bfaccio\s*' + _AGE_2D + r'\s*anni' + _NOT_BEFORE_DE_DI,
    # 'spengo 22 candeline' (I am blowing out 22 candles)
    r'\bspengo\s*' + _AGE_2D + r'\s*candeline',
    # 'il mio 22^ compleanno' (my 22nd birthday)
    r'il\s*mio\s*' + _AGE_2D + r'\^\s*comple(?:anno)?',
    # 'sono un 22enne' (I am a 22-year-old)
    r'\bsono\s*una?\s*' + _AGE_2D + r'\s*enne',
    # Disabled: 'i miei 22 anni' (my 22 years)
    # r'\bmiei\s*(\d{2})\s*anni',
]

# Captured year: 2000-2019, 1900-1999, or a two-digit year preceded by exactly
# one non-digit character (as in "'93") and followed by whitespace or the end
# of the text.
_BIRTH_YEAR_GROUP = r"(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)"

# Regex patterns for bios stating a year of birth; each one is a lead-in
# phrase immediately followed by the shared year group.
YEAR_OF_BIRTH_PATTERNS_BIO = [
    lead_in + _BIRTH_YEAR_GROUP
    for lead_in in (
        # "sono nato/nata nel 1993/93/'93" (I was born in 1993)
        r"\bsono\s*nato\s*nel\s*",
        r"\bsono\s*nata\s*nel\s*",
        r"\bnato\s*nel\s*",
        r"\bnata\s*nel\s*",
        r"\bborn\s*in\s*",
        # "sono del 1993/93/'93" (I am from 1993)
        r"sono\s*del\s*",
        # "sono un/una 1993/93/'93" (I am a 1993)
        r"sono\s*una?\s*",
        # "sono della generazione 1993/93/'93" (I am generation 1993)
        r"sono\s*della\s*generazione\s*",
        r"\bgenerazione\s*",
        # "sono classe 1993/93/'93" (I am class 1993)
        r"sono\s*classe\s*",
        r"sono\s*una?\s*classe\s*",
        r"\bclasse\s*",
    )
]

In [55]:
def return_full_age_char_pattern(age_char):
    """
    Build the regex patterns that detect a self-declared age written in words.

    Parameters
    ----------
    age_char : str
        Italian number word (or word stem) for the age, e.g. "ventidue" for 22.
        It is inserted verbatim into every pattern.

    Returns
    -------
    list of str
        Seven regex patterns, always in the same order.
    """
    templates = (
        # "ho compiuto ventidue anni" (I just turned twenty-two),
        # but not "quando ho compiuto ..." (when I turned ...)
        # nor "... anni di/de" (years of) / "... anni in più / in meno"
        r"(?<!quando\s)(?<!quando)ho\s*compiuto\s*({}).*\s*anni(?! de)(?!de)(?! di)(?!di)(?! in più)(?! in meno)",
        # "compio ventidue anni" (I turn twenty-two)
        r"\bcompio\s*({}).*\s*anni(?! de)(?! di)(?!de)(?!di)",
        # "ho ventidue anni" (I am twenty-two years old),
        # but not after "quando" (when), "non" (not) or "se" (if),
        # nor "... anni di/de" (years of) / "... anni in più / in meno"
        r"(?<!quando\s)(?<!quando)(?<!non\s)(?<!non)(?<!se\s)(?<!se)ho\s*({}).*\s*anni(?! de)(?! di)(?!de)(?!di)(?! in più)(?! in meno)",
        # "faccio ventidue anni" (I am turning twenty-two)
        r"\bfaccio\s*({}).*\s*anni(?! de)(?! di)(?!de)(?!di)",
        # "spengo ventidue candeline" (I am blowing out twenty-two candles)
        r"\bspengo\s*({})\s*candeline",
        # "il mio ventiduesimo comple/compleanno" (my twenty-second birthday)
        r"il\s*mio\s*{}e?simo\s*comple(?:anno)?",
        # "sono un ventiduenne" (I am a twenty-two-year-old)
        r"\bsono\s*una?\s*({})\s*e?nne",
        # Disabled: "i miei ventidue anni" (my twenty-two years)
        # r"\bmiei\s*({}).*\s*anni",
    )
    return [template.format(age_char) for template in templates]

In [56]:
def bio_user_age(text, user_id= None):
    """
    Extract a self-declared age (or year of birth) from a user bio.

    Three pattern families are tried in this order and the first hit wins:

    1. AGE_DIGIT_PATTERNS          -> regex_type "bio_age_digit"
    2. YEAR_OF_BIRTH_PATTERNS_BIO  -> regex_type "bio_birth_year"
    3. return_full_age_char_pattern(...) -> regex_type "bio_age_chars"

    Families 1 and 2 are only tried when the bio contains a double-digit number
    that is not inside a quotation; family 3 only when the matched number word
    is not inside a quotation.

    Parameters
    ----------
    text : str
        The bio text.
    user_id : optional
        Identifier of the user; stored in the result as ``str(user_id)``.

    Returns
    -------
    dict or None
        On a match: ``{'user_id', 'bio', 'status': 'processed', 'regex_type',
        'regex_idx', 'age'}``, where ``regex_idx`` is the index of the matching
        pattern within its family. NOTE: for "bio_birth_year" the ``age`` field
        holds the four-digit YEAR OF BIRTH, not an age.
        If any exception is raised while processing:
        ``{'user_id', 'bio', 'status': 'bio_raised_exception'}``.
        ``None`` when nothing matches.

    TODO: the age returned by this function should be compared with the creation date of the bio.
    """
    try:
        if len(text) > 0:
            # check if the bio contains a double digit number, but not in a quoted text
            if re.search(r"\d{2}", text):
                # NOTE(review): the straight-double-quote check used to appear twice in a
                # row; one copy was presumably meant for typographic quotes -- TODO confirm.
                if not re.search(r"\".*\d{2}.*\"", text) \
                    and not re.search(r"\«.*\d{2}.*\»", text):

                    # search for age patterns
                    for i, pattern in enumerate(AGE_DIGIT_PATTERNS):
                        matches = re.findall(pattern, text, flags=re.IGNORECASE)
                        if matches:
                            return {'user_id': str(user_id),  "bio": text, 'status': 'processed',
                                    "regex_type": "bio_age_digit", "regex_idx": i, "age": int(matches[0])}

                    # search for year of birth patterns
                    # (the list defined in this notebook is YEAR_OF_BIRTH_PATTERNS_BIO)
                    for i, pattern in enumerate(YEAR_OF_BIRTH_PATTERNS_BIO):
                        matches = re.findall(pattern, text, flags=re.IGNORECASE)
                        if matches:
                            # strip the apostrophe / whitespace captured around a two-digit year
                            birth_year = re.sub('[^0-9]','', matches[0])
                            # if only a double digit year is retrieved, then attach 19 or 20 to it
                            if len(birth_year) == 2:
                                if int(birth_year) < 20:
                                    birth_year = "20" + birth_year
                                else:
                                    birth_year = "19" + birth_year
                            return {'user_id': str(user_id),  "bio": text, 'status': 'processed',
                                    "regex_type": "bio_birth_year", "regex_idx": i, "age": int(birth_year)}

            # check if the bio contains an age expressed in characters (cheap first filter)
            if re.search("|".join(AGE_CHAR_SUFFIX_SHORT), text, flags=re.IGNORECASE):
                # check what age is expressed in the bio and retrieve its index;
                # a short stem can match without any long stem matching, so guard the lookup
                long_matches = re.findall("|".join(AGE_CHAR_SUFFIX_LONG), text, flags=re.IGNORECASE)
                if long_matches:
                    matching_age_char = long_matches[0].lower()
                    matching_age_char_index = AGE_CHAR_SUFFIX_LONG.index(matching_age_char)
                    # check if the age is not in a quoted text
                    if not re.search(r"\".*{}.*\"".format(matching_age_char), text, flags=re.IGNORECASE) \
                        and not re.search(r"\«.*{}.*\»".format(matching_age_char), text, flags=re.IGNORECASE):
                        # use the full word when it is present in the text, the stem otherwise
                        if re.search(AGE_CHAR[matching_age_char_index], text, flags=re.IGNORECASE):
                            patterns = return_full_age_char_pattern(AGE_CHAR[matching_age_char_index])
                        else:
                            patterns = return_full_age_char_pattern(AGE_CHAR_SUFFIX_LONG[matching_age_char_index])
                        # search for age statements and retrieve age
                        for i, pattern in enumerate(patterns):
                            matches = re.findall(pattern, text, flags=re.IGNORECASE)
                            if matches:
                                return {'user_id': str(user_id),  "bio": text, 'status': 'processed',
                                        "regex_type": "bio_age_chars", "regex_idx": i, "age": int(AGE_DIGIT[matching_age_char_index])}
    except Exception:
        # Best effort: a bio that cannot be processed is flagged instead of aborting the run.
        return {'user_id': str(user_id),  "bio": text, 'status': 'bio_raised_exception'}

    return None


### Process the database

In [60]:
# Root of the project workspace.
work_dir = '/g100_work/IscrC_mental'

# Data folder and the sub-folder holding the user-classification outputs.
wdata_dir = os.path.join(work_dir, 'data')
uc_dir = os.path.join(wdata_dir, 'user_classification')

# Open the SQLite database and keep a cursor for the queries below.
dbase_path = os.path.join(wdata_dir, 'database', 'MENTALISM.db')
connection = sql.connect(dbase_path)
cursor = connection.cursor()

In [61]:
# Ask SQLite's catalogue for the names of all tables in the database.
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

# Each fetched row holds a single column (the table name); print one per line.
for (table_name,) in tables:
    print(table_name)

tweets
user
user_geocoded_old
user_geocoded


In [62]:
def process_subchunk(subchunk):
    """
    Run bio_user_age on every row of `subchunk` whose 'bio' value is not None.

    Parameters
    ----------
    subchunk : DataFrame
        Rows providing the 'user_id' and 'bio' columns.

    Returns
    -------
    list
        One entry per processed row, in row order; each entry is whatever
        bio_user_age returned for that row (which may be None).
    """
    return [
        bio_user_age(user_id=record['user_id'], text=record['bio'])
        for _, record in subchunk.iterrows()
        if record['bio'] is not None
    ]

In [63]:
num_workers = 16

# Each chunk read from the database holds 10,000 rows per worker.
chunk_size = 10000 * num_workers

# Number of chunks (ceiling division), used only for the progress bar.
total_rows = pd.read_sql('SELECT COUNT(*) FROM user', connection).iloc[0, 0]
num_chunks = (total_rows // chunk_size) + (1 if total_rows % chunk_size else 0)

all_results = []
# Only the two columns read by process_subchunk are fetched, which keeps the
# chunks (and the data pickled to the workers) small.
chunks = pd.read_sql('SELECT user_id, bio FROM user', connection, chunksize=chunk_size)

for chunk in tqdm(chunks, total=num_chunks):
    # An empty chunk would give zero splits, which np.array_split rejects.
    if chunk.empty:
        continue

    # Split the chunk into (at most) one contiguous part per worker.
    # The row positions are split rather than the DataFrame itself, because
    # np.array_split applied directly to a DataFrame is deprecated in recent pandas.
    num_splits = min(num_workers, len(chunk))
    subchunks = [
        chunk.iloc[positions]
        for positions in np.array_split(np.arange(len(chunk)), num_splits)
    ]

    # Process each part in parallel
    results = Parallel(n_jobs=num_workers)(delayed(process_subchunk)(subchunk) for subchunk in subchunks)

    # Flatten the per-worker lists and append to all_results
    flat_results = [item for sublist in results for item in sublist]
    all_results.extend(flat_results)
    


100%|██████████| 14/14 [02:15<00:00,  9.69s/it]


In [64]:
# Collect the non-empty results into a DataFrame, keep only the rows that were
# processed successfully, and expose the extracted value as 'age_raw'.
filtered_data = [item for item in all_results if item is not None]
df_bioage = (
    pd.DataFrame(filtered_data)
    .loc[lambda frame: frame['status'] == 'processed']
    .rename(columns={'age': 'age_raw'})
)

In [81]:
# Size of the result table.
print(df_bioage.shape)

# Frequency of the 30 largest raw values, in ascending order of value.
age_raw_counts = df_bioage.age_raw.value_counts().sort_index()
print(age_raw_counts.tail(30))

# Eyeball up to 30 bios for one specific raw value.
# Alternative selection by pattern instead of value:
#for i in df_bioage[(df_bioage['regex_idx']==0.0) & (df_bioage['regex_type']=='bio_birth_year')].bio[:30]:
for bio in df_bioage.loc[df_bioage['age_raw'] == 10, 'bio'].head(30):
    print(bio)
    print('----------')

(2652, 6)
1990    34
1991    30
1992    30
1993    16
1994    22
1995    24
1996    24
1997    24
1998    26
1999    26
2000    15
2001    14
2002    16
2003     7
2004    10
2005     3
2006     8
2007    14
2008     7
2009    12
2010    10
2011    12
2012    11
2013    14
2014     8
2015     7
2016     8
2017     5
2018     4
2019     4
Name: age_raw, dtype: int64
ho 10 anni e mi piace fare le foto anche se la mia passione e cantare
----------
Io sono nir abito a .... faccio basket i miei idoli sono benji e fede ho 10 anni seguitemi sui instagram
INSTAGRAM - MINNIE_NIR_2006💕
----------
Mi chiamo Alessandra sono napoletana ho 10 anni e vado in 1 media ,abito a Carpi e sono una Youtubers e mi chiamo Alessandra Channel iscrivetevi ...A presto....
----------
ho 10 anni e sono nato il 27 01
----------
mi piace molto il calcio tifo napoli il mio idolo e lavezzi mi piace la pizza e ho 10 anni e sono bello
----------
ho 10 anni
----------
Sono una bambina ho 10anni sono dolcissim a ma è un a 

In [83]:
# Keep only raw values between 10 and 2007 (both bounds included).
# age_raw mixes plain ages with four-digit birth years, hence the wide range.
df_bioage_clean = df_bioage[df_bioage['age_raw'].between(10, 2007)]

In [84]:
# append to tweet-age data
#path = os.path.join(uc_dir, 'user_age.pkl')
#df_tweetage = pd.read_pickle(path)

#df_all = df_tweetage.append(df_bioage_clean)

# save
#path = os.path.join(uc_dir, 'user_age_allsources.pkl')
#df_all.to_pickle(path)

In [85]:
# Persist the cleaned bio-derived values in the user-classification folder.
path = os.path.join(uc_dir, 'user_age_bio.pkl')
pd.to_pickle(df_bioage_clean, path)