In [1]:
import os 
import pandas as pd
import sqlite3  as sql
import numpy as np
import re
import json
import time
from tqdm import tqdm
from joblib import Parallel, delayed
import psutil

In [2]:
# Italian number words for the ages 99 down to 13, in descending order so that
# AGE_CHAR[i] spells out AGE_DIGIT[i]. The decades 90..20 are composed from a
# tens word plus a unit word; the ages 19..13 are irregular and listed by hand.
AGE_CHAR = [
    word
    for tens in ('novanta', 'ottanta', 'settanta', 'sessanta',
                 'cinquanta', 'quaranta', 'trenta', 'venti')
    for word in (
        # the tens word drops its final vowel before 'otto' and 'uno'
        # ('venti' + 'otto' -> 'ventotto', 'venti' + 'uno' -> 'ventuno')
        [(tens[:-1] if unit in ('otto', 'uno') else tens) + unit
         for unit in ('nove', 'otto', 'sette', 'sei', 'cinque',
                      'quattro', 'tre', 'due', 'uno')]
        # the round decade itself comes last within its group
        + [tens]
    )
] + [
    'diciannove', 'diciotto', 'diciassette', 'sedici',
    'quindici', 'quattordici', 'tredici',
]

# Every AGE_CHAR word without its final letter, so that one stem matches both
# the noun ('ventiquattro') and the adjective ('ventiquattrenne')
AGE_CHAR_SUFFIX_LONG = [word[:-1] for word in AGE_CHAR]

# Shortest stems only (teens and decade roots), used as a cheap first filter
AGE_CHAR_SUFFIX_SHORT = (
    'tredic quattordic quindic sedic diciasset diciott diciannov '
    'vent trent quarant cinquant sessant settant ottant novant'
).split()

# Numeric ages aligned index-by-index with AGE_CHAR: 99, 98, ..., 13
AGE_DIGIT = list(reversed(range(13, 100)))
# Regex patterns for first-person statements of the author's age written in
# digits (e.g. "22"). Every pattern captures the two-digit age in group 1.

# Negative lookaheads placed right after "anni": they reject continuations such
# as "anni di/de ...", "anni da ...", "anni su ...", "anni (in) più", "anni in meno"
_NOT_FOLLOWED_BY = (
    r"(?! su)(?! più)(?! da)(?! de)(?! di)"
    r"(?!de)(?!di)(?!su)(?!più)(?!da)"
    r"(?! in più)(?! in meno)"
)

AGE_DIGIT_PATTERNS = [
    # 0: "ho compiuto 22 anni" (I just turned 22),
    #    but not "quando ho compiuto 22 anni" (when I turned 22)
    r"(?<!quando\s)(?<!quando)ho\s*compiuto\s*(\d{2})\s*anni" + _NOT_FOLLOWED_BY,
    # 1: "compio 22 anni" (I turn 22)
    r"\bcompio\s*(\d{2})\s*anni" + _NOT_FOLLOWED_BY,
    # 2: "ho 22 anni" (I am 22 years old), but not after "quando", "non" or "se"
    #    (since I am / I am not / if I am 22 years old)
    r"(?<!quando\s)(?<!quando)(?<!non\s)(?<!non)(?<!se\s)(?<!se)ho\s*(\d{2})\s*anni" + _NOT_FOLLOWED_BY,
    # 3: "faccio 22 anni" (I am turning 22), additionally not followed by "che"
    r"\bfaccio\s*(\d{2})\s*anni(?! che)" + _NOT_FOLLOWED_BY + r"(?!che)",
    # 4: "spengo 22 candeline" (I am blowing out 22 candles)
    r"\bspengo\s*(\d{2})\s*candeline",
    # 5: "il mio 22^ compleanno" / "il mio 22^ comple" (my 22nd birthday)
    r"il\s*mio\s*(\d{2})\^\s*comple(?:anno)?",
    # 6: "sono un 22enne" / "sono una 22enne" (I am a 22-year-old)
    r"\bsono\s*una?\s*(\d{2})\s*enne",
    # disabled: "i miei 22 anni" (my 22 years)
    # r"\bmiei\s*(\d{2})\s*anni",
]


# Regex patterns for first-person statements of the author's year of birth.
# Each entry is an introductory phrase followed by the same capture group, which
# accepts a four-digit year 2000-2019 or 1900-1999, or a two-digit year preceded
# by one non-digit character (a space or an apostrophe, e.g. " 93" / "’93") and
# followed by whitespace or the end of the text.
YEAR_OF_BIRTH_PATTERNS = [
    intro + r"(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)"
    for intro in (
        # "sono nato nel 1993/93/’93" (I was born in 1993), masculine form
        r"\bsono\s*nato\s*nel\s*",
        # same sentence, feminine form "sono nata nel ..."
        r"\bsono\s*nata\s*nel\s*",
        # disabled: "sono del 1993/93/’93" (I am from 1993)
        # r"sono\s*del\s*",
        # disabled: "sono un 1993/93/’93" (I am a 1993)
        # r"sono\s*una?\s*",
        # "sono della generazione 1993/93/’93" (I am generation 1993)
        r"sono\s*della\s*generazione\s*",
        # "sono classe 1993/93/’93" (I am class 1993)
        r"sono\s*classe\s*",
        # "sono un/una classe 1993/93/’93"
        r"sono\s*una?\s*classe\s*",
    )
]

In [3]:
def return_full_age_char_pattern(age_char):
    """
    Build the regex patterns that detect a first-person age statement in which
    the age is written as an Italian word.

    Parameters
    ----------
    age_char : str
        The age word (e.g. "ventidue" for 22) or a stem of it. It is inserted
        into the patterns verbatim, without regex escaping.

    Returns
    -------
    list of str
        Seven regex strings, always in the same order; callers identify a
        pattern by its position in this list.
    """
    # render the argument exactly as str.format would
    word = "{}".format(age_char)

    # the captured age word, then anything, then "anni"
    word_then_anni = "(" + word + r").*\s*anni"
    # continuations after "anni" that rule the match out, such as
    # "anni di/de ...", "anni da ...", "anni su ...", "anni (in) più", "anni in meno"
    not_followed_by = (
        r"(?! su)(?! più)(?! da)(?! de)(?! di)"
        r"(?!de)(?!di)(?!su)(?!più)(?!da)"
        r"(?! in più)(?! in meno)"
    )

    return [
        # 0: "ho compiuto ventidue anni" (I just turned twenty-two),
        #    but not "quando ho compiuto ventidue anni" (when I turned twenty-two)
        r"(?<!quando\s)(?<!quando)ho\s*compiuto\s*" + word_then_anni + not_followed_by,
        # 1: "compio ventidue anni" (I turn twenty-two)
        r"\bcompio\s*" + word_then_anni + not_followed_by,
        # 2: "ho ventidue anni" (I am twenty-two), but not after "quando", "non" or "se"
        r"(?<!quando\s)(?<!quando)(?<!non\s)(?<!non)(?<!se\s)(?<!se)ho\s*" + word_then_anni + not_followed_by,
        # 3: "faccio ventidue anni" (I am turning twenty-two), also not followed by "che"
        r"\bfaccio\s*" + word_then_anni + r"(?! che)" + not_followed_by + r"(?!che)",
        # 4: "spengo ventidue candeline" (I am blowing out twenty-two candles)
        r"\bspengo\s*(" + word + r")\s*candeline",
        # 5: "il mio ventiduesimo comple/compleanno" (my twenty-second birthday);
        #    this one has no capture group
        r"il\s*mio\s*" + word + r"e?simo\s*comple(?:anno)?(?! su)(?! più)(?! da)(?! di)(?!su)(?!più)(?!da)(?!di)",
        # 6: "sono un ventiduenne" / "sono una ventiduenne" (I am a twenty-two-year-old)
        r"\bsono\s*una?\s*(" + word + r")\s*e?nne",
        # disabled: "i miei ventidue anni" (my twenty-two years)
        # r"\bmiei\s*(" + word + r").*\s*anni",
    ]

In [4]:
def tweet_user_age(text, tweet_id=None, user_id=None, created_at=None):
    """
    Look for a first-person statement of the author's age (or year of birth)
    in the text of a tweet.

    Three families of patterns are tried, in this order:

    1. AGE_DIGIT_PATTERNS      - age written in digits ("ho 22 anni")
    2. YEAR_OF_BIRTH_PATTERNS  - year of birth ("sono nato nel 1993")
    3. return_full_age_char_pattern(...) - age written as a word ("ho ventidue anni")

    Families 1 and 2 are skipped when a two-digit number appears between
    quotation marks; family 3 skips any age word that appears between
    quotation marks.

    Parameters
    ----------
    text : str
        Text of the tweet.
    tweet_id, user_id : optional
        Identifiers copied (as strings) into the returned record.
    created_at : optional
        Copied unchanged into the returned record.

    Returns
    -------
    dict or None
        None when no pattern matches (or the text is empty).
        On a match, a record with 'status': 'processed', the 'regex_type'
        ('age_digit', 'birth_year' or 'age_chars'), the index of the matching
        pattern in 'regex_idx' and the extracted number in 'age'.
        For 'birth_year' the 'age' field holds the YEAR OF BIRTH, not an age.
        If anything raises while processing, a record with
        'status': 'raised_exception' and the error in 'error'.

    TODO: the age returned by this function should be compared with the creation date of the tweet.
    """

    def _processed(regex_type, regex_idx, age):
        # common shape of every successful result
        return {"tweet_id": str(tweet_id), 'user_id': str(user_id), 'created_at': created_at,
                "tweet": text, 'status': 'processed', "regex_type": regex_type,
                "regex_idx": regex_idx, "age": age}

    def _inside_quotes(fragment):
        # True when `fragment` (a regex) occurs between straight double quotes
        # or between guillemets anywhere in the text.
        # NOTE(review): the original code ran the straight-quote check twice;
        # the second copy may once have targeted typographic quotes - confirm.
        return bool(
            re.search(r"\".*{}.*\"".format(fragment), text, flags=re.IGNORECASE)
            or re.search(r"\«.*{}.*\»".format(fragment), text, flags=re.IGNORECASE)
        )

    try:
        if len(text) > 0:
            # --- age or year of birth written in digits -----------------------
            if re.search(r"\d{2}", text) and not _inside_quotes(r"\d{2}"):
                for i, pattern in enumerate(AGE_DIGIT_PATTERNS):
                    matches = re.findall(pattern, text, flags=re.IGNORECASE)
                    if matches:
                        return _processed("age_digit", i, int(matches[0]))

                for i, pattern in enumerate(YEAR_OF_BIRTH_PATTERNS):
                    matches = re.findall(pattern, text, flags=re.IGNORECASE)
                    if matches:
                        # drop the non-digit character captured with two-digit years
                        birth_year = re.sub('[^0-9]', '', matches[0])
                        # two-digit year: 00-19 -> 20xx, 20-99 -> 19xx
                        if len(birth_year) == 2:
                            century = "20" if int(birth_year) < 20 else "19"
                            birth_year = century + birth_year
                        return _processed("birth_year", i, int(birth_year))

            # --- age written as a word ----------------------------------------
            # cheap pre-filter on the shortest stems before the long alternation
            if re.search(r"{}".format("|".join(AGE_CHAR_SUFFIX_SHORT)), text, flags=re.IGNORECASE):
                # A short stem can match where no long stem does, in which case
                # this list is empty and there is simply nothing to try.
                found_words = re.findall(r"{}".format("|".join(AGE_CHAR_SUFFIX_LONG)), text, flags=re.IGNORECASE)
                tried = set()
                # Try every distinct age word in order of appearance: an unrelated
                # earlier hit ("Trento" -> 'trent') must not hide a real statement
                # later in the text ("ho ventidue anni").
                for found in found_words:
                    age_word = found.lower()
                    if age_word in tried or age_word not in AGE_CHAR_SUFFIX_LONG:
                        continue
                    tried.add(age_word)
                    if _inside_quotes(age_word):
                        continue
                    age_idx = AGE_CHAR_SUFFIX_LONG.index(age_word)
                    full_word = AGE_CHAR[age_idx]
                    # use the complete word when it is in the text, else the stem
                    if re.search(r"{}".format(full_word), text, flags=re.IGNORECASE):
                        patterns = return_full_age_char_pattern(full_word)
                    else:
                        patterns = return_full_age_char_pattern(age_word)
                    for i, pattern in enumerate(patterns):
                        if re.findall(pattern, text, flags=re.IGNORECASE):
                            return _processed("age_chars", i, int(AGE_DIGIT[age_idx]))
    except Exception as exc:
        # best effort: one malformed tweet must not stop a batch run, but the
        # record keeps enough context to investigate the failure afterwards
        return {"tweet_id": str(tweet_id), 'user_id': str(user_id), 'created_at': created_at,
                "tweet": text, 'status': 'raised_exception', 'error': repr(exc)}

    return None


### Process the database

In [5]:
# Cluster locations: the personal home area and the shared project work area
home_dir = '/g100/home/userexternal/mhabibi0/'
work_dir = '/g100_work/IscrC_mental'

# Project data is read from the work area
# (alternative kept for reference: os.path.join(home_dir, 'Data'))
data_dir = os.path.join(work_dir, 'data')

# Open the SQLite database and keep a cursor on the connection
dbase_path = os.path.join(data_dir, 'database', 'MENTALISM.db')
connection = sql.connect(dbase_path)
cursor = connection.cursor()

In [11]:
def process_subchunk(subchunk):
    """
    Run tweet_user_age on every original (non-retweet) tweet of a DataFrame slice.

    Parameters
    ----------
    subchunk : pandas.DataFrame
        Must provide the columns 'tweet_id', 'user_id', 'text' and 'created_at'.

    Returns
    -------
    list of dict
        The records returned by tweet_user_age, in row order. Tweets for which
        tweet_user_age returns None (no age statement found) contribute nothing,
        so the list holds at most one entry per row.
    """
    results = []
    for _, row in subchunk.iterrows():
        text = row['text']
        # a missing text arrives as None or NaN; only real strings can be scanned
        if not isinstance(text, str):
            continue
        # retweets repeat somebody else's words, not the author's
        if text.startswith("RT @"):
            continue

        result = tweet_user_age(
            tweet_id=row['tweet_id'],
            user_id=row['user_id'],
            text=text,
            created_at=row['created_at'])

        # keep hits (and error records) only; storing a None placeholder for
        # every non-matching tweet makes the output as long as the whole table
        if result is not None:
            results.append(result)

    return results

In [12]:
num_workers = 16

# Rows fetched from the database per iteration: 10,000 for each worker
chunk_size = 10000 * num_workers
total_rows = pd.read_sql('SELECT COUNT(*) FROM tweets', connection).iloc[0, 0]
# ceiling division; only used to size the progress bar
num_chunks = (total_rows // chunk_size) + (1 if total_rows % chunk_size else 0)

all_results = []
# Fetch only the four columns process_subchunk reads, so that unused columns
# are neither loaded nor serialised to the worker processes
chunks = pd.read_sql('SELECT tweet_id, user_id, text, created_at FROM tweets',
                     connection, chunksize=chunk_size)

for chunk in tqdm(chunks, total=num_chunks):
    if chunk.empty:
        # nothing to split; also avoids asking for zero sections below
        continue

    # Split the chunk into one contiguous slice per worker. The row positions
    # are split (a plain ndarray) and then used to index the frame, because
    # np.array_split applied to a DataFrame relies on the deprecated
    # DataFrame.swapaxes.
    num_splits = min(num_workers, len(chunk))
    subchunks = [chunk.iloc[positions]
                 for positions in np.array_split(np.arange(len(chunk)), num_splits)]

    # Process each slice in parallel
    results = Parallel(n_jobs=num_workers)(delayed(process_subchunk)(subchunk) for subchunk in subchunks)

    # Flatten, keeping only actual records: tweets without an age statement
    # yield None and would otherwise pile up as one placeholder per tweet
    all_results.extend(item for sublist in results for item in sublist if item is not None)
    


100%|██████████| 5193/5193 [2:22:03<00:00,  1.64s/it]  


In [13]:
import pickle

# Save the collected records to the personal home area.
path = os.path.join(home_dir, 'Data', 'list_tweets_w_age.pkl')
# Create the target directory first: a missing directory would otherwise raise
# only now, after the long extraction run has finished
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, 'wb') as file:
    pickle.dump(all_results, file)

In [20]:
# Save a second copy of the records in the shared project data area.
uc_dir = os.path.join(data_dir, 'user_classification')
# make sure the target directory exists before opening the file for writing
os.makedirs(uc_dir, exist_ok=True)
path = os.path.join(uc_dir, 'list_tweets_w_age.pkl')
with open(path, 'wb') as file:
    pickle.dump(all_results, file)

### Approach 2 (Slower)

In [None]:
# def process_chunk(chunk):
#     # Filter out rows that start with "RT @"
#     chunk.dropna(inplace=True)
#     filtered_chunk = chunk[~chunk['text'].str.startswith("RT @")]
    
#     # Convert the filtered chunk to a list of dictionaries
#     dict_list = filtered_chunk.to_dict(orient='records')
    
#     results = Parallel(n_jobs=-1)(delayed(lambda data: tweet_user_age(**data))(data) for data in dict_list)
#     return results

In [None]:
# num_workers=16
# chunk_size=10000 * num_workers
# total_rows = pd.read_sql('SELECT COUNT(tweet_id) FROM tweets', connection).iloc[0, 0]
# num_chunks = (total_rows // chunk_size) + (1 if total_rows % chunk_size else 0)

# all_results = []
# chunks = pd.read_sql('SELECT tweet_id, user_id, text, created_at FROM tweets', connection, chunksize=chunk_size)
# for chunk in tqdm(chunks, total=num_chunks):

#     results = process_chunk(chunk)
#     results_filtered = [result for result in results if result is not None]
#     all_results.extend(results_filtered)

In [None]:
# import pickle 

# path = os.path.join(home_dir, 'Data', 'list_tweets_w_age.pkl')
# with open(path, 'wb') as file:
#     pickle.dump(all_results, file)