In [1]:
import sqlite3
import pandas as pd
from tqdm import tqdm 
tqdm.pandas()
import re

In [3]:
#load dutch data for users
db = "/g100_scratch/userexternal/pbose000/nl/data/raw/twitter/twitter_nl.db"
conn = sqlite3.connect(db)
users = pd.read_sql_query("SELECT id,name,screen_name,description,gemeente_code,gemeente_name FROM users",conn)

In [4]:
users = users[users.description!=''].copy()

In [5]:
df_all = pd.DataFrame()
for regex in ["geb[\. ].*?([12][09][0-9][0-9])",
             "geboren.+?([12][09][0-9][0-9])",
             "geboren op.+?([12][09][0-9][0-9])",
             "verjaardag.+?([12][09][0-9][0-9])",
             "verjaardag.+?([12][09][0-9][0-9])",
             "([12][09][0-9][0-9]).*?geb[\. ]",
             "([12][09][0-9][0-9]).*?geboren",
             "\*.*?([12][09][0-9][0-9])",
             "([1-9][0-9]) jaar oud"]:
    
    d = users.description.str.extract(regex)
    df = users.assign(age = d.astype("Int64"))
    if regex != "([1-9][0-9]) jaar oud":
        df.age = df.age
    
    df_all= pd.concat([df_all,df.loc[df.age.notnull()]])

df_all = df_all.drop_duplicates("id")

# try tweet matching

In [6]:
AGE_DIGIT = list(range(99, 11, -1))

# List of regex patterns for matching Twitter posts mentioning the age of the user
# The patterns are built using the age expressed in digits (e.g., "22" for 22)
AGE_DIGIT_PATTERNS = [
    # Matches phrases like "ik ben 22 jaar geworden" (I just turned 22)
    # but not "toen ik 22 werd" (when I turned 22)
    # nor "ik ben 22 jaar geworden van" (I have 22 years of)
    r"(?<!toen\s)(?<!toen)ik\s*ben\s*(\d{2})\s*jaar(?! op)(?! meer)(?! van)(?! de)(?! in)(?!op)(?!meer)(?!van)(?! op de)(?! op het)(?! jonger)(?! ouder)",
    r"\bik\s*word\s*(\d{2})\s*jaar(?! op)(?! meer)(?! van)(?! de)(?! in)(?!op)(?!meer)(?!van)(?! op de)(?! op het)(?! ouder)(?! jonger)",
    # Matches phrases like "ik word 22 jaar" (I am turning 22 years old)
    r"\bik\s*word\s*(\d{2})\s*jaar(?! dan)(?! op)(?! meer)(?! van)(?! de)(?! in)(?!meer)(?!dan)",
    # Matches phrases like "mijn 22 verjaardag" (my 22nd birthday)
    r"mijn\s*(\d{2})\s*verjaardag",
    # Matches phrases like "ik ben een 22-jarige" (I am a 22-year-old...)
    r"\bik\s*ben\s*een?\s*(\d{2})\s*jarige"
]

YEAR_OF_BIRTH_PATTERNS = [
    # Matches sentences like "ik ben geboren in 1993/93/'93" (I was born in 1993)
    r"\bik\s*ben\s*geboren\s*in\s*(20[0-1][0-9]|19[0-9][0-9]|\D\d{2}\s|\D\d{2}$)",
]



def tweet_user_age(tweet):
    """
    Returns the age of the user who posted the tweet, if the tweet contains a mention of the user's age.
    TODO: the age returned by this function should be compared with the creation date of the tweet.
    """
    if len(tweet) > 0:
        # check if the tweet contains a double digit number, but not in a quoted text
        if re.search(r"\d{2}", tweet):
            if not re.search(r"\".*\d{2}.*\"", tweet) \
                and not re.search(r"\“.*\d{2}.*\”", tweet) \
                and not re.search(r"\«.*\d{2}.*\»", tweet):

                # search for age patterns
                for i, pattern in enumerate(AGE_DIGIT_PATTERNS):
                    matches = re.findall(pattern, tweet, flags=re.IGNORECASE)
                    if matches:
                        return {"tweet": tweet, "regex_type": "age_digit", "regex_idx": i, "age": int(matches[0])}

                # search for year of birth patterns
                for i, pattern in enumerate(YEAR_OF_BIRTH_PATTERNS):
                    matches = re.findall(pattern, tweet, flags=re.IGNORECASE)
                    if matches:
                        birth_year = re.sub('[^0-9]','', matches[0])
                        # if only a double digit year is retrieved, then attach 19 or 20 to it
                        if len(birth_year) == 2:
                            if int(birth_year) < 20:
                                birth_year = "20" + birth_year
                            else:
                                birth_year = "19" + birth_year
                        return {"tweet": tweet, "regex_type": "birth_year", "regex_idx": i, "age": int(birth_year)}

    return {"regex_type": None, "age": None}


In [7]:
#get length of db
cur = conn.cursor()
cur.execute("SELECT COUNT(*) FROM tweets;")
n = cur.fetchone()[0]

In [8]:
chunksize = 10000000
full_df = pd.DataFrame()

#loop over db and extract ages
for x in range(0,n,chunksize):
    print(f'Percent {int(x/n*100)}:')
    tweets = pd.read_sql_query(f"SELECT user_id,id,created_at,full_text FROM tweets LIMIT {chunksize} OFFSET {x}",conn)
    ages = pd.DataFrame.from_records(tweets['full_text'].progress_apply(tweet_user_age))
    tweets = tweets.merge(ages,left_index=True,right_index=True)
    tweets = tweets[tweets.age.notna()]
    tweets = tweets[-tweets.full_text.str.startswith("RT @")]
    full_df = pd.concat([full_df,tweets[['user_id','id','created_at','full_text','regex_type','age']]])

Percent 0:


100%|██████████| 10000000/10000000 [01:20<00:00, 124434.97it/s]


Percent 9:


100%|██████████| 10000000/10000000 [01:20<00:00, 123892.84it/s]


Percent 19:


100%|██████████| 10000000/10000000 [01:17<00:00, 128825.09it/s]


Percent 29:


100%|██████████| 10000000/10000000 [01:15<00:00, 133104.64it/s]


Percent 38:


100%|██████████| 10000000/10000000 [01:19<00:00, 126207.99it/s]


Percent 48:


100%|██████████| 10000000/10000000 [01:18<00:00, 126729.30it/s]


Percent 58:


100%|██████████| 10000000/10000000 [01:21<00:00, 123231.45it/s]


Percent 67:


100%|██████████| 10000000/10000000 [01:15<00:00, 132566.50it/s]


Percent 77:


100%|██████████| 10000000/10000000 [01:15<00:00, 133251.86it/s]


Percent 87:


100%|██████████| 10000000/10000000 [01:15<00:00, 132200.63it/s]


Percent 96:


100%|██████████| 3225667/3225667 [00:25<00:00, 128340.81it/s]


In [14]:
full_df['date'] = pd.to_datetime(full_df['created_at'])
full_df['year'] = full_df['date'].dt.year

In [21]:
full_df.loc[full_df.regex_type=='age_digit','birthyear'] = full_df['year']-full_df['age']
full_df.loc[full_df.regex_type=='birth_year','birthyear'] = full_df['age']
full_df = full_df[-full_df.duplicated('user_id',keep=False)]

In [43]:
#find the latest tweet of the user
users = full_df.user_id.tolist()
bio_users = df_all.id.tolist()
all_users = users+bio_users

In [47]:
tweets = pd.read_sql_query(f"SELECT user_id,id,created_at,full_text FROM tweets WHERE user_id IN ({','.join([str(x) for x in all_users])})",conn)
tweets['date'] = pd.to_datetime(tweets['created_at'])

In [70]:
latest_tweets = tweets.groupby('user_id').agg({'date':max}).reset_index()

In [71]:
latest_tweets['year_last_tweet'] = latest_tweets.date.dt.year

In [73]:
full_df = full_df.merge(latest_tweets[['user_id','year_last_tweet']],on='user_id',how='inner')

In [75]:
full_df['age']=full_df['year_last_tweet']-full_df['birthyear']

In [86]:
df_all = df_all.merge(latest_tweets[['user_id','year_last_tweet']],left_on='id',right_on='user_id',how='inner')

In [89]:
df_all.loc[df_all.age<1000,'age_correct'] = df_all.age
df_all.loc[df_all.age>1000,'age_correct'] = df_all.year_last_tweet - df_all.age
df_all['age'] = df_all.age_correct
df_all['birthyear'] = df_all['year_last_tweet']- df_all['age']

In [94]:
df_age = pd.DataFrame()
df_age = pd.concat([df_age,df_all[['user_id','age','birthyear']]])
df_age = pd.concat([df_age,full_df[['user_id','age','birthyear']]])

In [95]:
df_age.to_pickle("/g100_scratch/userexternal/pbose000/age_nl.pkl")

In [11]:
#populate the user ids with their name, location and description
user_info = pd.read_sql_query(f"SELECT id,name,description,gemeente_code,gemeente_name FROM users WHERE id IN ({','.join([str(x) for x in df_age.user_id.tolist()])})",conn)

In [12]:
df_age = df_age.merge(user_info,left_on='user_id',right_on='id')
del df_age['id']

In [13]:
import gender_guesser.detector as gender
gender_detector = gender.Detector()
def detect_gender(name):
    firstname = name.split()[0].capitalize()
    g = gender_detector.get_gender(firstname)
    g = g.replace("mostly_", "")
    g = g.replace("andy", "unknown")
    return g
df_age['gender'] = df_age.name.apply(detect_gender)

In [17]:
df_age['is_male'] = df_age.gender.apply(lambda x: 1 if x=='male' else 0)
df_age=df_age[df_age['gender']!='unknown']
df_age['is_male'] = df_age['is_male'].astype(bool)
del df_age['gender']

In [23]:
#populate last 100 tweets of each user
tw_age = pd.read_sql_query(f"SELECT user_id, full_text, created_at FROM tweets WHERE user_id IN ({','.join([str(x) for x in df_age.user_id.tolist()])});",conn)

In [26]:
tw_age['date'] = pd.to_datetime(tw_age['created_at'])

In [28]:
tw_age['RT'] = tw_age['full_text'].str.startswith('RT @')
tw_age = tw_age[tw_age['RT']==False].copy()

In [30]:
def mask_bio(text):
    for regex in ["geb[\. ].*?([12][09][0-9][0-9])",
             "geboren.+?([12][09][0-9][0-9])",
             "geboren op.+?([12][09][0-9][0-9])",
             "verjaardag.+?([12][09][0-9][0-9])",
             "verjaardag.+?([12][09][0-9][0-9])",
             "([12][09][0-9][0-9]).*?geb[\. ]",
             "([12][09][0-9][0-9]).*?geboren",
             "\*.*?([12][09][0-9][0-9])",
             "([1-9][0-9]) jaar oud"]:
        text = re.sub(regex,'',text)
    return text

In [42]:
def mask_tweet(tweet):
    if len(tweet) > 0:
        # check if the tweet contains a double digit number, but not in a quoted text
        if re.search(r"\d{2}", tweet):
            if not re.search(r"\".*\d{2}.*\"", tweet) \
                and not re.search(r"\“.*\d{2}.*\”", tweet) \
                and not re.search(r"\«.*\d{2}.*\»", tweet):

                # search for age patterns
                for i, pattern in enumerate(AGE_DIGIT_PATTERNS):
                    tweet = re.sub(pattern,'',tweet, flags=re.IGNORECASE)

                # search for year of birth patterns
                for i, pattern in enumerate(YEAR_OF_BIRTH_PATTERNS):
                    tweet = re.sub(pattern,'',tweet, flags=re.IGNORECASE)   
    return tweet


In [33]:
df_age['masked_bio'] = df_age.description.apply(mask_bio)

In [43]:
tw_age['masked_tweet'] = tw_age.full_text.apply(mask_tweet)

In [24]:
class preprocessor():
    """
    Pre-processor for tweets. Cleans mentions,
    urls, emojis, hashtags and unix characters.
    """
    def __init__(self,mention=True,url=True,unix=True,emoji=True,hashtag=False):
        self.mention = mention
        self.url = url
        self.unix = unix
        self.emoji = emoji
        self.hashtag = hashtag

    def remove_mentions(self,tweet):
        # Remove @-mentions using regular expression
        cleaned_tweet = re.sub(r'@\w+', '', tweet).strip().replace('  ',' ')
        return cleaned_tweet
    
    def remove_hashtags(self,tweet):
        # Remove hashtags using regular expression
        cleaned_tweet = re.sub(r'#\w+', '', tweet)
        return cleaned_tweet
    
    def remove_unix(self,tweet):
        # Remove unix characters using regular expression
        cleaned_tweet = re.sub(r'[\n\r\t\b\f\v]', '', tweet)
        return cleaned_tweet
    
    def remove_urls(self,tweet):
        # Remove URLs using regular expression
        cleaned_tweet = re.sub(r'http\S+|www\S+', '', tweet)
        return cleaned_tweet
    
    def remove_emojis(self,tweet):
        # Remove emojis using regular expression
        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
        cleaned_tweet = re.sub(emoji_pattern, '', tweet)
        return cleaned_tweet

    def strip_tweet(self,tweet):
        return tweet.strip().replace('  ',' ')
    
    def process_tweet(self,tweet): 
        if self.mention:
            tweet = self.remove_mentions(tweet)
        if self.hashtag:
            tweet = self.remove_hashtags(tweet)
        if self.unix:
            tweet = self.remove_unix(tweet)
        if self.emoji:
            tweet = self.remove_emojis(tweet)
        if self.url:
            tweet = self.remove_urls(tweet)
        tweet = self.strip_tweet(tweet)
        return tweet

In [25]:
#clean tweets and remove unrealistic ages
df_age = df_age[-(df_age['age']>100) & (df_age['age']>=10)].copy()
users_to_keep = df_age.user_id.unique().tolist()
tw_age = tw_age[tw_age['user_id'].isin(users_to_keep)].copy()
pp = preprocessor(mention=True,url=True,unix=False,emoji=False,hashtag=False)
tw_age['masked_tweet']= tw_age['masked_tweet'].apply(pp.process_tweet)
tw_age = tw_age[tw_age['masked_tweet']!=''].copy()

In [27]:
def transform_df(df_bio,df_text, N=100):
    # df bio
    df_bio = df_bio[['user_id', 'masked_bio','age','birthyear','is_male','gemeente_code','gemeente_name']].fillna('').drop_duplicates()
    
    # process tweets
    df_text = df_text[['user_id', 'masked_tweet', 'date']]
        
    # keep the N most recent text_masked
    df_text['rank'] = df_text.groupby('user_id')['date'].rank(method='first', ascending=False)
    
    # Filter out entries with rank greater than N
    df_text = df_text[df_text['rank'] <= N]
    
    df_text_grouped = df_text.groupby('user_id')['masked_tweet'].agg(lambda x: '\n'.join(x)).reset_index()
    df_text_grouped = df_text_grouped.rename(columns={'masked_tweet': 'long_text'})
    
    # Merge df_bio with df_text_grouped
    result_df = pd.merge(df_bio, df_text_grouped, on='user_id', how='right').fillna('')

    return result_df

In [28]:
df = transform_df(df_age,tw_age,N=100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['rank'] = df_text.groupby('user_id')['date'].rank(method='first', ascending=False)


In [36]:
import os
work_dir = '/g100_work/IscrC_mental'
wdata_dir = os.path.join(work_dir, 'data')
uc_dir = os.path.join(os.path.join(wdata_dir, 'user_classification'),'dutch_data')
df.to_pickle(os.path.join(uc_dir,'data_for_models_dutch_data.pkl'))

In [31]:
df_age = df_age[df_age['user_id'].isin(df.user_id.unique().tolist())].copy()
tw_age = tw_age[tw_age['user_id'].isin(df.user_id.unique().tolist())].copy()

In [32]:
df_age.to_pickle(os.path.join(uc_dir,'dutch_users_age_location_gender.pkl'))

In [33]:
tw_age.to_pickle(os.path.join(uc_dir,'dutch_tweets_age_location_gender.pkl'))

In [39]:
df.to_pickle('/g100/home/userexternal/pbose000/twitter_user_classification/data/user_classification/data_for_models_dutch_data.pkl')