In [8]:
import os
import pandas as pd
import re
import sqlite3

In [9]:
def check_users_in_second_table(db_file, first_table, second_table):
    # Create a database connection
    conn = sqlite3.connect(db_file)

    try:
        # Check if all users in the first table are present in the second table
        query = f"""
            SELECT DISTINCT user_id
            FROM {first_table}
            WHERE NOT EXISTS (
                SELECT 1
                FROM {second_table}
                WHERE {first_table}.user_id = {second_table}.user_id
            )
        """
        cursor = conn.cursor()
        cursor.execute(query)
        result = cursor.fetchall()

        if not result:
            print("All users in the first table are present in the second table.")
        else:
            print("Some users in the first table are not present in the second table.")

    except sqlite3.Error as e:
        print(f"Error reading data from the database: {e}")

    finally:
        # Close the database connection
        conn.close()

db_file = '/g100_work/IscrC_mental/data/database/MENTALISM.db'
db_file = '/data/mentalism/data/database/MENTALISM.db'
first_table = 'user_regioncoded'
second_table = 'tweets'
check_users_in_second_table(db_file, first_table, second_table)

All users in the first table are present in the second table.


In [10]:
def fetch_db_columns(db_file, table_name, column_names=None):

    # Build the list of attributes to select in the query
    column_names_str = ", ".join(column_names) if column_names else "*"

    # Create a database connection
    conn = sqlite3.connect(db_file)

    try:
        # Query specified attributes from the table
        query = f"SELECT DISTINCT {column_names_str} FROM {table_name}"
        df = pd.read_sql_query(query, conn)
        return df

    except sqlite3.Error as e:
        print(f"Error reading data from the database: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of an error

    finally:
        # Close the database connection
        conn.close()

# Example usage
db_file = '/g100_work/IscrC_mental/data/database/MENTALISM.db'
db_file = '/data/mentalism/data/database/MENTALISM.db'
table_name = 'user_regioncoded'
column_names = ['user_id', 'username', 'full_name', 'location',
          'join_year', 'tweets', 'following', 'followers', 'bio']

df_users = fetch_db_columns(db_file, table_name, column_names=column_names)
print(df_users.shape)
df_users.head()

(462533, 9)


Unnamed: 0,user_id,username,full_name,location,join_year,tweets,following,followers,bio
0,9269,simon,simone brunozzi,"Ceres, Solar System, Milky Way",2006,14357,1599,11712,Tech. Founder. Investor. Trying to become usel...
1,11193,robgarofalo,roberto garofalo,"Roma, Lazio",2006,1,24,6,
2,11881,Agaponeo,cristian bracci,Rome,2006,4150,131,396,"Sono una persona normale, padre di una belliss..."
3,12243,pecus,matteo fogli,"Torino, Italia",2006,5543,1956,681,CEO of @madebymodo under disguise. Obssessed w...
4,14983,FAFrigenti,francesco a frigenti,"Milan, Lombardy",2006,397,180,67,


In [12]:
# Read in the tweets of the users: all pickle files that start with "filename" in "dir"
def read_data(dirpath, filename):
    files = [file for file in os.listdir(dirpath) if file.startswith(filename)]
    data = pd.DataFrame()
    for file in files:
        data = pd.concat([data, pd.read_pickle(dirpath + file)])
    return data

dirpath = '/data/mentalism/data/user_classification/user_regioncoded_100_tweets/'
filename = 'user_tweets_chunk'
df_tweets = read_data(dirpath, filename)
print(df_tweets.shape)
df_tweets.head()

(23822501, 8)


Unnamed: 0,tweet_id,user_id,created_at,text,retweet_text,language,likes,retweets
0,1.657844e+18,1620154325354807296,2023-05-14 20:24:38+00:00,"@_N3m3si__ non è teatro alla scala lool, è una...",,it,1,0
1,1.657844e+18,1620154325354807296,2023-05-14 20:22:23+00:00,a mattia regaleranno una lezione di zumba #ami...,,it,5,0
2,1.657838e+18,1620154325354807296,2023-05-14 19:58:51+00:00,secondo il mio modestissimo parere avendo in f...,,it,2,0
3,1.657837e+18,1620154325354807296,2023-05-14 19:55:23+00:00,"Isobel diventerà un gran ballerina, mattia beh...",,it,2,0
4,1.657836e+18,1620154325354807296,2023-05-14 19:50:13+00:00,@pastelcoeur purtroppo si,,it,0,0


In [13]:
assert len(df_tweets.user_id.unique()) == len(df_users.user_id.unique()), f'We have tweets for {len(df_tweets.user_id.unique())}/{len(df_users.user_id.unique())} users only.'

In [14]:
# This is how the final result of this notebook should look like
# Except that "masked_bio" should be "bio" and not masked, as we as long_text
df_test = pd.read_pickle('../data/user_classification/data_for_models_test.pkl')
df_test.head()
# further user's features (columns) might be wanted

Unnamed: 0,user_id,is_male,age,masked_bio,long_text
0,7071362,True,47,Con le ruote per terra \r\nSentire il mio pass...,Lo prendo come un attestato di stima\nPer la s...
1,9420092,False,31,Quello che gli altri pensano della tua persona...,Lovely zucca\nSta cambiando proprio il tempo\n...
2,11749412,False,33,Love me? Great. Hate me? Even better. Think I'...,Guardi l’ottimo è uno dei motivi per cui ancor...
3,14088579,False,41,Sii il cambiamento che vuoi vedere nel mondo M...,"Io direi per lui, se generalizziamo ci dividia..."
4,14281831,True,44,at home I feel like a tourist.,"Ore 6, buio pesto. Ribadiamo con forza #TeamOr..."


In [15]:
class preprocessor():
    """
    Pre-processor for tweets. Cleans mentions,
    urls, emojis, hashtags and unix characters.
    """
    def __init__(self,mention=True,url=True,unix=True,emoji=True,hashtag=False):
        self.mention = mention
        self.url = url
        self.unix = unix
        self.emoji = emoji
        self.hashtag = hashtag

    def remove_mentions(self,tweet):
        # Remove @-mentions using regular expression
        cleaned_tweet = re.sub(r'@\w+', '', tweet).strip().replace('  ',' ')
        return cleaned_tweet
    
    def remove_hashtags(self,tweet):
        # Remove hashtags using regular expression
        cleaned_tweet = re.sub(r'#\w+', '', tweet)
        return cleaned_tweet
    
    def remove_unix(self,tweet):
        # Remove unix characters using regular expression
        cleaned_tweet = re.sub(r'[\n\r\t\b\f\v]', '', tweet)
        return cleaned_tweet
    
    def remove_urls(self,tweet):
        # Remove URLs using regular expression
        cleaned_tweet = re.sub(r'http\S+|www\S+', '', tweet)
        return cleaned_tweet
    
    def remove_emojis(self,tweet):
        # Remove emojis using regular expression
        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
        cleaned_tweet = re.sub(emoji_pattern, '', tweet)
        return cleaned_tweet

    def strip_tweet(self,tweet):
        return tweet.strip().replace('  ',' ')
    
    def process_tweet(self,tweet): 
        if self.mention:
            tweet = self.remove_mentions(tweet)
        if self.hashtag:
            tweet = self.remove_hashtags(tweet)
        if self.unix:
            tweet = self.remove_unix(tweet)
        if self.emoji:
            tweet = self.remove_emojis(tweet)
        if self.url:
            tweet = self.remove_urls(tweet)
        tweet = self.strip_tweet(tweet)
        return tweet

    def process_list(self,tweets):
        return [self.process_tweet(tweet) for tweet in tweets]
    
    def process_column(self,tweets):
        return pd.Series(self.process_list(list(tweets)))

In [17]:
# CLEAN tweets

# replace NAs with ''
df_tweets['text'] = df_tweets['text'].fillna('')

# remove retweets
is_retweet = df_tweets['text'].str.startswith('RT @')
df_tweets = df_tweets[~is_retweet]

# Select last tweeting date for each user
df_tweets['date']=pd.to_datetime(df_tweets.created_at)
max_dates = df_tweets.groupby('user_id').agg({'date':max}).reset_index()
max_dates['last_year'] = max_dates['date'].dt.year
max_dates.rename(columns={'date':'last_date'},inplace=True)

#remove mentions, urls and empty tweets
pp = preprocessor(mention=True,url=True,unix=False,emoji=False,hashtag=False)
df_tweets['text']= df_tweets['text'].apply(pp.process_tweet)
df_tweets_clean = df_tweets[df_tweets['text']!='']

print('How many users are left with tweets after cleaning?')
print('From:', len(df_tweets.user_id.unique()))
print('  To:', len(df_tweets_clean.user_id.unique()))

How many users are left with tweets after cleaning?
From: 450313
  To: 440684


In [18]:
# CLEAN bios

print(df_users.shape)

# replace NAs with ''
df_users['bio'] = df_users['bio'].fillna('')

#remove mentions, urls and empty tweets
pp = preprocessor(mention=True,url=True,unix=False,emoji=False,hashtag=False)
df_users['bio'] = df_users['bio'].apply(pp.process_tweet)

print(df_users.shape)

(462533, 9)
(462533, 9)


In [19]:
# FUSION N MOST RECENT TWEETS

# Keep the N most recent tweets
N=100
df_tweets_clean['rank'] = df_tweets_clean.groupby('user_id')['date'].rank(method='first', ascending=False)
df_text = df_tweets_clean[df_tweets_clean['rank'] <= N]

# Group tweets by user_id
df_text_grouped = df_text.groupby('user_id')['text'].agg(lambda x: '\n'.join(x)).reset_index()
df_text_grouped = df_text_grouped.rename(columns={'text': 'long_text'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tweets_clean['rank'] = df_tweets_clean.groupby('user_id')['date'].rank(method='first', ascending=False)


In [20]:
# Merge with max_dates
df = df_text_grouped.merge(max_dates, on='user_id', how='left')
df
# Merge with user_regioncoded

Unnamed: 0,user_id,long_text,last_date,last_year
0,9269,surely interesting for you.\nMost billionaires...,2023-04-01 06:20:25+00:00,2023
1,11193,Finché utilizzerete l’ISEE senza debellare l’e...,2022-08-10 05:48:15+00:00,2022
2,11881,"hello, i just did a purchase on the store. Try...",2022-08-02 13:32:03+00:00,2022
3,12243,I requested my twitter archive. The decision t...,2022-12-18 09:35:39+00:00,2022
4,14983,Purtroppo è uno di quei profili da Premier…spe...,2023-02-06 08:04:38+00:00,2023
...,...,...,...,...
440679,1618897592779251712,"FEG Airdrop, Last call!🙌\nSHRAPNEL Free Mint i...",2023-04-12 00:54:40+00:00,2023
440680,1618987257653870592,Ma la figa la copri con i capelli ?\nNon ho be...,2023-02-04 08:04:44+00:00,2023
440681,1619403522004377600,‘Su la Testa’ sostiene la comunità ‘Il piccolo...,2023-04-08 12:55:35+00:00,2023
440682,1619640767785336833,"Richieste folli , irriguardose come irriverent...",2023-02-27 05:12:09+00:00,2023


In [67]:
df_users

Unnamed: 0,user_id,username,full_name,location,join_year,tweets,following,followers,bio
0,9269,simon,simone brunozzi,"Ceres, Solar System, Milky Way",2006,14357,1599,11712,Tech. Founder. Investor. Trying to become usel...
1,11193,robgarofalo,roberto garofalo,"Roma, Lazio",2006,1,24,6,
2,11881,Agaponeo,cristian bracci,Rome,2006,4150,131,396,"Sono una persona normale, padre di una belliss..."
3,12243,pecus,matteo fogli,"Torino, Italia",2006,5543,1956,681,CEO of under disguise. Obssessed with performa...
4,14983,FAFrigenti,francesco a frigenti,"Milan, Lombardy",2006,397,180,67,
...,...,...,...,...,...,...,...,...,...
462528,1618987257653870592,enrico_pregl,enrico pregl,"Ledro, Trentino-Alto Adige",2023,2,20,0,
462529,1619403522004377600,Su_La_Testa_tw,su la testa,Cusano Milanino,2023,57,5,16,L’Associazione rivolta a chiunque voglia impeg...
462530,1619481480756879361,FShayagan,fatima shayagan,"Torino, Piemonte",2023,1,17,1,زنده باد انسانیت
462531,1619640767785336833,antonio69094920,antonio antonelli,Molise,2023,7,48,5,"Nell’ etica del rispetto della Giustizia , del..."


In [21]:
# create joint dataframe with all the features needed for training
df_join = df_users.merge(df, on='user_id', how='left')
df_join.last_year = df_join.last_year.fillna(0).astype('int32')
df_join.long_text = df_join.long_text.fillna('').astype('str')
df_join

Unnamed: 0,user_id,username,full_name,location,join_year,tweets,following,followers,bio,long_text,last_date,last_year
0,9269,simon,simone brunozzi,"Ceres, Solar System, Milky Way",2006,14357,1599,11712,Tech. Founder. Investor. Trying to become usel...,surely interesting for you.\nMost billionaires...,2023-04-01 06:20:25+00:00,2023
1,11193,robgarofalo,roberto garofalo,"Roma, Lazio",2006,1,24,6,,Finché utilizzerete l’ISEE senza debellare l’e...,2022-08-10 05:48:15+00:00,2022
2,11881,Agaponeo,cristian bracci,Rome,2006,4150,131,396,"Sono una persona normale, padre di una belliss...","hello, i just did a purchase on the store. Try...",2022-08-02 13:32:03+00:00,2022
3,12243,pecus,matteo fogli,"Torino, Italia",2006,5543,1956,681,CEO of under disguise. Obssessed with performa...,I requested my twitter archive. The decision t...,2022-12-18 09:35:39+00:00,2022
4,14983,FAFrigenti,francesco a frigenti,"Milan, Lombardy",2006,397,180,67,,Purtroppo è uno di quei profili da Premier…spe...,2023-02-06 08:04:38+00:00,2023
...,...,...,...,...,...,...,...,...,...,...,...,...
462528,1618987257653870592,enrico_pregl,enrico pregl,"Ledro, Trentino-Alto Adige",2023,2,20,0,,Ma la figa la copri con i capelli ?\nNon ho be...,2023-02-04 08:04:44+00:00,2023
462529,1619403522004377600,Su_La_Testa_tw,su la testa,Cusano Milanino,2023,57,5,16,L’Associazione rivolta a chiunque voglia impeg...,‘Su la Testa’ sostiene la comunità ‘Il piccolo...,2023-04-08 12:55:35+00:00,2023
462530,1619481480756879361,FShayagan,fatima shayagan,"Torino, Piemonte",2023,1,17,1,زنده باد انسانیت,,NaT,0
462531,1619640767785336833,antonio69094920,antonio antonelli,Molise,2023,7,48,5,"Nell’ etica del rispetto della Giustizia , del...","Richieste folli , irriguardose come irriverent...",2023-02-27 05:12:09+00:00,2023


In [22]:
# save to pickle
outf = '/data/mentalism/data/user_classification/user_regioncoded_features.pkl'
df_join.to_pickle(outf)

In [23]:
!du -sh $outf

1.3G	/data/mentalism/data/user_classification/user_regioncoded_features.pkl


In [43]:
def twitter_features_extra(
    path_data: str,
    ):

    # Read the pickle dataframe
    if path_data.endswith('.pkl'):
        df = pd.read_pickle(path_data)
    else:
        raise NotImplementedError

    # Set the index to the user_id
    df.set_index('user_id', inplace=True)
    # create input text
    # Separating text and numbers with a space
    df['username_sep'] = df['username'].str.replace(r'([a-zA-Z])(\d)', r'\1 \2').\
                        str.replace(r'(\d)([a-zA-Z])', r'\1 \2')
    # concat info
    df['input_texts']  = 'NAME:' + ' "' + df['full_name'] + '". ' +\
                         'USERNAME:' + ' "'+  df['username_sep'] + '". ' + \
                         'JOINED:' + ' "' + df['join_year'].astype(str) + '". ' +\
                         'TWEETS:' + ' "' + df['tweets'].astype(str) + '". ' + \
                         'FOLLOWING:' + ' "' + df['following'].astype(str) + '". ' +\
                         'FOLLOWERS:' + ' "' + df['followers'].astype(str) + '". ' + \
                         'BIO:' + ' "' + df['bio'] + '". ' + \
                         'TEXT:' + ' "' + df['long_text'] + '".'

    # check if there are any missing values in input texts (shouldn't be the case)
    if df.input_texts.isnull().values.any():
        raise ValueError('The dataframe contains missing input_texts')

    return df['input_texts'], None

path_data = '/data/mentalism/data/user_classification/user_regioncoded_features.pkl'
input_texts, gold_labels = twitter_features_extra(path_data)

In [45]:
input_texts

user_id
9269                   NAME: "simone brunozzi". USERNAME: "simon". JO...
11193                  NAME: "roberto garofalo". USERNAME: "robgarofa...
11881                  NAME: "cristian bracci". USERNAME: "Agaponeo"....
12243                  NAME: "matteo fogli". USERNAME: "pecus". JOINE...
14983                  NAME: "francesco a frigenti". USERNAME: "FAFri...
                                             ...                        
1618987257653870592    NAME: "enrico pregl". USERNAME: "enrico_pregl"...
1619403522004377600    NAME: "su la testa". USERNAME: "Su_La_Testa_tw...
1619481480756879361    NAME: "fatima shayagan". USERNAME: "FShayagan"...
1619640767785336833    NAME: "antonio antonelli". USERNAME: "antonio6...
1620154325354807296    NAME: "emmu". USERNAME: "filoepersegno". JOINE...
Name: input_texts, Length: 462533, dtype: object

In [48]:
# save sample to pickle
outf = '/data/mentalism/data/user_classification/user_regioncoded_features_sample.pkl'
df[:1000].to_pickle(outf)