In [50]:
import os
import pandas as pd
import re
import sqlite3

In [None]:
def get_user_info(db_file, table_name):
    # Create a database connection
    conn = sqlite3.connect(db_file)

    try:
        # Query user_id, username, and bio from the specified table
        query = f"SELECT DISTINCT user_id, username, bio FROM {table_name}"
        user_info_df = pd.read_sql_query(query, conn)
        return user_info_df

    except sqlite3.Error as e:
        print(f"Error reading data from the database: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of an error

    finally:
        # Close the database connection
        conn.close()

# Example usage
db_file = '/g100_work/IscrC_mental/data/database/MENTALISM.db'
table_name = 'user_regioncoded'

df_users = get_user_info(db_file, table_name)
df_users.head()

In [51]:
# Read in the tweets of the users: all pickle files that start with "filename" in "dir"
def read_data(dir, filename):
    files = [file for file in os.listdir(dir) if file.startswith(filename)]
    data = pd.DataFrame()
    for file in files:
        data = pd.concat([data, pd.read_pickle(dir + file)])
    return data

filename = 'user_tweets_chunk'
df_tweets = read_data('../', filename)[:100]
df_tweets.head()

Unnamed: 0,tweet_id,user_id,created_at,text,retweet_text
841056,1636794428571504645,1617291345332256769,2023-03-17 18:19:36+00:00,RT @fanpage: ULTIM'ORA,
841057,1628349273799622656,1617291345332256769,2023-02-22 11:01:34+00:00,@TgLa7 M,
133948,1643534631503818752,1600726899856130049,2023-04-05 08:42:46+00:00,@mari_arena75 Buongiorno buona giornata amore ...,
133949,1643534557415653376,1600726899856130049,2023-04-05 08:42:28+00:00,@Antonel08210523 Buongiorno bellissima foto co...,
133950,1643534441933881344,1600726899856130049,2023-04-05 08:42:01+00:00,@SCorroppoli Buongiorno bellissima come stai,


In [52]:
# This is how the final result of this notebook should look like
# Except that "masked_bio" should be "bio" and not masked, as we as long_text
df_test = pd.read_pickle('../data/user_classification/data_for_models_test.pkl')
df_test.head()

Unnamed: 0,user_id,is_male,age,masked_bio,long_text
0,7071362,True,47,Con le ruote per terra \r\nSentire il mio pass...,Lo prendo come un attestato di stima\nPer la s...
1,9420092,False,31,Quello che gli altri pensano della tua persona...,Lovely zucca\nSta cambiando proprio il tempo\n...
2,11749412,False,33,Love me? Great. Hate me? Even better. Think I'...,Guardi l’ottimo è uno dei motivi per cui ancor...
3,14088579,False,41,Sii il cambiamento che vuoi vedere nel mondo M...,"Io direi per lui, se generalizziamo ci dividia..."
4,14281831,True,44,at home I feel like a tourist.,"Ore 6, buio pesto. Ribadiamo con forza #TeamOr..."
...,...,...,...,...,...
1115,1492598652535197706,True,19,,Get Fortn1t3 from App Valley!
1116,1500578379036246016,False,33,,Lei non poteva andare perche e personaggio pub...
1117,1511727485737648132,True,57,,E tantissimi sono contro la GIÒ MELON DONNA FA...
1118,1578278161598521347,False,19,𝘕𝘢𝘳𝘤𝘰𝘭𝘦𝘱𝘴𝘺 𝘨𝘰𝘵 𝘮𝘦 𝘧𝘦𝘦𝘭𝘪𝘯𝘨 𝘴𝘵𝘢𝘨𝘦 𝘧𝘳𝘪𝘨𝘩𝘵\n\n✧˚ ༘...,"È stato un brutto sogno e basta, vero?\nÈ sett..."


In [53]:
class preprocessor():
    """
    Pre-processor for tweets. Cleans mentions,
    urls, emojis, hashtags and unix characters.
    """
    def __init__(self,mention=True,url=True,unix=True,emoji=True,hashtag=False):
        self.mention = mention
        self.url = url
        self.unix = unix
        self.emoji = emoji
        self.hashtag = hashtag

    def remove_mentions(self,tweet):
        # Remove @-mentions using regular expression
        cleaned_tweet = re.sub(r'@\w+', '', tweet).strip().replace('  ',' ')
        return cleaned_tweet
    
    def remove_hashtags(self,tweet):
        # Remove hashtags using regular expression
        cleaned_tweet = re.sub(r'#\w+', '', tweet)
        return cleaned_tweet
    
    def remove_unix(self,tweet):
        # Remove unix characters using regular expression
        cleaned_tweet = re.sub(r'[\n\r\t\b\f\v]', '', tweet)
        return cleaned_tweet
    
    def remove_urls(self,tweet):
        # Remove URLs using regular expression
        cleaned_tweet = re.sub(r'http\S+|www\S+', '', tweet)
        return cleaned_tweet
    
    def remove_emojis(self,tweet):
        # Remove emojis using regular expression
        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
        cleaned_tweet = re.sub(emoji_pattern, '', tweet)
        return cleaned_tweet

    def strip_tweet(self,tweet):
        return tweet.strip().replace('  ',' ')
    
    def process_tweet(self,tweet): 
        if self.mention:
            tweet = self.remove_mentions(tweet)
        if self.hashtag:
            tweet = self.remove_hashtags(tweet)
        if self.unix:
            tweet = self.remove_unix(tweet)
        if self.emoji:
            tweet = self.remove_emojis(tweet)
        if self.url:
            tweet = self.remove_urls(tweet)
        tweet = self.strip_tweet(tweet)
        return tweet

    def process_list(self,tweets):
        return [self.process_tweet(tweet) for tweet in tweets]
    
    def process_column(self,tweets):
        return pd.Series(self.process_list(list(tweets)))

In [54]:
# work_dir = '/g100_work/IscrC_mental'
# wdata_dir = os.path.join(work_dir, 'data')
# uc_dir = os.path.join(os.path.join(wdata_dir, 'user_classification'),'german_data')

In [55]:
# df = pd.read_pickle(os.path.join(uc_dir,"german_users_with_birthyear.pkl"))

In [56]:
# df_tweets = pd.read_pickle(os.path.join(uc_dir,"german_tweets.pkl"))

In [57]:
# #restrict to users for which there is a gender and birth year
# df.loc[df['male']==1,'is_male'] = 1
# df.loc[df['female']==1,'is_male'] = 0
# clean_df = df.loc[(df['is_male'].notna()) & (df['birthyear'].notna())].copy()
# #manual correction
# clean_df.loc[clean_df['user_id']==803169463197691904,'is_male']=1
# clean_df = clean_df[['user_id','is_male','birthyear','bio']]

In [58]:
# CLEAN DATA

# remove retweets
is_retweet = df_tweets['text'].str.startswith('RT @')
df_tweets = df_tweets[~is_retweet]

# Select last tweeting date for each user
df_tweets['date']=pd.to_datetime(df_tweets.created_at)
max_dates = df_tweets.groupby('user_id').agg({'date':max}).reset_index()
max_dates['last_year'] = max_dates['date'].dt.year
max_dates.rename(columns={'date':'last_date'},inplace=True)

#remove mentions, urls and empty tweets
pp = preprocessor(mention=True,url=True,unix=False,emoji=False,hashtag=False)
df_tweets['text']= df_tweets['text'].apply(pp.process_tweet)
df = df_tweets[df_tweets['text']!='']

In [59]:
# FUSION N MOST RECENT TWEETS

# Keep the N most recent tweets
N=100
df_tweets['rank'] = df_tweets.groupby('user_id')['date'].rank(method='first', ascending=False)
df_text = df_tweets[df_tweets['rank'] <= N]

# Group tweets by user_id
df_text_grouped = df_text.groupby('user_id')['text'].agg(lambda x: '\n'.join(x)).reset_index()
df_text_grouped = df_text_grouped.rename(columns={'text': 'long_text'})

In [64]:
# Merge with max_dates
df = df_text_grouped.merge(max_dates, on='user_id', how='left')
df
# Merge with user_regioncoded

Unnamed: 0,user_id,long_text,last_date,last_year
0,1600726899856130049,Buongiorno buona giornata amore mio come stai\...,2023-04-05 08:42:46+00:00,2023
1,1617291345332256769,M,2023-02-22 11:01:34+00:00,2023


In [65]:
df_test

Unnamed: 0,user_id,is_male,age,masked_bio,long_text
0,7071362,True,47,Con le ruote per terra \r\nSentire il mio pass...,Lo prendo come un attestato di stima\nPer la s...
1,9420092,False,31,Quello che gli altri pensano della tua persona...,Lovely zucca\nSta cambiando proprio il tempo\n...
2,11749412,False,33,Love me? Great. Hate me? Even better. Think I'...,Guardi l’ottimo è uno dei motivi per cui ancor...
3,14088579,False,41,Sii il cambiamento che vuoi vedere nel mondo M...,"Io direi per lui, se generalizziamo ci dividia..."
4,14281831,True,44,at home I feel like a tourist.,"Ore 6, buio pesto. Ribadiamo con forza #TeamOr..."
...,...,...,...,...,...
1115,1492598652535197706,True,19,,Get Fortn1t3 from App Valley!
1116,1500578379036246016,False,33,,Lei non poteva andare perche e personaggio pub...
1117,1511727485737648132,True,57,,E tantissimi sono contro la GIÒ MELON DONNA FA...
1118,1578278161598521347,False,19,𝘕𝘢𝘳𝘤𝘰𝘭𝘦𝘱𝘴𝘺 𝘨𝘰𝘵 𝘮𝘦 𝘧𝘦𝘦𝘭𝘪𝘯𝘨 𝘴𝘵𝘢𝘨𝘦 𝘧𝘳𝘪𝘨𝘩𝘵\n\n✧˚ ༘...,"È stato un brutto sogno e basta, vero?\nÈ sett..."


In [34]:
def transform_df(df, N=100):
    # df bio
    df_bio = df[['user_id', 'masked_bio', 'age', 'is_male']].fillna('').drop_duplicates()
    
    # process tweets
    df_text = df[['user_id', 'text', 'date', 'age', 'is_male']]
    
    
    # Merge df_bio with df_text_grouped
    result_df = pd.merge(df_bio, df_text_grouped, on='user_id', how='inner')

    return result_df

dft = transform_df(df, N=100)

KeyError: "['masked_bio', 'age', 'is_male'] not in index"

In [None]:
# clean_df = clean_df.merge(max_dates[['user_id','last_year']], on='user_id',how='inner',validate='1:1')

In [None]:
# df_all = pd.DataFrame()
# for regex in ["geb[\. ].*?([12][09][0-9][0-9])",
#               "geboren.+?([12][09][0-9][0-9])",
#               "born.+?([12][09][0-9][0-9])",
#               "birthday.+?([12][09][0-9][0-9])",
#               "bday.+?([12][09][0-9][0-9])",
#               "([12][09][0-9][0-9]).*?geb[\. ]",
#               "([12][09][0-9][0-9]).*?geboren",
#               "\*.*?([12][09][0-9][0-9])",
#               "([1-9][0-9]) jahre alt"]:
    
#     d = clean_df.bio.str.extract(regex)
#     df = clean_df.assign(age = d.astype("Int64"))
#     if regex != "([1-9][0-9]) jahre alt":
#         df.age = df.last_year-df.age
    
#     df_all= pd.concat([df_all,df.loc[df.age.notnull()]])

# df_all = df_all.drop_duplicates("user_id")

In [None]:
df_all = df_all.drop_duplicates("user_id")

In [None]:
df = df_all.merge(df_tweets[['text','date','user_id']],on='user_id',how='inner',validate='1:m')

In [None]:
df['RT'] = df['text'].str.startswith('RT @')

In [None]:
df = df[df['RT']==False].copy()

In [None]:
df = df[-(df['age']>100) & (df['age']>=10)].copy()

In [None]:
#remove mentions, urls and empty tweets
pp = preprocessor(mention=True,url=True,unix=False,emoji=False,hashtag=False)
df['text']= df['text'].apply(pp.process_tweet)
df = df[df['text']!=''].copy()

In [None]:
def transform_df(df, N=100):
    # df bio
    df_bio = df[['user_id', 'masked_bio', 'age', 'is_male']].fillna('').drop_duplicates()
    
    # process tweets
    df_text = df[['user_id', 'text', 'date', 'age', 'is_male']]
    
    # keep the N most recent text_masked
    df_text['rank'] = df_text.groupby('user_id')['date'].rank(method='first', ascending=False)
    
    # Filter out entries with rank greater than N
    df_text = df_text[df_text['rank'] <= N]
    
    df_text_grouped = df_text.groupby('user_id')['text'].agg(lambda x: '\n'.join(x)).reset_index()
    df_text_grouped = df_text_grouped.rename(columns={'text': 'long_text'})
    
    # Merge df_bio with df_text_grouped
    result_df = pd.merge(df_bio, df_text_grouped, on='user_id', how='inner')

    return result_df

dft = transform_df(df, N=100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['rank'] = df_text.groupby('user_id')['date'].rank(method='first', ascending=False)


In [None]:
dft.to_pickle(os.path.join(uc_dir,'data_for_models_german_data.pkl'))

In [None]:
dft.to_pickle('/g100/home/userexternal/pbose000/twitter_user_classification/data/user_classification/data_for_models_german_data.pkl')