In [8]:
import os 
import pandas as pd
import re

In [32]:
class preprocessor():
    """
    Pre-processor for tweets. Cleans mentions,
    urls, emojis, hashtags and unix characters.
    """
    def __init__(self,mention=True,url=True,unix=True,emoji=True,hashtag=False):
        self.mention = mention
        self.url = url
        self.unix = unix
        self.emoji = emoji
        self.hashtag = hashtag

    def remove_mentions(self,tweet):
        # Remove @-mentions using regular expression
        cleaned_tweet = re.sub(r'@\w+', '', tweet).strip().replace('  ',' ')
        return cleaned_tweet
    
    def remove_hashtags(self,tweet):
        # Remove hashtags using regular expression
        cleaned_tweet = re.sub(r'#\w+', '', tweet)
        return cleaned_tweet
    
    def remove_unix(self,tweet):
        # Remove unix characters using regular expression
        cleaned_tweet = re.sub(r'[\n\r\t\b\f\v]', '', tweet)
        return cleaned_tweet
    
    def remove_urls(self,tweet):
        # Remove URLs using regular expression
        cleaned_tweet = re.sub(r'http\S+|www\S+', '', tweet)
        return cleaned_tweet
    
    def remove_emojis(self,tweet):
        # Remove emojis using regular expression
        emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
        cleaned_tweet = re.sub(emoji_pattern, '', tweet)
        return cleaned_tweet

    def strip_tweet(self,tweet):
        return tweet.strip().replace('  ',' ')
    
    def process_tweet(self,tweet): 
        if self.mention:
            tweet = self.remove_mentions(tweet)
        if self.hashtag:
            tweet = self.remove_hashtags(tweet)
        if self.unix:
            tweet = self.remove_unix(tweet)
        if self.emoji:
            tweet = self.remove_emojis(tweet)
        if self.url:
            tweet = self.remove_urls(tweet)
        tweet = self.strip_tweet(tweet)
        return tweet

    def process_list(self,tweets):
        return [self.process_tweet(tweet) for tweet in tweets]
    
    def process_column(self,tweets):
        return pd.Series(self.process_list(list(tweets)))

In [9]:
work_dir = '/g100_work/IscrC_mental'
wdata_dir = os.path.join(work_dir, 'data')
uc_dir = os.path.join(os.path.join(wdata_dir, 'user_classification'),'german_data')

In [10]:
df = pd.read_pickle(os.path.join(uc_dir,"german_users_with_birthyear.pkl"))

In [11]:
df_tweets = pd.read_pickle(os.path.join(uc_dir,"german_tweets.pkl"))

In [12]:
#restrict to users for which there is a gender and birth year
df.loc[df['male']==1,'is_male'] = 1
df.loc[df['female']==1,'is_male'] = 0
clean_df = df.loc[(df['is_male'].notna()) & (df['birthyear'].notna())].copy()
#manual correction
clean_df.loc[clean_df['user_id']==803169463197691904,'is_male']=1
clean_df = clean_df[['user_id','is_male','birthyear','bio']]

In [13]:
#select last date of tweeting
df_tweets['date']=pd.to_datetime(df_tweets.created_at)
max_dates = df_tweets.groupby('user_id').agg({'date':max}).reset_index()
max_dates['last_year'] = max_dates['date'].dt.year

In [14]:
clean_df = clean_df.merge(max_dates[['user_id','last_year']], on='user_id',how='inner',validate='1:1')

In [15]:
df_all = pd.DataFrame()
for regex in ["geb[\. ].*?([12][09][0-9][0-9])",
              "geboren.+?([12][09][0-9][0-9])",
              "born.+?([12][09][0-9][0-9])",
              "birthday.+?([12][09][0-9][0-9])",
              "bday.+?([12][09][0-9][0-9])",
              "([12][09][0-9][0-9]).*?geb[\. ]",
              "([12][09][0-9][0-9]).*?geboren",
              "\*.*?([12][09][0-9][0-9])",
              "([1-9][0-9]) jahre alt"]:
    
    d = clean_df.bio.str.extract(regex)
    df = clean_df.assign(age = d.astype("Int64"))
    if regex != "([1-9][0-9]) jahre alt":
        df.age = df.last_year-df.age
    
    df_all= pd.concat([df_all,df.loc[df.age.notnull()]])

df_all = df_all.drop_duplicates("user_id")

In [16]:
df_all['birthyear'] = df_all.last_year-df_all.age

In [17]:
def mask_bio(text):
    for regex in ["geb[\. ].*?([12][09][0-9][0-9])",
              "geboren.+?([12][09][0-9][0-9])",
              "born.+?([12][09][0-9][0-9])",
              "birthday.+?([12][09][0-9][0-9])",
              "bday.+?([12][09][0-9][0-9])",
              "([12][09][0-9][0-9]).*?geb[\. ]",
              "([12][09][0-9][0-9]).*?geboren",
              "\*.*?([12][09][0-9][0-9])",
              "([1-9][0-9]) jahre alt"]:
        text = re.sub(regex,'',text)
    return text

In [18]:
df_all['masked_bio'] = df_all.bio.apply(mask_bio)

In [62]:
df = df_all.merge(df_tweets[['text','date','user_id']],on='user_id',how='inner',validate='1:m')

In [63]:
df['RT'] = df['text'].str.startswith('RT @')

In [64]:
df = df[df['RT']==False].copy()

In [65]:
df = df[-(df['age']>100) & (df['age']>=10)].copy()

In [68]:
#remove mentions, urls and empty tweets
pp = preprocessor(mention=True,url=True,unix=False,emoji=False,hashtag=False)
df['text']= df['text'].apply(pp.process_tweet)
df = df[df['text']!=''].copy()

In [69]:
def transform_df(df, N=100):
    # df bio
    df_bio = df[['user_id', 'masked_bio', 'age', 'is_male']].fillna('').drop_duplicates()
    
    # process tweets
    df_text = df[['user_id', 'text', 'date', 'age', 'is_male']]
    
    # keep the N most recent text_masked
    df_text['rank'] = df_text.groupby('user_id')['date'].rank(method='first', ascending=False)
    
    # Filter out entries with rank greater than N
    df_text = df_text[df_text['rank'] <= N]
    
    df_text_grouped = df_text.groupby('user_id')['text'].agg(lambda x: '\n'.join(x)).reset_index()
    df_text_grouped = df_text_grouped.rename(columns={'text': 'long_text'})
    
    # Merge df_bio with df_text_grouped
    result_df = pd.merge(df_bio, df_text_grouped, on='user_id', how='inner')

    return result_df

dft = transform_df(df, N=100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['rank'] = df_text.groupby('user_id')['date'].rank(method='first', ascending=False)


In [70]:
dft.to_pickle(os.path.join(uc_dir,'data_for_models_german_data.pkl'))

In [73]:
dft.to_pickle('/g100/home/userexternal/pbose000/twitter_user_classification/data/user_classification/data_for_models_german_data.pkl')