# Reader Popularity

This notebook computes reader popularity features (each reader's historical reply, retweet, quote and like rates) using the val set.

In [3]:
import dask.dataframe as dd
import numpy as np
import pandas as pd
import gc
from tqdm.notebook import trange

In [4]:
# Notebook display limits for DataFrame output: rows, columns and cell text width.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 50

In [6]:
# Column name -> dtype mapping handed to pd.read_csv(dtype=...) when loading the parts.
column_type = {
    # Tweet-level columns
    'bert': str,
    'hashtags': str,
    'tweet_id': str,
    'media': str,
    'links': str,
    'domains': str,
    'type': str,
    'language': str,
    'timestamp': np.uint32,
    # AUTH_* columns
    'AUTH_user_id': str,
    'AUTH_follower_count': np.uint32,
    'AUTH_following_count': np.uint32,
    'AUTH_verified': bool,
    'AUTH_account_creation': np.uint32,
    # READ_* columns
    'READ_user_id': str,
    'READ_follower_count': np.uint32,
    'READ_following_count': np.uint32,
    'READ_verified': bool,
    'READ_account_creation': np.uint32,
    # Relation between the two accounts
    'auth_follows_read': bool,
    # Engagement timestamps; read as floats so that missing values can be represented as NaN
    'reply_timestamp': np.float32,
    'retweet_timestamp': np.float32,
    'quote_timestamp': np.float32,
    'like_timestamp': np.float32,
}

## Reader popularity

In [7]:
# READER POPULARITY
#
# Walks the parts in order. For every row it writes the reader's engagement
# rates (reply / retweet / quote / like) computed only from that reader's
# earlier rows: earlier rows of the same part plus everything accumulated in
# `df_merge` from the previous parts. The current row is excluded from its own
# feature. After the loop `df_merge` holds, per reader, the total number of
# engagements of each kind and the total number of appearances.

label_cols = ['reply_timestamp', 'retweet_timestamp', 'quote_timestamp', 'like_timestamp']
total_cols = ['reply', 'retweet', 'quote', 'like']
running_cols = total_cols + ['READ_count_user_id']

df_merge = None  # per-reader running totals; None until the first part is processed
for idx in trange(504, 504+7*24):  # 168 consecutive parts, starting at part 504

    # Read a csv
    df = pd.read_csv(f'/data/recsys/sorted/part-{idx:05d}.csv',
                     usecols=['READ_user_id'] + label_cols,
                     dtype=column_type)

    # Transform timestamp to integer (1 if timestamp present, 0 otherwise)
    for label in label_cols:
        df[label] = (df[label] > 0).astype(int)

    # Cumulated sum (interactions are sorted by timestamp).
    # The label columns are selected explicitly: read_csv returns columns in
    # file order, not in `usecols` order, and the assignment below is positional.
    df[total_cols] = df.groupby('READ_user_id')[label_cols].cumsum()

    # Cumulated count: number of earlier rows of the same reader in this part (0-based)
    df['READ_count_user_id'] = df.groupby('READ_user_id').cumcount()

    if df_merge is not None:
        # Add what each reader accumulated in the previous parts. Reindexing
        # df_merge by the row-wise READ_user_id yields one history row per data
        # row, in the data's own order, so the addition is purely positional
        # and the rows keep their initial order. Readers not seen before get 0.
        history = df_merge.reindex(index=df['READ_user_id']).fillna(0)
        for col in running_cols:
            df[col] = df[col].to_numpy() + history[col].to_numpy()
        del history

    # Compute cumulated mean over the reader's previous rows only: the running
    # total minus the current row's own label, divided by the number of
    # previous appearances. A first appearance gives 0/0 = NaN, stored as 0.
    for label, total in zip(label_cols, total_cols):
        df[f'avg_{total}'] = ((df[total] - df[label]) / df['READ_count_user_id']).fillna(0)

    # Get sum per user so we know the last value for the next parts.
    # Rename columns so values will be added to the cumulative ones.
    df_tmp = (df.groupby('READ_user_id')[label_cols].sum()
                .rename(columns=dict(zip(label_cols, total_cols))))

    # Get the total count per user so we know the appearances of this user.
    # A plain column key is used: assigning a Series through a list key
    # (df_tmp[['col']] = series) is rejected by recent pandas versions.
    df_tmp['READ_count_user_id'] = df.groupby('READ_user_id').size()

    if df_merge is None:
        df_merge = df_tmp
    else:
        df_merge = df_merge.add(df_tmp, fill_value=0)

    df[['avg_reply', 'avg_retweet', 'avg_quote', 'avg_like']].to_csv(f'/data/recsys/read_pop/part-{idx:05d}.csv', index=False)
    del df_tmp

  0%|          | 0/168 [00:00<?, ?it/s]

In [8]:
# Persist the per-reader totals; the frame's index is written as the first CSV column.
df_merge.to_csv(path_or_buf='/data/recsys/read_pop/read_merge4w.csv', index=True)

In [9]:
# Reload the saved per-reader totals and turn each engagement total into a rate
# by dividing it by the reader's number of appearances.
df = pd.read_csv(
    '/data/recsys/read_pop/read_merge4w.csv',
    dtype={'reply': np.float32, 'retweet': np.float32, 'quote': np.float32, 'like': np.float32},
)
df = df.assign(**{
    kind: df[kind] / df['READ_count_user_id']
    for kind in ('reply', 'retweet', 'quote', 'like')
})

In [10]:
# Save the per-reader rates without the appearance counter.
df.drop('READ_count_user_id', axis='columns').to_csv(
    '/data/recsys/read_pop/read_merge4w_div.csv',
    index=False,
)

In [11]:
# Save the same per-reader rates again, this time keeping the appearance counter.
df.to_csv(path_or_buf='/data/recsys/read_pop/read_merge4w_div_counts.csv', index=False)