# Author Popularity

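This notebook computes author popularity: for every interaction row, the author's average reply, retweet, quote and like rate over their earlier rows (excluding earlier rows of the same tweet), accumulated across the sorted part files.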

In [1]:
from dask.distributed import Client, LocalCluster
# Start a Dask cluster on the local machine and connect a client to it.
cluster = LocalCluster()
client = Client(cluster)
# NOTE(review): none of the cells visible in this notebook use `client` or `cluster`
# (the processing loop is plain pandas) — confirm whether this setup is still needed.
client.restart()

0,1
Client  Scheduler: tcp://127.0.0.1:36601  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 6  Cores: 24  Memory: 125.60 GiB


In [2]:
import dask.dataframe as dd
import numpy as np
import pandas as pd
import gc
from tqdm.notebook import trange

In [3]:
# Display limits for DataFrame previews in this notebook:
# up to 100 rows, 500 columns and 50 characters per cell.
pd.set_option(
    'display.max_rows', 100,
    'display.max_columns', 500,
    'display.max_colwidth', 50,
)

## Author popularity

In [6]:
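# Optional: resume from the last saved running totals instead of recomputing from part 0.
# The checkpoint is written with AUTH_user_id as its index, so it must be restored with index_col=0.
# The next cell resets df_merge/df_none and starts its loop at part 0; both have to be adjusted when resuming.
# df_merge = pd.read_csv('/data/recsys/auth_pop_tweet/auth_merge.csv', index_col=0)
# df_none = False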

In [None]:
# AUTHOR POPULARITY
#
# Streams the part files one at a time (rows are expected to be sorted by timestamp, see the
# input directory) and, for every row, computes the author's running engagement rates
# (reply / retweet / quote / like) over the rows seen *before* it, leaving out earlier rows of
# the same tweet. Per-author running totals are carried from one part to the next in `df_merge`.
#
# Outputs:
#   <OUTPUT_DIR>/part-XXXXX.csv  per-row features, same row order as the input part
#   <OUTPUT_DIR>/auth_merge.csv  checkpoint of the running totals (every CHECKPOINT_EVERY parts)
#   <OUTPUT_DIR>/auth_pop2w.csv  snapshot of the running totals after part SNAPSHOT_PART

N_PARTS = 504            # number of input part files (part-00000 ... part-00503)
SNAPSHOT_PART = 336      # running totals after this part are also saved as auth_pop2w.csv
CHECKPOINT_EVERY = 50    # save the running totals every this many parts
INPUT_PATTERN = '/data/recsys/sorted/part-{idx:05d}.csv'
OUTPUT_DIR = '/data/recsys/auth_pop_tweet'

INTERACTIONS = ['reply', 'retweet', 'quote', 'like']
TIMESTAMP_COLS = [f'{name}_timestamp' for name in INTERACTIONS]

df_merge=None
df_none=True
column_type = {
    'bert': str, 'hashtags': str, 'tweet_id': str, 'media': str, 'links': str, 'domains': str, 'type': str, 'language': str, 'timestamp': np.uint32,
    'AUTH_user_id':str,'AUTH_follower_count':np.uint32,'AUTH_following_count':np.uint32,'AUTH_verified':bool,'AUTH_account_creation':np.uint32,
    'READ_user_id': str,'READ_follower_count':np.uint32,'READ_following_count':np.uint32,'READ_verified':bool,'READ_account_creation':np.uint32,
    'auth_follows_read': bool,
    'reply_timestamp':np.float32,'retweet_timestamp':np.float32,'quote_timestamp':np.float32,'like_timestamp':np.float32
}

for idx in trange(0, N_PARTS):

    # Read only the columns needed for this feature
    df = pd.read_csv(INPUT_PATTERN.format(idx=idx),
                     usecols=['AUTH_user_id', 'tweet_id'] + TIMESTAMP_COLS,
                     dtype=column_type)

    # Turn each timestamp into an indicator (1 if the interaction happened, 0 otherwise;
    # a missing timestamp compares False and therefore becomes 0)
    for ts_col in TIMESTAMP_COLS:
        df[ts_col] = (df[ts_col] > 0).astype(int)

    # Running number of interactions per author, including the current row.
    # The indicator columns are selected explicitly: a bare groupby().cumsum() would also
    # try to accumulate the string `tweet_id` column and would depend on the CSV column order.
    author_running = df.groupby('AUTH_user_id')[TIMESTAMP_COLS].cumsum()
    for name, ts_col in zip(INTERACTIONS, TIMESTAMP_COLS):
        df[name] = author_running[ts_col]

    # Number of earlier rows of the same author / of the same tweet within this part
    df['AUTH_count_user_id'] = df.groupby('AUTH_user_id').cumcount()
    df['tweet_id_count'] = df.groupby('tweet_id').cumcount()

    # Number of earlier interactions on the same (author, tweet) within this part
    # (running sum including the current row, minus the current row)
    tweet_running = df.groupby(['AUTH_user_id', 'tweet_id'])[TIMESTAMP_COLS].cumsum()
    for name, ts_col in zip(INTERACTIONS, TIMESTAMP_COLS):
        df[f'{name}_id_count'] = tweet_running[ts_col] - df[ts_col]

    # NOTE(review): `tweet_id_count` and the `*_id_count` columns are computed per part only,
    # whereas the per-author totals below are carried across parts. If rows of one tweet can
    # span two part files, the same-tweet exclusion is incomplete for them — confirm.

    df = df.drop(columns=['tweet_id'])

    if not df_none:
        df = df.set_index('AUTH_user_id')

        # Add the totals accumulated over the previous parts to every row of the author.
        # Reindexing df_merge on df.index keeps the rows in their original order and leaves
        # NaN for authors not seen before, which fill_value=0 turns into "add nothing".
        df = df.add(df_merge.reindex(df.index), fill_value=0)
        df = df.reset_index()

    # Earlier rows of the author that belong to other tweets
    df['a_count'] = df['AUTH_count_user_id'] - df['tweet_id_count']

    # Running mean: earlier interactions on other tweets / earlier rows on other tweets.
    # 0/0 (no earlier rows) yields NaN and is reported as 0.
    for name, ts_col in zip(INTERACTIONS, TIMESTAMP_COLS):
        earlier_other_tweets = df[name] - df[f'{name}_id_count'] - df[ts_col]
        df[f'avg_{name}'] = (earlier_other_tweets / df['a_count']).fillna(0)

    # Per-author totals of this part, renamed so they add onto the cumulative columns
    df_tmp = df.groupby('AUTH_user_id')[TIMESTAMP_COLS].sum()
    df_tmp = df_tmp.rename(columns=dict(zip(TIMESTAMP_COLS, INTERACTIONS)))

    # Number of rows of each author in this part (aligned on the AUTH_user_id index)
    df_tmp['AUTH_count_user_id'] = df.groupby('AUTH_user_id').size()

    if df_none:
        df_merge = df_tmp
    else:
        df_merge = df_merge.add(df_tmp, fill_value=0)

    df_none = False

    df[['avg_reply', 'avg_retweet', 'avg_quote', 'avg_like', 'AUTH_count_user_id', 'a_count']].to_csv(f'{OUTPUT_DIR}/part-{idx:05d}.csv', index=False)
    del df_tmp

    # Snapshot of the running totals after SNAPSHOT_PART
    # (presumably the first two weeks of data, going by the file name — TODO confirm)
    if idx == SNAPSHOT_PART:
        df_merge.to_csv(f'{OUTPUT_DIR}/auth_pop2w.csv')

    # Periodic checkpoint of the running totals
    if (idx % CHECKPOINT_EVERY == 0) and (idx > 0):
        df_merge.to_csv(f'{OUTPUT_DIR}/auth_merge.csv')

  0%|          | 0/504 [00:00<?, ?it/s]

In [None]:
# Save the final per-author running totals, overwriting the checkpoint file written inside the loop.
df_merge.to_csv('/data/recsys/auth_pop_tweet/auth_merge.csv')