# Popularity Type

This notebook computes popularity features for both the reader and the author.

In [1]:
import dask.dataframe as dd
import numpy as np
import pandas as pd
import gc
from tqdm.notebook import trange

In [2]:
# Display limits for inspecting frames in this notebook: rows, columns and cell text width.
for _option_name, _option_value in {
    'display.max_rows': 100,
    'display.max_columns': 500,
    'display.max_colwidth': 50,
}.items():
    pd.set_option(_option_name, _option_value)
del _option_name, _option_value

## Reader / author popularity

In [3]:
# dtype map handed to pd.read_csv for the interaction part files.
column_type = {
    # Tweet-level text / categorical fields are read as strings.
    **dict.fromkeys(
        ('bert', 'hashtags', 'tweet_id', 'media', 'links', 'domains', 'type', 'language'),
        str,
    ),
    'timestamp': np.uint32,
    # The same five profile fields exist for the author (AUTH) and the reader (READ).
    **{
        f'{side}_{field}': kind
        for side in ('AUTH', 'READ')
        for field, kind in (
            ('user_id', str),
            ('follower_count', np.uint32),
            ('following_count', np.uint32),
            ('verified', bool),
            ('account_creation', np.uint32),
        )
    },
    'auth_follows_read': bool,
    # Engagement timestamps are floats (presumably so a missing engagement can be NaN).
    **dict.fromkeys(
        ('reply_timestamp', 'retweet_timestamp', 'quote_timestamp', 'like_timestamp'),
        np.float32,
    ),
}

In [9]:
# Side of the interaction to aggregate: 'AUTH' (author) or 'READ' (reader).
userid = 'AUTH'  #'AUTH', 'auth'  #'READ', 'read'
# Lower-case tag used in the output directory and file names.
userfo = userid.lower()

In [10]:
# Streaming per-user popularity features.
#
# The interaction parts are read one at a time. For every row we compute, per
# tweet type (T=TopLevel, R=Retweet, Q=Quote), how often the selected user
# (`userid`) was seen on *earlier* rows (`is_a*`) and which fraction of those
# earlier rows carried a reply / retweet / quote / like (`rep_a*`, `ret_a*`,
# `quo_a*`, `lik_a*`). `df_merge` carries the per-user totals of all parts
# processed so far, so the counts continue across part boundaries.
# Correctness relies on the parts (and the rows inside them) being sorted by
# timestamp.

# Feature prefix -> engagement timestamp column it is derived from.
engagement_cols = {
    'rep': 'reply_timestamp',
    'ret': 'retweet_timestamp',
    'quo': 'quote_timestamp',
    'lik': 'like_timestamp',
}
# Feature suffix -> value of the `type` column it stands for.
tweet_types = {'T': 'TopLevel', 'R': 'Retweet', 'Q': 'Quote'}


def _feature_names(infix):
    """Return the 15 feature names for one family, in the order used on disk.

    `infix` selects the family: '' for the per-row flags, 'c' for the
    cumulative counts and 'a' for the "seen before this row" features.
    The 12 engagement columns come first (grouped by tweet type), followed by
    the three `is_*` columns.
    """
    names = [f'{eng}_{infix}{code}' for code in tweet_types for eng in engagement_cols]
    names += [f'is_{infix}{code}' for code in tweet_types]
    return names


user_col = userid + '_user_id'
base_cols = _feature_names('')
cum_cols = _feature_names('c')
avg_cols = _feature_names('a')
base_to_cum = dict(zip(base_cols, cum_cols))

df_merge = None
df_none = True
for idx in trange(504, 504+7*24):

    # Read only the user key, the tweet type and the four engagement timestamps.
    df = pd.read_csv(f'/data/recsys/sorted/part-{idx:05d}.csv',
                     usecols=[user_col, 'type', *engagement_cols.values()],
                     dtype=column_type)

    # 0/1 flags: engagement present (timestamp > 0) on a tweet of the given type.
    for code, label in tweet_types.items():
        is_type = df['type'] == label
        for eng, ts_col in engagement_cols.items():
            df[f'{eng}_{code}'] = ((df[ts_col] > 0) & is_type).astype(np.uint16)

    # One-hot encoding of the tweet type.
    for code, label in tweet_types.items():
        df[f'is_{code}'] = (df['type'] == label).astype(np.uint16)

    df = df.drop(columns=['type', *engagement_cols.values()])

    # Running per-user counts inside this part (rows are sorted by timestamp).
    # The flags are widened to uint32 *before* accumulating: a uint16 counter
    # silently wraps once a user has more than 65535 rows in a single part.
    # The columns are matched by name, not by position.
    running = (df[base_cols].astype(np.uint32)
               .groupby(df[user_col]).cumsum()
               .astype(np.uint32)
               .rename(columns=base_to_cum))
    df[cum_cols] = running[cum_cols]

    if not df_none:
        df = df.set_index(user_col)

        # Add the totals carried over from the previous parts. The totals are
        # reindexed onto df's own (duplicated, ordered) index so that both
        # operands line up row by row and the original row order is kept.
        # Users not seen before get NaN from the reindex, which fill_value
        # turns into 0. Columns missing from df_merge (the per-row flags) are
        # likewise treated as 0, i.e. left unchanged.
        df = df.add(df_merge.reindex(df.index), fill_value=0)
        df = df.reset_index()

    for code in tweet_types:
        # Appearances of this user with this tweet type strictly before the row.
        seen_before = df[f'is_c{code}'] - df[f'is_{code}']
        df[f'is_a{code}'] = seen_before

        # Engagement rate over those earlier rows; 0/0 (never seen before) -> 0.
        for eng in engagement_cols:
            earlier_hits = df[f'{eng}_c{code}'] - df[f'{eng}_{code}']
            df[f'{eng}_a{code}'] = (earlier_hits / seen_before).fillna(0).astype(np.float16)

    # Per-user totals of this part, renamed to the cumulative column names so
    # they add straight onto the carried totals for the next parts.
    df_tmp = df[[user_col] + base_cols].groupby(user_col).sum().rename(columns=base_to_cum)

    if df_none:
        df_merge = df_tmp
    else:
        df_merge = df_merge.add(df_tmp, fill_value=0)

    df_none = False

    # One output file per input part, rows in the same order as the input.
    df[avg_cols].to_csv(f'/data/recsys/tef_{userfo}_tt/part-{idx:05d}.csv', index=False)
    del df_tmp

    # Periodic checkpoint of the running totals (every positive multiple of 50).
    if (idx%50 == 0) and (idx > 0):
        df_merge.to_csv(f'/data/recsys/tef_{userfo}_tt/{userfo}_merge4w.csv')

  0%|          | 0/168 [00:00<?, ?it/s]

In [11]:
# Persist the final per-user running totals (index = user id) after the loop.
merged_totals_path = f'/data/recsys/tef_{userfo}_tt/{userfo}_merge4w.csv'
df_merge.to_csv(merged_totals_path)

In [12]:
# Turn the saved per-user totals into per-user engagement rates:
# total engagements of a kind on a tweet type / total rows of that tweet type.
df = pd.read_csv(f'/data/recsys/tef_{userfo}_tt/{userfo}_merge4w.csv')

engagement_prefixes = ('rep', 'ret', 'quo', 'lik')
type_codes = 'TRQ'

# cols1: cumulative totals as stored in the file; cols2: the matching rate columns.
cols1 = [f'{prefix}_c{code}' for code in type_codes for prefix in engagement_prefixes]
cols2 = [f'{prefix}_a{code}' for code in type_codes for prefix in engagement_prefixes]

for total_col, rate_col in zip(cols1, cols2):
    # The last character of the total's name (T/R/Q) selects the matching row count.
    type_count = df[f'is_c{total_col[-1]}']
    df[rate_col] = df[total_col] / type_count

# Drop the raw totals; users with no rows of a type (0/0 -> NaN) get a rate of 0.
rates = df.drop(columns=cols1).fillna(0)
rates.to_csv(f'/data/recsys/tef_{userfo}_tt/{userfo}_tt_merge4w_div.csv', index=False)