# Preliminaries

In [1]:
import csv
import zlib

import numpy as np
import pandas as pd
import sklearn.preprocessing as pp
from scipy import sparse as sp
from scipy.sparse.linalg import norm

In [2]:
# Input data file (relative path); it is loaded with pd.read_csv in a later cell.
file = 'training_sample.tsv'

In [3]:
# Names for the columns of the header-less input file, listed in file order
# (they are passed positionally to pd.read_csv via `names=`).
column_names = [
    "text_tokens",
    "hashtags",
    "tweet_id",
    "present_media",
    "present_links",
    "present_domains",
    "tweet_type",
    "language",
    "tweet_timestamp",
    # columns prefixed with engaged_with_user_
    "engaged_with_user_id",
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    # columns prefixed with engaging_user_
    "engaging_user_id",
    "engaging_user_follower_count",
    "engaging_user_following_count",
    "engaging_user_is_verified",
    "engaging_user_account_creation",
    "engaged_follows_engaging",
    # engagement timestamp columns
    "reply_timestamp",
    "retweet_timestamp",
    "retweet_with_comment_timestamp",
    "like_timestamp",
]

In [4]:
# Read the header-less file; fields are separated by the \x01 control character
# and the columns are labelled with `column_names`.
df = pd.read_csv(
    file,
    sep='\x01',
    names=column_names,
    header=None,
)

# Preprocessing

In [5]:
# Split each row's tab-separated token string into a list of token strings.
# NOTE: this overwrites the column in place, so the cell is meant to run once per load.
df['text_tokens'] = df['text_tokens'].str.split('\t')

def to_hex_list(x):
    """Split a tab-separated value into a list of strings.

    ``x`` is converted with ``str`` before splitting. Despite the name, the
    pieces are returned as plain strings; they are not parsed as hexadecimal
    numbers.
    """
    return str(x).split('\t')

# Columns whose string cells are split on tabs into lists.
cols_to_process = [
    'hashtags',
    'present_media',
    'present_links',
    'present_domains',
]

for col in cols_to_process:
    # Non-string cells (e.g. missing values) are passed through unchanged.
    df[col] = df[col].apply(
        lambda cell: cell if not isinstance(cell, str) else to_hex_list(cell)
    )

    
    
# Columns holding epoch timestamps in seconds that are converted to pandas datetimes.
# NOTE(review): 'engaged_with_user_account_creation' is not in this list although
# 'engaging_user_account_creation' is - confirm whether that is intentional.
cols_to_process = [
    'tweet_timestamp',
    'engaging_user_account_creation',
    'reply_timestamp',
    'retweet_timestamp',
    'retweet_with_comment_timestamp',
    'like_timestamp',
]

for col in cols_to_process:
    # Vectorised conversion of the whole column (instead of building one
    # pd.Timestamp per row); missing values become NaT.
    df[col] = pd.to_datetime(df[col], unit='s')

In [6]:
# The four engagement timestamp columns (reply, retweet, retweet with comment, like).
targets = [
    'reply_timestamp',
    'retweet_timestamp',
    'retweet_with_comment_timestamp',
    'like_timestamp',
]

In [7]:
# Sorted arrays of the distinct tweet ids ("movie" names are kept for
# compatibility; here the items are tweets) and of the distinct user ids that
# appear either as engaging or as engaged-with user.
movieIds = np.sort(df.tweet_id.unique())
# pd.concat replaces Series.append, which was removed in pandas 2.0.
userIds = np.sort(pd.concat([df.engaging_user_id, df.engaged_with_user_id]).unique())

# Number of distinct users / tweets.
m = userIds.size
n = movieIds.size

# Two-way lookups between raw ids and their position in the sorted arrays.
movieId_to_movieIDX = {tweet_id: idx for idx, tweet_id in enumerate(movieIds)}
movieIDX_to_movieId = dict(enumerate(movieIds))

userId_to_userIDX = {user_id: idx for idx, user_id in enumerate(userIds)}
userIDX_to_userId = dict(enumerate(userIds))

In [8]:
# True for rows where at least one of the four engagement timestamps in
# `targets` is present.
df["react"] = df[targets].notnull().any(axis=1)

In [9]:
# Number of hash buckets per axis; every user id and tweet id is mapped to an
# integer in [0, HASH_BUCKETS). Distinct ids can collide in the same bucket.
HASH_BUCKETS = 10000


def _stable_bucket(raw_id):
    """Map an id to a bucket index in [0, HASH_BUCKETS).

    Uses CRC32 instead of the built-in ``hash``: ``hash`` of a string is
    salted per interpreter process, so bucket indices would change on every
    kernel restart and results would not be reproducible.
    """
    return zlib.crc32(str(raw_id).encode('utf-8')) % HASH_BUCKETS


# One row per input row: hashed user index, hashed tweet index and one boolean
# flag per engagement type (plus the combined "react" flag), sorted by user index.
ratings = pd.concat(
    [
        df['engaging_user_id'].apply(_stable_bucket),
        df['tweet_id'].apply(_stable_bucket),
        df['reply_timestamp'].notnull(),
        df['retweet_timestamp'].notnull(),
        df['retweet_with_comment_timestamp'].notnull(),
        df['like_timestamp'].notnull(),
        df["react"],
    ],
    axis=1,
).sort_values('engaging_user_id')

# Matrix Creation

In [10]:
def _engagement_matrix(values):
    """Build a CSR matrix holding ``values`` at (engaging_user_id, tweet_id) of ``ratings``."""
    return sp.csr_matrix((values, (ratings.engaging_user_id, ratings.tweet_id)))


# One user x tweet matrix per engagement type, plus the combined "react" matrix.
R_reply = _engagement_matrix(ratings.reply_timestamp)
R_retweet = _engagement_matrix(ratings.retweet_timestamp)
R_retweetwc = _engagement_matrix(ratings.retweet_with_comment_timestamp)
R_like = _engagement_matrix(ratings.like_timestamp)
R_react = _engagement_matrix(ratings.react)

In [11]:
# Row and column counts of the rating matrices; this replaces the values of
# m and n that were derived from the id arrays in an earlier cell.
m, n = R_reply.shape

# User Similarities

In [12]:
def compute_user_similarities(R, u_id):
    """Return the cosine similarity between row ``u_id`` and every row of ``R``.

    Parameters
    ----------
    R : scipy sparse matrix
        Rating matrix with one row per user. It is not modified.
    u_id : int
        Row index of the user to compare against.

    Returns
    -------
    numpy.ndarray
        1-D array with ``R.shape[0]`` entries; entry ``v`` is the dot product
        of the L2-normalised rows ``u_id`` and ``v``.
    """
    # normalize() works on a copy by default, so no explicit R.copy() is
    # needed and the caller's matrix stays untouched.
    R_normalized = pp.normalize(R, axis=1)

    # Row of the target user (slicing already returns a new matrix).
    u = R_normalized[u_id, :]

    # toarray().flatten() keeps one entry per user; reading .data of the sparse
    # product instead would drop the zero similarities and shorten the result.
    return R_normalized.dot(u.T).toarray().flatten()

# Neighborhoods

In [13]:
## default values (read by create_user_neighborhood at call time)
k = 5                 # maximum number of neighbors
with_abs_sim = False  # rank neighbors by |similarity| instead of signed similarity


def create_user_neighborhood(R, u_id, i_id):
    """Collect up to ``k`` of the most similar users that have an entry for item ``i_id``.

    Parameters
    ----------
    R : scipy sparse matrix
        Rating matrix with one row per user.
    u_id : int
        Row index of the target user; it is never part of the result.
    i_id : int
        Column index of the item the neighbors must have an entry for.

    Returns
    -------
    dict
        ``{user index: similarity}`` with at most ``k`` entries (``k`` and
        ``with_abs_sim`` are the module-level settings). The stored value is
        always the signed similarity, also when ranking by absolute value.
    """
    uU = compute_user_similarities(R, u_id)
    ranking_scores = np.absolute(uU) if with_abs_sim else uU

    # Candidates from most to least similar. u_id is removed explicitly: it is
    # not guaranteed to be ranked first (other users can tie with it, and an
    # all-zero row has similarity 0 with itself), so dropping the first entry
    # could discard a real neighbor and keep u_id instead.
    candidates = np.argsort(ranking_scores)[::-1]
    candidates = candidates[candidates != u_id]

    # "Has rated" means: the DOK view holds an entry for (user, item).
    R_dok = R.todok()

    nh = {}  ## the neighborhood dict with (user id: similarity) entries
    for neighbor in candidates:
        if len(nh) >= k:
            break
        if (neighbor, i_id) in R_dok:
            nh[neighbor] = uU[neighbor]

    return nh

# Rating Prediction

In [14]:
def predict_rating(R, u_id, i_id, verbose=True):
    """Predict the value of ``R[u_id, i_id]`` from the user's neighborhood.

    The prediction is the similarity-weighted sum of the neighbors' entries
    for ``i_id`` divided by the sum of absolute similarities plus a damping
    term of 0.5. The damping term avoids a division by zero when the
    neighborhood is empty (the prediction is then 0.0).

    Parameters
    ----------
    R : scipy sparse matrix
        Rating matrix with one row per user and one column per item.
    u_id : int
        Row index of the user.
    i_id : int
        Column index of the item.
    verbose : bool, default True
        Print whether the user already has an entry for the item and the
        prediction next to the user's average. Pass ``False`` to predict
        silently (e.g. when calling this in a loop).

    Returns
    -------
    float
        The predicted value.
    """
    if verbose:
        if (u_id, i_id) in R.todok():
            print("user", u_id, "has rated item", i_id, "with", R[u_id, i_id])
        else:
            print("user", u_id, "has not rated item", i_id)

    nh = create_user_neighborhood(R, u_id, i_id)

    # Damping term added to the denominator (see docstring).
    damping = 0.5

    weighted_sum = 0
    similarity_sum = damping
    for neighbor, similarity in nh.items():
        weighted_sum = weighted_sum + similarity * R[neighbor, i_id]
        similarity_sum = similarity_sum + abs(similarity)

    prediction = weighted_sum / similarity_sum

    if verbose:
        # Diagnostic value for the requested user only: sum of the row divided
        # by (number of non-zero entries + 1). Note that the divisor counts
        # non-zero entries, not all tweets of the user.
        user_row = R[u_id, :]
        user_avg = user_row.sum() / ((user_row != 0).sum() + 1)
        print(f'prediction {prediction:.4f} (user_avg {user_avg:.4f})')

    return prediction

In [15]:
# Override the module-level neighborhood size (default 5) that
# create_user_neighborhood reads; re-running the cell that defines it resets k to 5.
k = 10
# Predict on the combined "react" matrix for row index 0 and column index 6800.
predict_rating(R_react,0, 6800)

user 0 has not rated item 6800
prediction 0.0000 (user_avg 0.5000)


0.0