In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances 

In [2]:
follow_df = pd.read_csv("follows.csv", header=0, names=['follower_id', 'followee_id'], dtype=np.int32)

interest_df = pd.read_csv("interests.csv", header=0, names=['user_id', 'category'], 
                          dtype={'user_id':np.int32, 'category':np.str})

In [3]:
def strip(word):
    return word.strip()

# stripping spaces from strings in interest_df
interest_df['category'] = interest_df['category'].apply(strip)

In [4]:
follower_id = follow_df['follower_id']
followee_id = follow_df['followee_id']
interests_id = interest_df['user_id']

follower_s = set(follower_id)
followee_s = set(followee_id)
interests_id_s = set(interests_id)

# set of all user ids with no duplicates 
ids_s = follower_s | followee_s | interests_id_s

In [5]:
n_users = len(ids_s) 
largest_id = max(ids_s)
n_interests = interest_df['category'].nunique()

In [6]:
# users relationship matrix
user_mat = np.zeros((largest_id + 1, largest_id + 1))

# ratings[i,j] = 1 if user i follows user j
user_mat[follower_id, followee_id] = 1

indices = np.array(list(ids_s))

# only keeping the relevant rows and columns
user_mat = user_mat[indices, :]
user_mat = user_mat[:, indices]

# defining a dataframe for ratings 
users_df = pd.DataFrame(data=user_mat, index=indices, columns=indices, dtype=np.int32)

In [7]:
users_df.head()

Unnamed: 0,2,4,6,8,10,12,14,16,18,20,...,7388,7390,7392,7394,7396,7398,7400,7402,7404,7406
2,0,1,1,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,1,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,1,1,0,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
10,1,1,1,1,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [25]:
print("Number of follower-followee relationships:", users_df.sum().sum())

Number of follower-followee relationships: 49505


In [9]:
# users interest-category matrix
interest_df_grp = interest_df.groupby('user_id')['category'].apply(list)
interests_list = list(set(interest_df['category']))

cat_df = pd.DataFrame(np.zeros((n_users, n_interests)), index=users_df.index, 
                      columns=interests_list)

for index in interest_df_grp.index:
    for interest_cat in interest_df_grp[index]:
        cat_df.loc[index, interest_cat] = 1

cat_df = cat_df.astype(int)

In [10]:
cat_df.head()

Unnamed: 0,Zambia,Harare,Banking,Hong Kong,Basketball,Marines,Tibet,Loans,Milwaukee,Melanesia,...,Turkey,Cyprus,New England,Sculpture,Cologne,Gymnastics,Islam,Electricity,Ethics,Daegu
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
def follow_prob(users_df, cat_df):
    """Returns the probability of user i following user j in p[i, j]"""
    
    # create a ratings matrix based on user relationships and interest categories
    ratings_df = pd.concat([users_df, cat_df], axis=1)
    
    # calculate jaccard distances between users
    dist = pairwise_distances(ratings_df, metric='jaccard')
    
    # set division by zero nan values to the maximum value i.e 1
    dist[np.isnan(dist)] = 1
    
    #calculate similarities
    sim = 1 - dist
    sim = sim/sim.sum(axis=1)[:, None]
    
    # create the probability dataframe
    p = pd.DataFrame(sim.dot(users_df), index=users_df.index, columns=users_df.columns)
    
    return p

In [12]:
def make_test(users_df, n):
    """
    Creates a mock test dataset by randomly masking n followees for users who follow more than 20 other users
    This dataset will be used to test how many of these user ids the recommendation system will capture
    """
    
    # selecting user who have at least 20 followees
    test_users_df = users_df[users_df.sum(axis=1)>20]
    test_users_df = test_users_df.astype(int)
    
    # dictionary to store the followees for these users
    test_users_dict = {}
    
    # dictionary to store the masked followee ids
    test_zero_dict = {}
    
    for user in test_users_df.index:
        
        # values for test_users dictionary: their complete list of followees
        test_users_dict[user] = (test_users_df.columns.values[test_users_df.loc[user]==1])
        
        # values for test_zero_users dictionary: list of masked followees
        test_zero_dict[user] = random.sample(list(test_users_dict[user]), n)
        
    # setting zeros in the user matrix for the random list generated before 
    test_df = users_df.copy()
    for user in test_zero_dict:
        test_df.loc[user, test_zero_dict[user]] = 0
        
    return(test_users_dict, test_zero_dict, test_df)

In [13]:
def recommend_top(p, k, ind, followed):
    """
    Given probability matrix p, returns top k recommendations for the user index 'ind' 
    'followed' is the list of IDs that the user is currently following so they will not be considered for recommendation  
    """
    
    temp = p.loc[ind, ~p.columns.isin(followed)].nlargest(k)
    
    return temp.index.values
    

In [16]:
# defining functions to calculate precision and recalls for two input sets
def prec(predicted, correct):
    """Computes precision"""
    
    p = set(predicted)
    c = set(correct)
    return len(p & c)/len(p)

def rec(predicted, correct):
    """Computes recall"""
    
    p = set(predicted)
    c = set(correct)
    return len(p & c)/len(c)

In [17]:
def prec_recomm(users_df, cat_df, n, k):
    """ Calculates precision of the recommendation system  
        n is the numbers of masked followees
        recommends top k users
    """
    
    # make a test set
    test_users_dict, test_zero_dict, test_users_df = make_test(users_df, n)

    # calculate proabilities
    p = follow_prob(test_users_df, cat_df)
    
    prec_eval = []

    for user in test_zero_dict:
        
        # list of all followees of the user
        all_follow = test_users_dict[user]
        
        # keeping record of actual followees of the user which we have masked for testing
        actual_follow = test_zero_dict[user]
        
        # all the current followees of the users in the mock test set - these IDs will not be recommended to the user again
        already_follow = list(set(all_follow) - set(actual_follow))
            
        recom_follow = recommend_top(p, k, user, already_follow)
        
        prec_eval.append(prec(recom_follow, actual_follow))
        
    return prec_eval

In [22]:
def recall_recomm(users_df, cat_df, n, k):
    """ Calculates recall of the recommendation system  
        n is the numbers of masked followees
        recommends top k users
    """
    
    # make a test set
    test_users_dict, test_zero_dict, test_users_df = make_test(users_df, n)

    # calculate proabilities
    p = follow_prob(test_users_df, cat_df)
    
    recall_eval = []

    for user in test_zero_dict:
        
        # list of all followees of the user
        all_follow = test_users_dict[user]
        
        # keeping record of actual followees of the user which we have masked for testing
        actual_follow = test_zero_dict[user]
        
        # all the current followees of the users in the mock test set - these IDs will not be recommended to the user again
        already_follow = list(set(all_follow) - set(actual_follow))
            
        recom_follow = recommend_top(p, k, user, already_follow)
        
        recall_eval.append(rec(recom_follow, actual_follow))
        
    return recall_eval

In [19]:
for k in [1,5,10,20]:
    
    p_list = prec_recomm(users_df, cat_df, 10, k)

    print("average precision @ {}: {:1.2f}".format(k, np.mean(p_list)))



average precision @ 1: 0.61
average precision @ 5: 0.37
average precision @ 10: 0.25
average precision @ 20: 0.17


In [23]:
for k in [10,20]:
    
    r_list = recall_recomm(users_df, cat_df, 10, k)

    print("average recall @ {}: {:1.2f}".format(k, np.mean(r_list)))



average recall @ 10: 0.26
average recall @ 20: 0.34
