# Recommender System for Reddit Users

The problem statement and data can be downloaded from this [link](https://www.kaggle.com/colemaclean/subreddit-interactions)

In [1]:
import datetime
import implicit
import random
import time
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from scipy import stats,sparse
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from implicit.als import AlternatingLeastSquares
%matplotlib inline

def timeit(method):
    """
    Decorator that measures the wall-clock run time of ``method``.

    If the decorated callable is invoked with a ``log_time`` keyword
    argument (a dict), the elapsed time in whole milliseconds is stored
    in that dict under ``log_name`` (or the upper-cased function name when
    ``log_name`` is absent). Otherwise the timing is printed.

    Note: ``log_time`` / ``log_name`` are forwarded unchanged to
    ``method``, so a callable timed this way must accept them.

    Returns the wrapped callable; the wrapped call returns whatever
    ``method`` returns.
    """
    # Function-scope import keeps this block self-contained.
    import functools

    @functools.wraps(method)  # keep __name__/__doc__ of the wrapped callable
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        elapsed_ms = (time.time() - ts) * 1000
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int(elapsed_ms)
        else:
            print('%r  %2.2f ms' % (method.__name__, elapsed_ms))
        return result
    return timed

In [17]:
class reddit_rec():
    """
    Implicit-feedback subreddit recommender built on user/subreddit counts.

    Typical usage: ``reddit_rec(path).preprocess()``, then
    ``train_recommender()`` and finally ``calc_mean_auc()``.
    """

    @timeit
    def __init__(self, path):
        '''
        Initialises the class, reads in the data, generates counts

        path: directory containing ``reddit_data.csv``; the file must
              provide ``username`` and ``subreddit`` columns.
        '''
        df = pd.read_csv(path + '/reddit_data.csv')
        # One row per (username, subreddit) pair; 'counts' is how many raw rows the pair had
        self.df = df.groupby(['username', 'subreddit'])['subreddit'].count().reset_index(name = 'counts')
        self.n_users = self.df.username.nunique()
        self.n_items = self.df.subreddit.nunique()

    def __repr__(self):
        '''
        Printable representation of the class
        '''
        repr_str = "\nNum. of Users: {}\nNum of subreddits: {}"\
              .format(self.n_users, self.n_items)
        return repr_str

    def _reduce_sparsity(self, min_count=5):
        '''
        Removes sparse subreddits and users

        A single pass is made: subreddits are filtered first, then users
        are filtered on what remains.

        min_count: rows are kept only when their subreddit (resp. user)
                   appears in strictly more than ``min_count`` rows.
        '''
        # keep subreddits with more than min_count distinct (user, subreddit) rows
        users_per_item = self.df.groupby('subreddit')['username'].transform('count')
        self.df = self.df[users_per_item > min_count].reset_index(drop=True)

        # keep users with more than min_count remaining subreddits
        items_per_user = self.df.groupby('username')['subreddit'].transform('count')
        self.df = self.df[items_per_user > min_count].reset_index(drop=True)

        # Update counts
        self.n_users = self.df.username.nunique()
        self.n_items = self.df.subreddit.nunique()
        return None

    def _label_encode(self):
        '''
        Encodes users and subreddits as consecutive integer ids

        The fitted encoders are kept in ``self.encoders`` (keys
        ``'username'`` and ``'subreddit'``) so ids can be mapped back to
        names with ``inverse_transform``.
        '''
        self.encoders = defaultdict(LabelEncoder)
        for column in ('username', 'subreddit'):
            self.df[column] = self.encoders[column].fit_transform(self.df[column])
        return None

    def _train_test_split(self, ratings, split_count, fraction=None):
        """
        Split recommendation data into train and test sets

        ratings:     sparse user x item matrix of interaction counts.
        split_count: number of interactions to hide per selected user.
        fraction:    share of all users to select; only users with at
                     least ``2 * split_count`` interactions are eligible.
                     When falsy, every user is selected.

        Returns ``(train, test, user_index)``: ``train`` is a CSR copy of
        ``ratings`` with the hidden interactions zeroed, ``test`` is a copy
        of the full ``ratings`` with every non-zero set to 1, and
        ``user_index`` holds the users that had interactions hidden.

        Raises ValueError (from ``np.random.choice``) when there are not
        enough eligible users for the requested ``fraction``.
        """
        train = ratings.copy().tocoo()
        test = ratings.copy()

        if fraction:
            eligible_users = np.where(np.bincount(train.row) >= split_count * 2)[0]
            try:
                user_index = np.random.choice(
                    eligible_users,
                    replace=False,
                    size=np.int32(np.floor(fraction * train.shape[0]))
                ).tolist()
            except ValueError:
                print(('Not enough users with > {} '
                      'interactions for fraction of {}')\
                      .format(2 * split_count, fraction))
                raise
        else:
            user_index = range(train.shape[0])

        # LIL format supports the per-row item assignment below
        train = train.tolil()

        for user in user_index:
            test_ratings = np.random.choice(ratings.getrow(user).indices,
                                            size=split_count,
                                            replace=False)
            train[user, test_ratings] = 0

        # Binarise: test only records whether an interaction exists
        test[test != 0] = 1
        return train.tocsr(), test, user_index

    @timeit
    def preprocess(self, split_count=5, fraction=0.2):
        """
        Performs preprocessing and train test split

        Filters sparse rows, label-encodes users/subreddits, builds the
        sparse count matrix and stores ``self.train``, ``self.test`` and
        ``self.user_index``.

        split_count, fraction: forwarded to ``_train_test_split``.

        Returns ``self``.
        """
        self._reduce_sparsity()
        self._label_encode()
        X = csr_matrix((self.df['counts'], (self.df['username'], self.df['subreddit'])), shape=(self.n_users, self.n_items))
        self.train, self.test, self.user_index = self._train_test_split(X, split_count, fraction=fraction)
        return self

    def _auc_score(self, predictions, test):
        '''
        Returns the area under the ROC curve of ``predictions`` against the
        binary labels ``test``, using sklearn's metrics.
        '''
        fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
        return metrics.auc(fpr, tpr)

    @timeit
    def calc_mean_auc(self):
        '''
        Calculates the mean AUC over every user in ``self.user_index``
        (the users that had interactions hidden from the training set).

        Requires ``preprocess`` and ``train_recommender`` to have run.

        Returns ``(model_auc, popularity_auc)``, both rounded to three
        decimal places.
        '''
        store_auc = [] # AUC of the model for each evaluated user
        popularity_auc = [] # AUC of the popularity baseline for each evaluated user
        # Per-item interaction totals used as the popularity score.
        # NOTE(review): taken from self.test, which also contains the hidden interactions.
        pop_items = np.array(self.test.sum(axis = 0)).reshape(-1)
        item_vecs = csr_matrix(self.model.item_factors.T)
        # Loop-invariant: convert the user factors once instead of once per user
        user_vecs = csr_matrix(self.model.user_factors)
        for user in self.user_index:
            training_row = self.train[user,:].toarray().reshape(-1)
            # Items with no interaction in the training row (includes the hidden ones)
            zero_inds = np.where(training_row == 0)
            # Predicted scores for those items from the user/item factors
            user_vec = user_vecs[user,:]
            pred = user_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
            # Binarised ground truth for the same items, from the full data
            actual = self.test[user,:].toarray()[0,zero_inds].reshape(-1)
            # Popularity score for the same items
            pop = pop_items[zero_inds]
            store_auc.append(self._auc_score(pred, actual))
            popularity_auc.append(self._auc_score(pop, actual))

        return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))

    @timeit
    def train_recommender(self, alpha = 40, model = None):
        """
        Trains a matrix factorization based recommender system for implicit feedback datasets

        alpha: factor the training counts are multiplied by before fitting.
        model: estimator to fit; it is deep-copied, so the object passed
               in is left untouched. Defaults to
               ``AlternatingLeastSquares(factors=20, regularization=0.1, iterations=15)``.

        The fitted model is stored in ``self.model``.
        """
        if model is None:
            # Built per call rather than once at class-definition time
            model = AlternatingLeastSquares(factors=20, regularization = 0.1, iterations=15)
        self.model = copy.deepcopy(model)
        # fit() receives the transposed (item x user) training matrix scaled by alpha.
        # NOTE(review): the expected orientation depends on the installed implicit version - confirm.
        self.model.fit((self.train.T * alpha).astype('double'))



## EDA

In [18]:
rd = reddit_rec('../data')

'__init__'  9223.69 ms


In [19]:
rd


Num. of Users: 22610
Num of subreddits: 34967

In [21]:
rd.df.head(10)

Unnamed: 0,username,subreddit,counts
0,--ANUSTART-,AOImmortals,2
1,--ANUSTART-,Addons4Kodi,1
2,--ANUSTART-,AdviceAnimals,7
3,--ANUSTART-,AskReddit,14
4,--ANUSTART-,Assistance,9
5,--ANUSTART-,CombatFootage,1
6,--ANUSTART-,Documentaries,1
7,--ANUSTART-,FantasyPL,3
8,--ANUSTART-,FiftyFifty,1
9,--ANUSTART-,Fitness,7


In [8]:
# Top Subreddits
rd.df.groupby('subreddit')['counts'].sum().sort_values(ascending=False).head(10)

subreddit
AskReddit          1030290
politics            367860
The_Donald          216939
nfl                 173883
leagueoflegends     157663
worldnews           156605
funny               152921
nba                 150985
pics                143496
news                140492
Name: counts, dtype: int64

In [10]:
# Number of users posting in each subreddit
rd.df.groupby('subreddit')['username'].count().describe()

count    34967.000000
mean        25.522979
std        227.224157
min          1.000000
25%          1.000000
50%          2.000000
75%          7.000000
max      14491.000000
Name: username, dtype: float64

In [11]:
# Number of subreddits each user posts in
rd.df.groupby('username')['subreddit'].count().describe()

count    22610.000000
mean        39.472004
std         40.855805
min          1.000000
25%         10.000000
50%         28.000000
75%         56.000000
max        723.000000
Name: subreddit, dtype: float64

In [22]:
# Sparsity = 1 - (rows in rd.df) / (n_users * n_items), rounded to 3 decimals.
# rd.df holds one row per (username, subreddit) pair, so this is the share of
# user/subreddit combinations with no recorded interaction.
sparsity=round(1.0-len(rd.df)/float(rd.n_users * rd.n_items),3)
print('The sparsity level of this dataset is ' +  str(sparsity*100) + '%')

The sparsity level of this dataset is 99.9%


## Preprocessing

In [23]:
rd.preprocess()

'preprocess'  4267.78 ms



Num. of Users: 18622
Num of subreddits: 9645

In [24]:
# Same sparsity measure as before, recomputed on the current rd.df and counts:
# 1 - (rows in rd.df) / (n_users * n_items), rounded to 3 decimals.
sparsity=round(1.0-len(rd.df)/float(rd.n_users * rd.n_items),3)
print('The sparsity level of this dataset is ' +  str(sparsity*100) + '%')

The sparsity level of this dataset is 99.5%


In [25]:
rd.df.head()

Unnamed: 0,username,subreddit,counts
0,0,144,1
1,0,160,7
2,0,388,14
3,0,408,9
4,0,980,1


In [26]:
# The altered version of the original data with a certain percentage of the user-item pairs 
# that originally had interaction set back to zero
rd.train

<18622x9645 sparse matrix of type '<type 'numpy.int64'>'
	with 818210 stored elements in Compressed Sparse Row format>

In [27]:
# A copy of the original ratings matrix, unaltered, so it can be used to see how the rank order 
# compares with the actual interactions
rd.test

<18622x9645 sparse matrix of type '<type 'numpy.int64'>'
	with 836830 stored elements in Compressed Sparse Row format>

## Train Recommender System

In [28]:
# Fit ALS with 20 factors and 0.1 regularization for 50 iterations; alpha keeps its default value
rd.train_recommender(model = AlternatingLeastSquares(factors=20, regularization = 0.1, iterations=50))



'train_recommender'  22847.00 ms


## Evaluate Model

In [29]:
model_auc, pop_auc = rd.calc_mean_auc()

'calc_mean_auc'  63204.82 ms


In [30]:
print('Mean AUC for our model {}\nMean AUC for a popularity based model {}'.format(model_auc, pop_auc))

Mean AUC for our model 0.932
Mean AUC for a popularity based model 0.897


## Limitations
* Recommending items to a new user requires re-training the whole model.