# Recommending subreddit to users using Collaborative Filtering

## Importing necessary Modules

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

## Loading Reddit dataset

In [2]:
df = pd.read_csv('reddit_data.csv')
df.head()

Unnamed: 0,username,subreddit,utc
0,kabanossi,photoshopbattles,1482748000.0
1,kabanossi,GetMotivated,1482748000.0
2,kabanossi,vmware,1482748000.0
3,kabanossi,carporn,1482748000.0
4,kabanossi,DIY,1482747000.0


In [3]:
df.shape

(14000000, 3)

In [4]:
df.drop('utc',axis=1, inplace=True)

In [5]:
df1 = df.iloc[:350000,:]
df1.head()

Unnamed: 0,username,subreddit
0,kabanossi,photoshopbattles
1,kabanossi,GetMotivated
2,kabanossi,vmware
3,kabanossi,carporn
4,kabanossi,DIY


In [6]:
subreddits_per_user1 = df1.groupby(['username'])['subreddit'].value_counts()
subreddits_df1 = pd.DataFrame(subreddits_per_user1)
subreddits_df1
new_1 = subreddits_df1.reset_index(level = 0, drop = False)
new_1.rename(columns = {'subreddit':'counts'}, inplace=True)
new_1 = new_1.reset_index()
new_1

Unnamed: 0,subreddit,username,counts
0,worldnews,-SA-HatfulOfHollow,11
1,news,-SA-HatfulOfHollow,1
2,reddevils,-SA-HatfulOfHollow,1
3,soccer,-SA-HatfulOfHollow,1
4,AskReddit,-Stormcloud-,38
5,pokemongo,-Stormcloud-,26
6,pokemon,-Stormcloud-,24
7,asoiaf,-Stormcloud-,21
8,WoT,-Stormcloud-,20
9,friendsafari,-Stormcloud-,20


In [7]:
sub_df = pd.DataFrame(new_1['subreddit'])
sub_df

Unnamed: 0,subreddit
0,worldnews
1,news
2,reddevils
3,soccer
4,AskReddit
5,pokemongo
6,pokemon
7,asoiaf
8,WoT
9,friendsafari


# Finding Item Similarity

## Let's create a pivot table of subreddits to Users

### The rows are subreddit and columns are username. And the values in the matrix are the counts for a specific subreddit by a specific user.

In [8]:
rating_mat = new_1.pivot( index='subreddit', columns='username', values = "counts" ).reset_index(drop=True)

### Fill with 0, where users have not subscribed the subreddit

In [9]:
rating_mat.fillna( 0, inplace = True )

In [10]:
rating_mat.shape

(5508, 550)

In [11]:
rating_mat.head(4100)

username,-SA-HatfulOfHollow,-Stormcloud-,-_-_-_-otalp-_-_-_-,-goodguygeorge,01is,0mn17h3047,2d2c,414D59,62718743217326214821,777louisdeal,...,xkcd_transcriber,xxsandmanxx,yd-oc,yescalculators,ygtsrt,yzlautum,zaviex,zerkle,zestysock,zombiegamer723
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Calculating the item distances and similarities

In [12]:
subreddit_sim = 1 - pairwise_distances( rating_mat.as_matrix(), metric="correlation" )

In [13]:
subreddit_sim.shape

(5508, 5508)

In [14]:
subreddit_sim_df = pd.DataFrame( subreddit_sim )

In [15]:
subreddit_sim_df.head(30)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5498,5499,5500,5501,5502,5503,5504,5505,5506,5507
0,1.0,-0.002097,-0.002727,-0.002097,-0.002369,-0.002097,-0.002097,-0.002097,-0.002097,0.146028,...,-0.005625,-0.00279,0.048156,-0.002428,-0.002544,-0.003446,-0.002097,-0.002097,-0.002097,-0.002097
1,-0.002097,1.0,-0.002369,-0.001821,-0.002058,-0.001821,-0.001821,-0.001821,-0.001821,-0.002749,...,-0.004887,-0.002424,-0.005811,-0.002109,-0.00221,-0.002993,-0.001821,-0.001821,-0.001821,-0.001821
2,-0.002727,-0.002369,1.0,-0.002369,-0.002677,-0.002369,-0.002369,-0.002369,-0.002369,-0.003576,...,-0.006356,-0.003152,-0.007559,-0.002743,-0.002874,-0.003893,-0.002369,-0.002369,-0.002369,-0.002369
3,-0.002097,-0.001821,-0.002369,1.0,-0.002058,-0.001821,-0.001821,-0.001821,-0.001821,-0.002749,...,-0.004887,-0.002424,-0.005811,-0.002109,-0.00221,-0.002993,-0.001821,-0.001821,-0.001821,-0.001821
4,-0.002369,-0.002058,-0.002677,-0.002058,1.0,-0.002058,-0.002058,0.996692,-0.002058,-0.003106,...,-0.005521,-0.002738,-0.006566,-0.002383,-0.002497,-0.003382,-0.002058,-0.002058,-0.002058,-0.002058
5,-0.002097,-0.001821,-0.002369,-0.001821,-0.002058,1.0,-0.001821,-0.001821,-0.001821,-0.002749,...,0.239456,-0.002424,-0.005811,-0.002109,-0.00221,-0.002993,-0.001821,-0.001821,-0.001821,-0.001821
6,-0.002097,-0.001821,-0.002369,-0.001821,-0.002058,-0.001821,1.0,-0.001821,-0.001821,-0.002749,...,-0.004887,-0.002424,-0.005811,-0.002109,-0.00221,-0.002993,-0.001821,-0.001821,-0.001821,-0.001821
7,-0.002097,-0.001821,-0.002369,-0.001821,0.996692,-0.001821,-0.001821,1.0,-0.001821,-0.002749,...,-0.004887,-0.002424,-0.005811,-0.002109,-0.00221,-0.002993,-0.001821,-0.001821,-0.001821,-0.001821
8,-0.002097,-0.001821,-0.002369,-0.001821,-0.002058,-0.001821,-0.001821,-0.001821,1.0,-0.002749,...,-0.004887,-0.002424,-0.005811,-0.002109,-0.00221,-0.002993,-0.001821,-0.001821,-0.001821,-0.001821
9,0.146028,-0.002749,-0.003576,-0.002749,-0.003106,-0.002749,-0.002749,-0.002749,-0.002749,1.0,...,-0.007376,-0.003658,-0.008771,-0.003183,-0.003335,-0.004518,-0.002749,-0.002749,-0.002749,-0.002749


## Finding similar subreddit to "soccer"

In [16]:
sub_df['similarity'] = subreddit_sim_df.iloc[3]
sub_df.columns = ['subreddit', 'similarity']
sub_df.head()

Unnamed: 0,subreddit,similarity
0,worldnews,-0.002097
1,news,-0.001821
2,reddevils,-0.002369
3,soccer,1.0
4,AskReddit,-0.002058


In [17]:
sub_df.sort_values( ["similarity"], ascending = False )[1:10]

Unnamed: 0,subreddit,similarity
3,soccer,1.0
3325,creepyPMs,1.0
473,Futurology,1.0
1352,dataisbeautiful,0.795478
2726,snapchat,0.445909
4561,MURICA,0.439982
190,holdthemoan,0.331442
1758,OldSchoolCool,0.296249
989,interestingasfuck,0.21246


### That means anyone who subscribe "soccer" , the above top 10 subreddits  can be recommended to him or her.

# Utility function to find similar subreddit

In [18]:
def get_similar_subreddits( subreddit, topN = 5 ):
  sub_df['similarity'] = subreddit_sim_df.iloc[subreddit]
  top_n = sub_df.sort_values( ["similarity"], ascending = False )[0:topN]
  print( "Similar subreddits to: " )
  return top_n

In [19]:
get_similar_subreddits(3)

Similar subreddits to: 


Unnamed: 0,subreddit,similarity
1809,LifeProTips,1.0
3,soccer,1.0
3325,creepyPMs,1.0
473,Futurology,1.0
1352,dataisbeautiful,0.795478


In [20]:
get_similar_subreddits( 4 )

Similar subreddits to: 


Unnamed: 0,subreddit,similarity
4,AskReddit,1.0
4637,Overwatch,0.996692
4707,trailerparkboys,0.996692
7,asoiaf,0.996692
3072,OutOfTheLoop,0.996692


### The item similarity has been able to find out the subreddit that can be recommended. And these recommendations look appropriate to some extent.

## Advantages:
###### Simple and fast.
###### Provides relevant results to some extent.

## disadvantages:
##### Influenced by the activity of similar users.
