In [1]:
# Load a small preview of the dataset: only the first 10 rows of reddit.csv.
# The first column is labelled 'user', the remaining columns are numbered 0-24,
# and missing cells are replaced by empty strings.
import pandas as pd
df_cols = ['user', *range(25)]
df = pd.read_csv(
    "reddit.csv",
    names=df_cols,
    nrows=10,
).fillna('')

In [2]:
# Clean and store subreddits and user ids as integers.
#
# Every line of reddit.csv is split on commas; the first field is dropped and
# the remaining fields are treated as subreddit names. The 0-based line number
# serves as the user id. The result is three structures:
#   user_ids        - one entry per (user, subreddit) pair: the user's id
#   subreddit_ids   - parallel to user_ids: the subreddit's integer id
#   subreddit_to_id - subreddit name -> integer id, in first-seen order
user_ids, subreddit_ids, subreddit_to_id = [], [], {}
counter = 0  # number of lines (users) consumed so far; ends up as the user count
# errors='ignore' silently drops bytes that are not valid UTF-8.
with open('reddit.csv', 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        fields = line.rstrip().split(',')[1:]

        # Strip stray whitespace and de-duplicate while preserving first-seen
        # order, then discard blank fields. Without this, an empty field would
        # be registered as a subreddit named '' and a subreddit repeated on one
        # line would be recorded twice for the same user.
        names = dict.fromkeys(field.strip() for field in fields)
        names.pop('', None)

        for sr in names:
            # setdefault hands out the next free id the first time a name is seen.
            subreddit_ids.append(subreddit_to_id.setdefault(sr, len(subreddit_to_id)))
            user_ids.append(counter)

        counter += 1

In [3]:
# Build the sparse subreddit-by-user matrix: one row per subreddit id, one
# column per user id, with a 1.0 contributed by every recorded
# (subreddit, user) pair.
import numpy as np
from scipy.sparse import csr_matrix

n_rows, n_cols = len(subreddit_to_id), counter
rows, cols = np.array(subreddit_ids), np.array(user_ids)
data = np.ones((len(user_ids),))

adj = csr_matrix((data, (rows, cols)), shape=(n_rows, n_cols))

# Report the matrix dimensions.
for label, size in zip(('Subreddit Count:', 'User Count:'), adj.shape):
    print(label, size)

Subreddit Count: 15122
User Count: 876961


In [4]:
# Subreddit names, in the order their ids were assigned (dict insertion order)
subreddits = list(subreddit_to_id)

# Row sums of the matrix as a flat 1-D array: one total per subreddit
users_per_subreddit = np.asarray(adj.sum(axis=1)).ravel()

# The integer ids 0 .. n_subreddits-1 as a numpy array
subreddit_keys = np.array(list(range(len(subreddit_to_id))))

In [5]:
# Perform dimensionality reduction on extremely sparse matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

# TruncatedSVD's default solver is randomized, so the seed is pinned; without
# it every run produces a different embedding (and a different map downstream).
svd = TruncatedSVD(n_components=100, random_state=42)

# One 100-component row per row of adj, rescaled so that the absolute values
# in each row sum to 1 (L1 norm).
emb = normalize(svd.fit_transform(adj), norm='l1')

print('Dimensions reduced:', adj.shape[1], '->', emb.shape[1])

Dimensions reduced: 876961 -> 100


In [6]:
# Create a row selector that filters popular subreddits only: rows whose
# total in users_per_subreddit exceeds MIN_USERS. np.where returns a tuple of
# index arrays, which can be used directly to index any per-subreddit array.
MIN_USERS = 120
row_selector = np.where(users_per_subreddit > MIN_USERS)

# Perform t-SNE to flatten high dimensional map to 2D
from sklearn.manifold import TSNE
# t-SNE's optimisation involves randomness; a fixed seed keeps the layout
# identical from one run to the next.
tsne_map = TSNE(n_components=2, random_state=42).fit_transform(emb[row_selector])

In [7]:
# Utilize K-Means to cluster data
from scipy.stats import rankdata
# Replace every embedding value by its rank within its own column, so each
# column of embedded_ranks holds ranks instead of raw component values.
embedded_ranks = np.apply_along_axis(rankdata, 0, emb)

from sklearn.cluster import KMeans
n_clusters = 20
# K-Means starts from randomly chosen centroids; a fixed seed keeps the
# cluster labels (and therefore the plot colours) stable between runs.
km = KMeans(n_clusters=n_clusters, random_state=42)
# One cluster label per row of emb, i.e. for every subreddit, not only the
# popular ones selected for plotting.
clusters = km.fit_predict(embedded_ranks)

In [8]:
# Colour palette for the plot: 22 hex colour strings held in a numpy array so
# that it can be indexed with an array of integer labels.
colors = np.array("""
    #e6194b #3cb44b #ffe119 #0082c8 #f58231
    #911eb4 #46f0f0 #f032e6 #d2f53c #fabebe
    #008080 #e6beff #aa6e28 #fffac8 #800000
    #aaffc3 #808000 #ffd8b1 #000080 #808080
    #000000 #F87666
""".split())

In [14]:
# Silence every warning from this point on
import warnings
warnings.simplefilter('ignore')

# Hold the subreddit names in a numpy array so they support array-style
# indexing (needed when selecting the plotted subset)
subreddits = np.asarray(subreddits)

In [15]:
# Use bokeh library to visualize relationships between subreddits and user interests
import bokeh.plotting as bp
from bokeh.models import HoverTool
from bokeh.io import output_notebook
output_notebook()

# All per-point values live in a single ColumnDataSource and the glyph refers
# to them by column name. Passing raw arrays next to `source=` is rejected by
# newer Bokeh releases, which require every sequence-valued glyph argument to
# come from the source.
plot_source = bp.ColumnDataSource({
    "x": tsne_map[:, 0],
    "y": tsne_map[:, 1],
    # colour chosen by the cluster label of each plotted subreddit
    "color": colors[clusters[row_selector]],
    # marker radius grows with log2 of the subreddit's user total
    "radius": np.log2(users_per_subreddit[row_selector]) / 60,
    # asarray keeps the indexing valid even if subreddits is still a list
    "subreddit": np.asarray(subreddits)[row_selector],
})

# `width`/`height` and the `save` tool are the current spellings of the older
# `plot_width`/`plot_height` and `previewsave`.
fig = bp.figure(width=1000, height=700, title="Subreddit Map by t-SNE",
                tools="pan, wheel_zoom, box_zoom, reset, hover, save",
                x_axis_type=None, y_axis_type=None, min_border=1)
fig.scatter(
    x="x",
    y="y",
    color="color",
    radius="radius",
    source=plot_source,
)

# Show the subreddit name when hovering over a point.
ttlabel = {"/r/": "@subreddit"}
hover = fig.select(dict(type=HoverTool))
hover.tooltips = ttlabel
bp.show(fig)