### Import libraries.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import praw
import pprint
import urllib.request
import codecs
import re
import multiprocessing
import time
import networkx as nx
import scipy
import sys
import pickle

from IPython.display import display, clear_output
from networkx.drawing.nx_agraph import graphviz_layout as layout

#import joblib
#from joblib import Parallel, delayed
#from queue import Queue

### Import data.

In [None]:
# Load the tab-separated votes file into a DataFrame.
reddit_data = pd.read_csv(
    '../Data/44_million_votes.txt',
    sep='\t',
)

### Variables for matrix construction.

In [None]:
# Keep a single row per (user, subreddit) pair; repeated interactions collapse to one.
subreddit_user_data = reddit_data.loc[:, ['USERNAME', 'SUBREDDIT']].drop_duplicates()

# Distinct subreddits and distinct usernames, each as a plain list.
all_subreddits = list(subreddit_user_data['SUBREDDIT'].unique())
reddit_usernames = list(subreddit_user_data['USERNAME'].unique())

# Map every subreddit to its position in all_subreddits.
subreddit_index = {name: position for position, name in enumerate(all_subreddits)}

### Construct matrices.

In [None]:
# Disk-backed square matrix with one row and one column per subreddit, created
# (or overwritten) as the file 'reddit_adj_mat'. Approx file size ~40 GB.
# NOTE(review): int16 cells top out at 32767; a cell incremented more often
# than that would wrap around. Confirm no subreddit pair (or diagonal entry)
# can reach that count.
reddit_adj_mat = np.memmap(
    'reddit_adj_mat',
    dtype='int16',
    mode='w+',
    shape=(len(all_subreddits), len(all_subreddits)),
)

# For each user add one to every pair of subreddits they voted in.
# Generates a symmetric matrix.
def add_to_adj(user):
    """Increment the adjacency counts for all subreddit pairs of one user.

    Looks up the subreddits recorded for ``user`` in ``subreddit_user_data``,
    translates them to matrix positions through ``subreddit_index`` and adds 1
    to ``reddit_adj_mat[i, j]`` for every pair ``(i, j)`` of those positions,
    including ``i == j``.

    Args:
        user: A value from the USERNAME column of ``subreddit_user_data``.

    Returns:
        None. ``reddit_adj_mat`` is modified in place; a user with no
        matching rows leaves it untouched.

    NOTE(review): the increment is an unsynchronized read-modify-write. When
    several worker processes run this at once on the same memmap, two workers
    touching the same cell could drop an increment. Confirm whether exact
    counts are required.
    """
    is_user_row = subreddit_user_data['USERNAME'].isin([user])
    # Rows with a missing SUBREDDIT value are skipped.
    user_subs = subreddit_user_data.loc[is_user_row, 'SUBREDDIT'].dropna()

    # Direct dictionary lookups keep the work proportional to this user's
    # subreddits instead of scanning the full subreddit list for every user.
    # The set guarantees each position is counted once; sorting keeps the
    # positions in ascending order.
    indices = sorted({subreddit_index[sub] for sub in user_subs})

    for i in indices:
        reddit_adj_mat[i, indices] += 1

print('Start')
start = time.time()

# Start constructing adjacency matrix. Time taken ~4.5 hours with 8 CPUs.
# The pool is used as a context manager so the 8 worker processes are shut
# down as soon as the map has finished instead of lingering in the session.
# NOTE(review): the workers all increment the same memmap without locking;
# simultaneous updates of one cell could lose an increment. Verify the counts
# if exactness matters.
with multiprocessing.Pool(8) as p:
    p.map(add_to_adj, reddit_usernames)
print(time.time() - start)

# Convert dense matrix to sparse matrix. Time taken ~3 min.
# Imported explicitly here: a bare `import scipy` is not guaranteed to expose
# the `sparse` submodule on every SciPy version.
import scipy.sparse
reddit_adj_mat_sp = scipy.sparse.csr_matrix(reddit_adj_mat)
print(time.time() - start)

# Save sparse matrix. Approx file size ~500 MB.
scipy.sparse.save_npz('reddit_adj_mat_sp', reddit_adj_mat_sp)
print(time.time() - start)