## This file:
- Imports directories; ones with entries are saved as 'non_empty_folders'.
- csv-s with page likes are opened per folder. Only users with at least to likes are retained, and put into dictionaries as values where keys are the approprite page_ids. 
- Three lists of sparse matrix parameters are build: [row indices], [column indices], [matrix elements]. These lists are needed to build an upper triangular sparse matrix, then a symmetric matrix. 

In [1]:
import glob
import os, io
import pandas as pd
import numpy as np
from numpy import genfromtxt
import csv
import pickle
from collections import Counter
import time
from scipy.sparse import coo_matrix

In [17]:
directory = "C://FOLDERS//CEU//Capstone Project//Data//nagykategek"

In [18]:
folders = os.listdir(directory)

In [19]:
len(folders)

11879

<br>
### Filtering out empty folders

In [20]:
empty = 0
not_empty = 0
empty_folders = []
not_empty_folders = []

for i in range(0,len(folders)):
    path = directory + "//" + folders[i] + "//*.csv"
    files = glob.glob(path)
    if len(files) == 0:
        empty = empty +1
        empty_folders.append(folders[i])
    else:
        not_empty = not_empty + 1
        not_empty_folders.append(folders[i])

print("Empty folders: ", empty, ".")
print("Not empty folders: ", not_empty, ".")

Empty folders:  3136 .
Not empty folders:  8743 .


<br>
### Extracting likes

In [27]:
users = []
active_users = []
likes = glob.glob(directory + "//" + not_empty_folders[20] + "//*.csv")
for j in range(len(likes)):
    df = pd.read_csv(likes[j], delimiter= ";", names = ["page_id", "post_id", "date_stamp", "user_id", "like"])
    l = df["user_id"].tolist()
    users.extend(l)
O = Counter(users)
for k in O.keys():
    if O[k] > 1:
        active_users.append(k)

In [29]:
print(len(active_users), " ", len(users))

7434   55651


In [30]:
page_likes = {}

In [31]:
t1 = time.time()

for i in range(0, len(not_empty_folders)):
    users = []                    # all users who have liked the page
    active_users = []             # users who have liked the page at least twice
    likes = glob.glob(directory + "//" + not_empty_folders[i] + "//*.csv")
    for j in range(0, len(likes)):
        df = pd.read_csv(likes[j], delimiter= ";", names = ["page_id", "post_id", "date_stamp", "user_id", "like"])
        l = df["user_id"].tolist()
        users.extend(l)
    O = Counter(users)
    for k in O.keys():
        if O[k] > 1:
            active_users.append(k)
    key = not_empty_folders[i]
    page_likes[key] = active_users

print(time.time() - t1)

2116.3490059375763


Some pages don't have a single user who liked them at least twice. In these keys the page_likes dictionary has an empty list as value, so that *len(page_likes[key]) = 0*. Filtering out these keys gives us a new dictionary with those pages where there is at least one user who liked them. 

In [24]:
active_users_page_likes = {}

for key in page_likes.keys():
    if len(page_likes[key]) > 0:
        active_users_page_likes[key] = page_likes[key]

len(active_users_page_likes)

8394

In [25]:
f = open("C:\\FOLDERS\\CEU\\Capstone Project\\Data\\active_users_page_likes.pkl","wb") # dict{page: users with at least 2 click}
pickle.dump(active_users_page_likes,f)
f.close()

In [26]:
active_page_ids = []

for key in active_users_page_likes.keys():
    active_page_ids.append(key)
    
len(active_page_ids)

8394

In [28]:
np.savetxt("C:\\FOLDERS\\CEU\\Capstone Project\\Data\\active_page_ids.csv", active_page_ids, delimiter=",", fmt='%s')

<br>
### Building the sparse matrices

In [2]:
# Open pikle file as dictionary. Sort keys and save the values in a dictionary sorted by keys. 

f = open("C:\\FOLDERS\\CEU\\Capstone Project\\Data\\active_users_page_likes.pkl", "rb")
temp = pickle.load(f)
f.close()


d = sorted(temp.keys())

page_likes = {}

for i in range(0, len(d)):
    key = d[i]
    page_likes[key] = temp[key]

In [3]:
# Build the sparse matrix with the number of common likers. Users should have liked both pages at least twice. 

r = 0 # the index of the row of the matrix
c = 0 # the index of the columns of the matrix
row_indices = []
column_indices = []
matrix_elements = []

t1 = time.time()

for key_r in page_likes.keys(): # key_r is the key for the rows
    x = page_likes[key_r]
    for key_c in page_likes.keys(): # key_c is the key for the columns
        if c >= r:
            y = page_likes[key_c]
            common_set = set(x)&set(y)
            common_set_size = len(common_set)
            if common_set_size > 0:
                row_indices.append(r)
                column_indices.append(c)
                matrix_elements.append(common_set_size)
        c = c + 1
    r = r + 1
    c = 0

print((time.time() - t1) / 3600, "hours, ", ((time.time() - t1 - ((time.time() - t1) / 3600)) / 60, "minutes."))

np.savetxt('C:\\FOLDERS\\CEU\\Capstone Project\\Data\\active_users_mx_row_indices.csv', row_indices, delimiter=',')
np.savetxt('C:\\FOLDERS\\CEU\\Capstone Project\\Data\\active_users_mx_column_indices.csv', column_indices, delimiter=',')
np.savetxt('C:\\FOLDERS\\CEU\\Capstone Project\\Data\\active_users_mx_elements.csv', matrix_elements, delimiter=',')

8.273384943736923 hours,  (496.2652068751532, 'minutes.')


The size of the matrix is the number of diagonal elements, that is where row and column indices are equal. (This is also the length of the two dictionaries of page ids and page likes.)

In [4]:
size = 0

for i in range(len(row_indices)):
    if row_indices[i] == column_indices[i]:
        size += 1
size

8394

In [5]:
# Expand the vectors of upper triangle sparse matrix tovectors of the complete symmetric matrix.

row_indices_2 = row_indices
column_indices_2 = column_indices
matrix_elements_2 = matrix_elements

for i in range(0, len(row_indices)):
    if row_indices[i] != column_indices[i]:
        column_indices_2.append(row_indices[i])
        row_indices_2.append(column_indices[i])
        matrix_elements_2.append(matrix_elements[i])

In [6]:
# Build the sparse matrix object. 

row2  = np.array(row_indices_2)
col2  = np.array(column_indices_2)
val2 = np.array(matrix_elements_2)
mx = coo_matrix((val2, (row2, col2)), shape=(size, size))

In [7]:
# Save sparse matrix as npz

from scipy.sparse import save_npz
save_npz('C:\\FOLDERS\\CEU\\Capstone Project\\Data\\active_users_sparse_matrix.npz', mx)

In [8]:
print("Ratio of non-empty relationships is ", round(len(val2) / 8394**2*100, 1), " percent out of the total.")

Ratio of non-empty relationships is  35.7  percent out of the total.
