# Post-Processing Exploration
**Purpose:** 

This notebook explores different post-processing methods for evaluating the clustering outputs of the recommender system (RecSys).

**Methodology:**

The notebook assumes its input comes from the RecSys. It explores the time series of cluster probabilities to evaluate the dynamics of users, i.e. how they change over time.

**Author:**

Prajna Soni (@prajnasoni)

In [6]:
# Import the libraries we will be using

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import logging

from sklearn import metrics


In [7]:
# Test cluster dataset creation
# Append cluster probabilities in a 3D array [user, cluster_probas, sim_time]
# A is a toy 2-D example: 2 rows x 3 columns, each row summing to 1
# (presumably one row per user and one column per cluster — confirm).
A = np.array([[0.1,0.8,0.1],[0.9,0.0,0.1]])
print(A)

[[0.1 0.8 0.1]
 [0.9 0.  0.1]]


In [8]:
# Check that A is still 2-D before it is promoted to 3-D for stacking.
A.shape

(2, 3)

In [9]:
# Second toy array with the same layout as A (2 rows x 3 columns, rows sum to 1);
# used below as the next slice along the third axis.
B = np.array([[0.3,0.3,0.4],[0.5,0.1,0.4]])
print(B)

[[0.3 0.3 0.4]
 [0.5 0.1 0.4]]


In [10]:
# np.atleast_3d appends a trailing axis of length 1 to a 2-D array: (2, 3) -> (2, 3, 1).
np.atleast_3d(A).shape

(2, 3, 1)

In [11]:
# B must have the same 2-D shape as A for the depth-wise stack below.
B.shape

(2, 3)

In [12]:
# Stack A and B depth-wise so the third axis indexes the snapshot: (2, 3) + (2, 3) -> (2, 3, 2).
# np.dstack itself reshapes 2-D inputs to (M, N, 1) before concatenating,
# so the arrays can be passed in directly.
stack = np.dstack((A, B))
stack.shape

(2, 3, 2)

In [14]:
# Inspect the stacked values: stack[i, j, 0] comes from A[i, j] and stack[i, j, 1] from B[i, j].
stack

array([[[0.1, 0.3],
        [0.8, 0.3],
        [0.1, 0.4]],

       [[0.9, 0.5],
        [0. , 0.1],
        [0.1, 0.4]]])

In [13]:
# Append one more 2-D slice to the existing 3-D stack: (2, 3, 2) + (2, 3) -> (2, 3, 3).
# np.dstack promotes the 2-D input on its own and leaves the 3-D stack unchanged.
np.dstack((stack, B))

array([[[0.1, 0.3, 0.3],
        [0.8, 0.3, 0.3],
        [0.1, 0.4, 0.4]],

       [[0.9, 0.5, 0.5],
        [0. , 0.1, 0.1],
        [0.1, 0.4, 0.4]]])

In [41]:
# Select a single 2-D slice from the third axis of the stack.
# NOTE(review): with stack = dstack(A, B), index 1 selects B, but the recorded output
# below shows A's values (index 0) and the execution count is out of order —
# re-run on a fresh kernel and confirm which index is intended.
stack[:,:,1]

array([[0.1, 0.8, 0.1],
       [0.9, 0. , 0.1]])

In [19]:
import sys
import os
import pandas as pd
import scipy.sparse

### IMPORTANT ###
# The project-local `src.*` imports below only resolve if the repository's
# `code` directory is on sys.path.
# Set the REC_SYS_CODE_DIR environment variable to point at your own checkout;
# when it is unset, the original author's local path is used as the default.
REC_SYS_CODE_DIR = os.environ.get("REC_SYS_CODE_DIR", "/Users/pvs262/Documents/rec-sys-dynamics/code")
if REC_SYS_CODE_DIR not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(REC_SYS_CODE_DIR)
from src.analysis.cluster import movielens, cluster, analysis
from src.algorithm.cosin import CosinSimilarity
import src.Datasets.movielens_recreate as dataset

def getRatingsData():
    """Load the ratings table stored next to the `dataset` module.

    Reads ``ratings.parquet.gzip`` from the directory that contains the
    module imported as ``dataset`` (``src.Datasets.movielens_recreate``).

    Returns:
        The object returned by ``pd.read_parquet`` for that file
        (a pandas DataFrame).
    """
    # os.path.join uses the platform separator and, unlike `dirname + '/...'`,
    # does not produce a root-anchored path when dirname is an empty string.
    ratings_path = os.path.join(os.path.dirname(dataset.__file__), 'ratings.parquet.gzip')
    return pd.read_parquet(ratings_path)

# set up UI matrix
# ("UI" presumably stands for user-item — confirm.)
ratings = getRatingsData()
algo = CosinSimilarity()
algo.fit(ratings)
# rating_matrix_ is handed to from_spmatrix, so it is expected to be a SciPy sparse matrix;
# the result is a DataFrame with sparse columns.
UI = pd.DataFrame.sparse.from_spmatrix(algo.rating_matrix_)

# get probabilities
# NOTE(review): the next line rebinds the name `cluster` from the imported class to an
# instance of it. Re-running this whole cell works because the import above is executed
# again, but any other cell that calls `cluster(...)` afterwards will fail.
cluster = cluster(UI)
# The literal 3 is passed to both svd() and gmm(n=3); their exact meaning is defined in
# src.analysis.cluster (presumably number of components — confirm).
cluster.svd(3)
# In the recorded run, `proba` displays as a two-element list: [DataFrame, None].
proba = cluster.gmm(n=3,covariance_type="full",df='proba')
proba

[     cluster       proba_C0       proba_C1       proba_C2
 1          2  8.011363e-165  8.296268e-110   1.000000e+00
 2          2   2.909687e-89   2.039657e-48   1.000000e+00
 3          2  1.013579e-124   2.420996e-77   1.000000e+00
 4          2  3.090011e-199  4.062939e-131   1.000000e+00
 5          2  5.734951e-156   3.532983e-88   1.000000e+00
 ..       ...            ...            ...            ...
 939        1  5.790513e-193   1.000000e+00  7.487980e-142
 940        1  1.356655e-145   1.000000e+00   2.439556e-99
 941        1  1.436682e-142   1.000000e+00  7.043861e-107
 942        1  2.494901e-159   1.000000e+00  9.180685e-112
 943        1  2.648988e-132   1.000000e+00   2.537075e-88
 
 [943 rows x 4 columns],
 None]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0
2,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0
3,0.0,0.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,5.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0
939,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,...,4.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
940,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0
941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0
