This notebook continues from VP Gilbert's output in clustering. I just modified his outputs in functions __u_cluster__ and __i_cluster__ to include the dataframes. The addditions to his notebook start from function __cluster_assignment__.

September 21, 2021

In [1]:
fname = 'user_movie.csv'
state = 1337
drop_cols = ['userId']

u_clusters = 15
i_clusters = 30

# User Clustering

In [2]:
def u_cluster(fname, model_fname, random_state=state, drop_cols=drop_cols, 
              u_clusters=20, u_method='kmeans', **kwargs):
    """
    pre_cluster

    Perform item-wise and user-wise clustering


    fname        : pandas DataFrame or string
                   The initial utility matrix with each row corresponding 
                   to a user and the columns be 

    random_state : int
                   The state to be used by the clustering algorithm to ensure
                   the consistency of results across runs

    drop_cols    : list
                   Columns to be dropped in fname

    u_clusters   : int
                   Number of clusters to be used for hard clustering of users

    Returns
    -------
    utility_matrix : pandas DataFrame
    """
    import pandas as pd
    import numpy as np
    from sklearn.cluster import (KMeans, SpectralClustering,
                                 AgglomerativeClustering, DBSCAN, OPTICS,
                                 cluster_optics_dbscan, Birch)
    import pickle
    # Aggregation through tables

    if isinstance(fname, str):
        df = pd.read_csv(fname)
    else:
        df = fname

    if drop_cols != None:
        df = df.drop(columns=drop_cols)

    if u_method == 'kmeans':
        u_clusterer = KMeans(n_clusters=u_clusters, random_state=state)
    if u_method == 'spectral':
        u_clusterer = SpectralClustering(u_clusters, random_state=state)
    if u_method == 'ward':
        u_clusterer = AgglomerativeClustering(n_clusters=u_clusters,
                                              **kwargs)
    if u_method == 'single':
        u_clusterer = AgglomerativeClustering(n_clusters=u_clusters,
                                              linkage='single', **kwargs)
    if u_method == 'complete':
        u_clusterer = AgglomerativeClustering(n_clusters=u_clusters,
                                              linkage='complete', **kwargs)
    if u_method == 'average':
        u_clusterer = AgglomerativeClustering(n_clusters=u_clusters,
                                              linkage='average', **kwargs)
    if u_method == 'dbscan':
        u_clusterer = DBScan(**kwargs)
    if u_method == 'optics':
        u_clusterer = OPTICS(**kwargs)
    if u_method == 'birch':
        u_clusterer = Birch(n_clusters=u_clusters, **kwargs)

    u_predict = u_clusterer.fit_predict(df)
    df['u_cluster'] = u_predict

    model = u_clusterer
    result = dict(df['u_cluster'])
    with open(model_fname,'wb') as f:
        pickle.dump(model, f)
    return model, result, df

# Item Clustering

In [3]:
def i_cluster(fname, model_fname, random_state=state, drop_cols=drop_cols,
              i_clusters=20, i_method='kmeans', **kwargs):
    """
    pre_cluster

    Perform item-wise and user-wise clustering


    fname        : pandas DataFrame or string
                   The initial utility matrix with each row corresponding 
                   to a user and the columns be 

    random_state : int
                   The state to be used by the clustering algorithm to ensure
                   the consistency of results across runs

    drop_cols    : list
                   Columns to be dropped in fname

    i_clusters   : int
                   Number of clusters to be used for hard clustering of items

    Returns
    -------
    model         : sklearn model

    result        : dict

    """
    
    
    
    import pandas as pd
    import numpy as np
    from sklearn.cluster import (KMeans, SpectralClustering, 
                                 AgglomerativeClustering, DBSCAN, OPTICS, 
                                 cluster_optics_dbscan, Birch)
    import pickle
    if isinstance(fname, str):
        df = pd.read_csv(fname)
    else:
        df = fname

    if drop_cols != None:
        df = df.drop(columns=drop_cols)

    df_items = df.T

    if i_method == 'kmeans':
        i_clusterer = KMeans(n_clusters=i_clusters, random_state=state)
    if i_method == 'spectral':
        i_clusterer = SpectralClustering(i_clusters, random_state=state)
    if i_method == 'ward':
        i_clusterer = AgglomerativeClustering(n_clusters=i_clusters,
                                              **kwargs)
    if i_method == 'single':
        i_clusterer = AgglomerativeClustering(n_clusters=i_clusters,
                                              linkage='single', **kwargs)
    if i_method == 'complete':
        i_clusterer = AgglomerativeClustering(n_clusters=i_clusters,
                                              linkage='complete', **kwargs)
    if i_method == 'average':
        i_clusterer = AgglomerativeClustering(n_clusters=i_clusters,
                                              linkage='average', **kwargs)
    if i_method == 'dbscan':
        i_clusterer = DBScan(**kwargs)
    if i_method == 'optics':
        i_clusterer = OPTICS(**kwargs)
    if i_method == 'birch':
        i_clusterer = Birch(n_clusters=i_clusters, **kwargs)

    i_predict = i_clusterer.fit_predict(df_items)
    df_items['i_cluster'] = i_predict

    model = i_clusterer
    result = dict(df_items['i_cluster'])
    with open(model_fname,'wb') as f:
        pickle.dump(model, f)
    return model, result, df_items

In [4]:
x_u,y_u, df_u = u_cluster(fname,'u_cluster.pkl', u_method='ward')
x_i,y_i, df_i = i_cluster(fname,'i_cluster.pkl', u_method='ward')

# Cluster Assignments

In [11]:
def cluster_assignment(cluster_res, col='user_id'):
    """
    Converts the dictionary containing user_id and user_cluster assignment  
    to a pandas DataFrame.

    cluster_res : dictionary
                  Result from clustering function with keys being the
                  user_id and values their cluster membership

    col         : string
                  Column name of the user or item

    Returns
    -------
    result      : pandas DataFrame
                  Two columns representing the user/item and their 
                  corresponding cluster assignments
    """
    import pandas as pd

    if col == 'user_id':
        cluster_name = 'ucluster'
    else:
        cluster_name = 'icluster'

    c_assignment = pd.DataFrame(list(dictionary.items()),
                                columns=[data_name, cluster_name])
    c_assignment.set_index(data_name, inplace=True)
    return c_assignment

In [12]:
uc_assignment = cluster_assignment(y_u, data_name='user_id')
ic_assignment = cluster_assignment(y_i, data_name='item_id')

In [14]:
ic_assignment

Unnamed: 0_level_0,icluster
item_id,Unnamed: 1_level_1
1,3
2,10
3,2
4,1
5,2
...,...
193581,1
193583,1
193585,1
193587,1


# Aggregation

In [8]:
def util_mat_agg(df_u, df_i, u_agg='sum', i_agg='sum'):
    """
    Aggregates the results of the clustering with respect to item clusters and user clusters.
    
    Parameters
    ----------
    df_u    : pandas DataFrame
              Dataframe of the user_ids with cluster assignments and
              item ratings of users from the original matrix
    
    df_i    : pandas DataFrame
              Dataframe of the item_ids with cluster assignments and 
              item ratings of users from the original matrix
    
    u_agg   : str
              Aggregration method to be used for users through 'sum' or 'mean' 
    
    i_agg   : str
              Aggregration method to be used for items through 'sum' or 'mean' 
              
    Returns
    -------
    util_mat   : pandas DataFrame
                 utility matrix consisting of the aggregrated user 
                 clusters as rows and aggregated item clusters as columns
    """
    import numpy as np
    import pandas as pd

    u_series = df_u['u_cluster']
    i_series = df_i['i_cluster']

    u_ids = np.unique(i_series.values)
    i_ids = np.unique(i_series.values) 

    u_feats = {}
    for u_id in u_ids: #u_ids are clusters of u_id
        sub_df = df_u.groupby('u_cluster').get_group(
            u_id).drop(columns=['u_cluster']).T
        sub_df = sub_df.merge(i_series, left_index=True, right_index=True)

        if u_agg == 'sum':
            df_grp = sub_df.groupby('i_cluster').sum()
        if u_agg == 'mean':
            df_grp = sub_df.groupby('i_cluster').mean()
        if not isinstance(u_agg,str):
            df_grp = sub_df.groupby('i_cluster').apply(u_agg)

        if i_agg == 'sum':
            df_grp = df_grp.sum(axis=1)
        if i_agg == 'mean':
            df_grp = df_grp.mean(axis=1)
        if not isinstance(i_agg,str):
            df_grp = df_grp.apply(i_agg, axis=1)

        u_feats[u_id] = df_grp
    

    u_matrix = pd.DataFrame()
    for k, v in u_feats.items():
        u_matrix = u_matrix.merge(v.rename(k), how='outer',
                                  left_index=True, right_index=True)

    util_mat = u_matrix.fillna(0).T
    util_mat.index.rename('u_cluster', inplace=True)
    return util_mat

In [9]:
utility_matrix_agg(df_u, df_i, 'sum', 'sum')

i_cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
u_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,96,174,256,60,58,94,53,223,247,30,56,28,33,180,89,80,142,63,18,584
1,205,1074,224,278,254,528,431,334,627,133,122,131,239,423,406,200,985,146,105,134
2,377,423,124,58,61,5,0,47,110,25,35,166,24,214,32,54,0,94,8,63
3,1190,660,762,304,164,7,0,19,96,142,329,322,106,1046,38,307,1,304,17,225
4,582,554,1059,345,291,258,83,693,1131,186,407,322,101,883,462,502,85,301,110,415
5,761,1359,601,1047,1139,1082,927,623,1452,629,409,499,332,1027,1116,625,1017,296,483,276
6,36,306,23,28,29,71,40,104,126,13,5,19,38,42,42,10,158,86,9,117
7,1777,1065,815,734,440,64,32,196,471,353,477,822,115,1682,201,616,45,330,80,250
8,295,501,214,31,36,85,41,311,155,17,53,260,61,154,43,49,93,198,9,92
9,545,1031,756,628,558,397,266,669,1104,345,473,379,166,916,728,694,309,181,211,231
