In [1]:
# Shared settings for the clustering cells below.
state = 1337                    # default `random_state` of u_cluster / i_cluster
fname = 'Data/user_movie.csv'   # CSV file handed to the clustering functions
drop_cols = ['userId']          # default columns removed before clustering

# Cluster counts for the user-wise and item-wise clusterings.
u_clusters, i_clusters = 15, 30

# User Clustering

In [6]:
def u_cluster(fname, model_fname, random_state=state, drop_cols=drop_cols,
              u_clusters=20, u_method='kmeans', **kwargs):
    """
    u_cluster

    Perform user-wise (row-wise) hard clustering of a utility matrix and
    pickle the fitted clusterer to disk.

    Parameters
    ----------
    fname        : pandas DataFrame or string
                   The initial utility matrix with each row corresponding
                   to a user (columns presumably correspond to items), or
                   the path of a CSV file holding it. A DataFrame passed in
                   is never modified.

    model_fname  : string
                   Path of the file the fitted clusterer is pickled to

    random_state : int
                   The state to be used by the clustering algorithm to ensure
                   the consistency of results across runs. Only the 'kmeans'
                   and 'spectral' methods receive it.

    drop_cols    : list or None
                   Columns to be dropped in fname before clustering;
                   None keeps every column

    u_clusters   : int
                   Number of clusters to be used for hard clustering of users.
                   Not passed to the 'dbscan' and 'optics' methods.

    u_method     : string
                   Clustering algorithm: 'kmeans', 'spectral', 'ward',
                   'single', 'complete', 'average', 'dbscan', 'optics'
                   or 'birch'

    **kwargs     : Extra keyword arguments forwarded to the sklearn estimator
                   for every method except 'kmeans' and 'spectral'

    Returns
    -------
    model        : sklearn clusterer
                   The fitted clustering estimator (also pickled to
                   model_fname)

    result       : dict
                   Maps each row label of the clustered matrix to the
                   cluster label assigned to that row

    Raises
    ------
    ValueError
        If u_method is not one of the supported names.
    """
    import pickle

    import pandas as pd
    from sklearn.cluster import (KMeans, SpectralClustering,
                                 AgglomerativeClustering, DBSCAN, OPTICS,
                                 Birch)

    # Build the estimator first so an unsupported method fails before any
    # data is loaded.
    if u_method == 'kmeans':
        u_clusterer = KMeans(n_clusters=u_clusters,
                             random_state=random_state)
    elif u_method == 'spectral':
        u_clusterer = SpectralClustering(n_clusters=u_clusters,
                                         random_state=random_state)
    elif u_method == 'ward':
        # No explicit linkage: the estimator's own default applies unless
        # the caller overrides it through kwargs.
        u_clusterer = AgglomerativeClustering(n_clusters=u_clusters,
                                              **kwargs)
    elif u_method in ('single', 'complete', 'average'):
        u_clusterer = AgglomerativeClustering(n_clusters=u_clusters,
                                              linkage=u_method, **kwargs)
    elif u_method == 'dbscan':
        u_clusterer = DBSCAN(**kwargs)
    elif u_method == 'optics':
        u_clusterer = OPTICS(**kwargs)
    elif u_method == 'birch':
        u_clusterer = Birch(n_clusters=u_clusters, **kwargs)
    else:
        raise ValueError(
            "Unknown u_method {!r}; expected one of 'kmeans', 'spectral', "
            "'ward', 'single', 'complete', 'average', 'dbscan', 'optics', "
            "'birch'".format(u_method))

    if isinstance(fname, str):
        df = pd.read_csv(fname)
    else:
        df = fname

    if drop_cols is not None:
        df = df.drop(columns=drop_cols)

    u_predict = u_clusterer.fit_predict(df)

    # Keep the labels in a separate Series instead of adding a column to df:
    # when drop_cols is None, df is the caller's own DataFrame, and an added
    # 'u_cluster' column would leak into the features on a re-run.
    result = dict(pd.Series(u_predict, index=df.index, name='u_cluster'))

    with open(model_fname, 'wb') as f:
        pickle.dump(u_clusterer, f)
    return u_clusterer, result

# Item Clustering

In [9]:
def i_cluster(fname, model_fname, random_state=state, drop_cols=drop_cols,
              i_clusters=20, i_method='kmeans', **kwargs):
    """
    i_cluster

    Perform item-wise (column-wise) hard clustering of a utility matrix and
    pickle the fitted clusterer to disk. The matrix is transposed so that
    each of its columns becomes one sample for the clusterer.

    Parameters
    ----------
    fname        : pandas DataFrame or string
                   The initial utility matrix with each row corresponding
                   to a user (columns presumably correspond to items), or
                   the path of a CSV file holding it. A DataFrame passed in
                   is never modified.

    model_fname  : string
                   Path of the file the fitted clusterer is pickled to

    random_state : int
                   The state to be used by the clustering algorithm to ensure
                   the consistency of results across runs. Only the 'kmeans'
                   and 'spectral' methods receive it.

    drop_cols    : list or None
                   Columns to be dropped in fname before clustering;
                   None keeps every column

    i_clusters   : int
                   Number of clusters to be used for hard clustering of items.
                   Not passed to the 'dbscan' and 'optics' methods.

    i_method     : string
                   Clustering algorithm: 'kmeans', 'spectral', 'ward',
                   'single', 'complete', 'average', 'dbscan', 'optics'
                   or 'birch'

    **kwargs     : Extra keyword arguments forwarded to the sklearn estimator
                   for every method except 'kmeans' and 'spectral'

    Returns
    -------
    model        : sklearn clusterer
                   The fitted clustering estimator (also pickled to
                   model_fname)

    result       : dict
                   Maps each remaining column label of the matrix to the
                   cluster label assigned to that column

    Raises
    ------
    ValueError
        If i_method is not one of the supported names.
    """
    import pickle

    import pandas as pd
    from sklearn.cluster import (KMeans, SpectralClustering,
                                 AgglomerativeClustering, DBSCAN, OPTICS,
                                 Birch)

    # Build the estimator first so an unsupported method fails before any
    # data is loaded.
    if i_method == 'kmeans':
        i_clusterer = KMeans(n_clusters=i_clusters,
                             random_state=random_state)
    elif i_method == 'spectral':
        i_clusterer = SpectralClustering(n_clusters=i_clusters,
                                         random_state=random_state)
    elif i_method == 'ward':
        # No explicit linkage: the estimator's own default applies unless
        # the caller overrides it through kwargs.
        i_clusterer = AgglomerativeClustering(n_clusters=i_clusters,
                                              **kwargs)
    elif i_method in ('single', 'complete', 'average'):
        i_clusterer = AgglomerativeClustering(n_clusters=i_clusters,
                                              linkage=i_method, **kwargs)
    elif i_method == 'dbscan':
        i_clusterer = DBSCAN(**kwargs)
    elif i_method == 'optics':
        i_clusterer = OPTICS(**kwargs)
    elif i_method == 'birch':
        i_clusterer = Birch(n_clusters=i_clusters, **kwargs)
    else:
        raise ValueError(
            "Unknown i_method {!r}; expected one of 'kmeans', 'spectral', "
            "'ward', 'single', 'complete', 'average', 'dbscan', 'optics', "
            "'birch'".format(i_method))

    if isinstance(fname, str):
        df = pd.read_csv(fname)
    else:
        df = fname

    if drop_cols is not None:
        df = df.drop(columns=drop_cols)

    # Transpose so that every column of the matrix is a sample to cluster.
    df_items = df.T

    i_predict = i_clusterer.fit_predict(df_items)

    # Labels are keyed by the transposed frame's index, i.e. the original
    # column labels.
    result = dict(pd.Series(i_predict, index=df_items.index,
                            name='i_cluster'))

    with open(model_fname, 'wb') as f:
        pickle.dump(i_clusterer, f)
    return i_clusterer, result

In [10]:
# Fit and pickle the user-wise and item-wise clusterers using Ward linkage.
# NOTE(review): the u_clusters / i_clusters settings defined in the config
# cell are not passed here, so both calls use the functions' default of 20
# clusters -- confirm this is intended.
x_u, y_u = u_cluster(fname, 'u_cluster.pkl', u_method='ward')
# i_cluster selects its algorithm through `i_method`; passing `u_method`
# would be swallowed by **kwargs and silently fall back to k-means.
x_i, y_i = i_cluster(fname, 'i_cluster.pkl', i_method='ward')

# Code for aggregating

In [None]:
# Aggregation modes read by the commented-out aggregation draft below:
# 'sum', 'mean', or a callable that is applied to the grouped data.
u_agg = 'sum'
i_agg = 'sum'

#     print('Appended User Clusters')
#     display(df)
    
#     print('Appended Item Clusters')
#     display(df_items)

#     u_ids = list(range(u_clusters))
#     i_ids = list(range(i_clusters))

#     u_series = df['u_cluster']
#     i_series = df_items['i_cluster']

#     u_feats = {}
#     for u_id in u_ids:
#         sub_df = df.groupby('u_cluster').get_group(
#             u_id).drop(columns=['u_cluster']).T
#         sub_df = sub_df.merge(i_series, left_index=True, right_index=True)
        
#         display(sub_df)

#         if u_agg == 'sum':
#             df_grp = sub_df.groupby('i_cluster').sum()
#         if u_agg == 'mean':
#             df_grp = sub_df.groupby('i_cluster').mean()
#         if not isinstance(u_agg,str):
#             df_grp = sub_df.groupby('i_cluster').apply(u_agg)
            
#         print(u_id, 'User Cluster')
#         display(df_grp)
            
#         if i_agg == 'sum':
#             df_grp = df_grp.sum(axis=1)
#         if i_agg == 'mean':
#             df_grp = df_grp.mean(axis=1)
#         if not isinstance(i_agg,str):
#             df_grp = df_grp.apply(i_agg, axis=1)
            
#         print(u_id, "User Cluster's Features")
#         display(df_grp)

#         u_feats[u_id] = df_grp

#     u_matrix = pd.DataFrame()
#     for k, v in u_feats.items():
#         u_matrix = u_matrix.merge(v.rename(k), how='outer',
#                                   left_index=True, right_index=True)

#     utility_matrix = u_matrix.fillna(0).T
#     utility_matrix.index.rename('u_cluster', inplace=True)
#     print('Result')
#     return utility_matrix