In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import networkx as nx
import json
import numpy as np

In [None]:
%load_ext autotime

In [None]:
path = r'D:\Yelp\yelp_academic_dataset_user.json'

In [None]:
data = []
for line in open(path, 'r', encoding="utf-8"):
    data.append(json.loads(line))

In [None]:
df = pd.DataFrame(data)

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df = df.set_index('user_id')

In [None]:
def create_directed_graph(df, n):
    """Build a directed friendship graph from the first ``n`` rows of ``df``.

    The index label of a row is taken as the user id and its ``friends``
    value is read as a comma-separated string of friend ids (spaces are
    stripped). Every friendship is inserted in both directions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by user id with a string column ``friends``.
    n : int
        Number of rows to process, counted from the top. The loop stops as
        soon as the row counter equals ``n``; a value the counter never
        reaches (e.g. larger than ``len(df)``) processes every row.

    Returns
    -------
    networkx.DiGraph
        Graph containing every processed user and every friend id they list.
    """
    G = nx.DiGraph()
    # Only the ``friends`` column is needed, so iterate that Series directly
    # instead of materialising a full row object per user via ``iterrows``.
    for i, (user_id, friends_raw) in enumerate(df['friends'].items()):
        if i == n:
            break
        # Register the user explicitly so that a user whose friend list is
        # empty still appears in the graph.
        G.add_node(user_id)
        for friend in friends_raw.replace(' ', '').split(','):
            if not friend:
                # ''.split(',') yields [''] (and a trailing comma yields an
                # empty token); those are not user ids and must not become
                # a shared bogus '' node.
                continue
            G.add_edge(user_id, friend)
            G.add_edge(friend, user_id)
    print('Nodes: ' + str(len(G.nodes)))
    return G

In [None]:
def create_graph(df, n):
    """Build an undirected friendship graph from the first ``n`` rows of ``df``.

    The index label of a row is taken as the user id and its ``friends``
    value is read as a comma-separated string of friend ids (spaces are
    stripped). One undirected edge is added per listed friendship.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by user id with a string column ``friends``.
    n : int
        Number of rows to process, counted from the top. The loop stops as
        soon as the row counter equals ``n``; a value the counter never
        reaches (e.g. larger than ``len(df)``) processes every row.

    Returns
    -------
    networkx.Graph
        Graph containing every processed user and every friend id they list.
    """
    G = nx.Graph()
    # Only the ``friends`` column is needed, so iterate that Series directly
    # instead of materialising a full row object per user via ``iterrows``.
    for i, (user_id, friends_raw) in enumerate(df['friends'].items()):
        if i == n:
            break
        # Register the user explicitly so that a user whose friend list is
        # empty still appears in the graph.
        G.add_node(user_id)
        for friend in friends_raw.replace(' ', '').split(','):
            if not friend:
                # ''.split(',') yields [''] (and a trailing comma yields an
                # empty token); those are not user ids and must not become
                # a shared bogus '' node.
                continue
            G.add_edge(user_id, friend)
    print('Nodes: ' + str(len(G.nodes)))
    return G

In [None]:
def build_clique_graph(cliques):
    """Build an undirected graph in which every given clique is fully connected.

    Parameters
    ----------
    cliques : iterable of sequences
        Each item is a collection of node ids; an edge is added between every
        pair of nodes inside the same collection. A collection with fewer
        than two nodes contributes nothing.

    Returns
    -------
    networkx.Graph
    """
    G = nx.Graph()

    for clique in cliques:
        # The graph is undirected, so each unordered pair needs exactly one
        # insertion; iterating ordered permutations and adding both
        # directions re-inserted every edge four times.
        for u, v in itertools.combinations(clique, 2):
            G.add_edge(u, v)
    print('Nodes: ' + str(len(G.nodes)))
    return G

In [None]:
def get_not_existing_users(df):
    """Return the friend ids that are referenced in ``df`` but have no row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by user id with a string column ``friends`` holding a
        comma-separated list of friend ids (spaces are stripped).

    Returns
    -------
    list
        Ids that occur in some ``friends`` list but not in ``df.index``.
        The order is unspecified because the result comes from a set.
    """
    existing_user_ids = set(df.index.values)

    # Every id mentioned in any friend list; empty tokens (from an empty
    # string or a trailing comma) are not ids and are dropped.
    referenced_user_ids = {
        friend
        for friends in df.friends.values
        for friend in friends.replace(' ', '').split(",")
        if friend
    }

    # "Not existing" means referenced-but-absent. The difference was
    # previously taken the other way round, which returned existing users
    # that nobody lists as a friend.
    return list(referenced_user_ids - existing_user_ids)
    

In [None]:
def delete_nodes(G, not_existing_user):
    """Remove the given nodes from ``G`` in place and return ``G``.

    Parameters
    ----------
    G : networkx graph
        Graph to prune; it is modified in place.
    not_existing_user : iterable
        Node ids to remove. Ids that are not in the graph are skipped.

    Returns
    -------
    The same graph object ``G``.
    """
    print('Nodes before: ' + str(len(G.nodes)))

    for user in not_existing_user:
        # Skip ids that are not in the graph explicitly rather than hiding
        # them behind a bare ``except``, which also swallowed unrelated
        # errors and KeyboardInterrupt.
        if user in G:
            G.remove_node(user)

    print('Nodes after: ' + str(len(G.nodes)))
    return G

In [None]:
def find_cliques(G, n):
    """Return the cliques reported by ``nx.find_cliques`` with more than ``n`` nodes.

    Parameters
    ----------
    G : graph accepted by ``nx.find_cliques``
    n : int
        Size threshold; only cliques whose length is strictly greater than
        ``n`` are kept.

    Returns
    -------
    list
        The retained cliques, in the order ``nx.find_cliques`` yields them.
        The number of retained cliques is also printed.
    """
    large_cliques = [members for members in nx.find_cliques(G) if len(members) > n]
    print(len(large_cliques))
    return large_cliques

In [None]:
def get_clique_dict(cliques):
    """Map every node to the size of the clique it was last seen in.

    Parameters
    ----------
    cliques : iterable of sized iterables
        Collections of node ids.

    Returns
    -------
    dict
        ``{node: {'group_len': size}}``. When a node occurs in several
        cliques, the size of the clique that comes last in ``cliques`` wins.
    """
    return {
        member: {'group_len': len(group)}
        for group in cliques
        for member in group
    }

In [None]:
def add_group_number(df, clique_dict):
    """Write each node's clique size into a ``group`` column of ``df``.

    The column is (re)created with 0 for every row, then the rows whose
    label appears in ``clique_dict`` receive that entry's ``'group_len'``.
    ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are the node ids.
    clique_dict : dict
        ``{node: {'group_len': int}}`` as produced by ``get_clique_dict``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    df['group'] = 0
    for user_id, info in clique_dict.items():
        df.at[user_id, 'group'] = info['group_len']
    return df

In [None]:
def add_ssc(df, scc):
    """Flag rows of ``df`` that belong to the given component(s) in an ``scc`` column.

    The column is (re)created with 0 for every row and set to 1 for every
    node id found in ``scc``. ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are the node ids.
    scc : iterable
        Either node ids, or components (``set`` / ``frozenset`` / ``list``
        of node ids) such as the items yielded by networkx's strongly
        connected component functions. Both forms may be mixed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    df['scc'] = 0
    for item in scc:
        # A whole component is not a valid scalar label for ``df.at``;
        # expand it so that each member node is flagged individually.
        members = item if isinstance(item, (set, frozenset, list)) else (item,)
        for node in members:
            df.at[node, 'scc'] = 1
    return df

In [None]:
def get_not_existing_user_of_graph(G, df):
    """Return the nodes of ``G`` that have no matching row in ``df``.

    Parameters
    ----------
    G : object exposing ``nodes``
        Graph whose node ids are checked.
    df : pandas.DataFrame
        Frame whose index holds the known user ids.

    Returns
    -------
    list
        Node ids of ``G`` that are absent from ``df.index``, in the graph's
        node order. The number of such nodes is also printed.
    """
    known_ids = df.index
    # A membership test on the index replaces the former ``df.loc[node]``
    # inside a bare ``except``: it does not build a row per node and does
    # not hide unrelated errors.
    result = [node for node in G.nodes if node not in known_ids]
    # Report how many nodes will be deleted (this used to print the total
    # node count of the graph).
    print('nodes to delete: ' + str(len(result)))
    return result

In [None]:
# n means how many users should be analyzed
# G = create_graph(df, n=100000)
G = create_directed_graph(df, n=len(df)-1)

In [None]:
not_existing_user = get_not_existing_user_of_graph(G, df)

In [None]:
G = delete_nodes(G, not_existing_user)

In [None]:
scc = list(nx.kosaraju_strongly_connected_components(G))
print(len(scc))

nx.draw(G_cliques)

clique_dict = get_clique_dict(cliques)

df_analyse = add_group_number(df, clique_dict)

In [None]:
df_analyse = add_ssc(df, scc)

df_analyse.group.value_counts().plot()

In [None]:
df_analyse.scc.value_counts().plot()

df_analyse.groupby('group').review_count.median().plot()

In [None]:
df_analyse.groupby('scc').review_count.median().plot(kind='bar')