# Word Graph Clustering

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine, EdgesAndLinkedNodes, NodesAndLinkedEdges, Square, LinearColorMapper, BasicTicker, ColorBar
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
from bokeh.palettes import Blues8, Reds8, Purples8, Oranges8, Viridis8, Spectral8, Inferno256, Viridis256, Category10, RdYlBu3, RdYlBu5, RdYlBu10
from bokeh.transform import linear_cmap, factor_mark, factor_cmap
from bokeh.models import EdgesAndLinkedNodes, NodesAndLinkedEdges

from networkx import edge_betweenness_centrality
from random import random

def most_central_edge(G):
    """Return an edge of ``G`` chosen by noisy edge-betweenness centrality.

    Each edge's betweenness is divided by the largest betweenness in the
    graph and then increased by an independent draw from ``random()``
    (uniform on [0, 1)). The edge with the largest perturbed score is
    returned.

    Args:
        G: Graph accepted by ``networkx.edge_betweenness_centrality``.

    Returns:
        The edge key with the highest perturbed score.
    """
    raw_scores = edge_betweenness_centrality(G)
    peak = max(raw_scores.values())
    noisy_scores = {}
    for edge, score in raw_scores.items():
        # Normalise against the peak first, then add the random jitter.
        noisy_scores[edge] = score / peak + random()
    return max(noisy_scores, key=noisy_scores.get)

# Switch Bokeh into notebook output mode for this session.
output_notebook()


In [None]:
# Load the full table from "dtm_all.csv"; later cells rely on it having an
# "id" column (presumably a document-term matrix keyed by listing id — confirm).
inc_df = pd.read_csv("dtm_all.csv")

In [None]:
import os

# The default is the original author's machine-specific location. Set the
# LISTINGS_CSV environment variable to load the listings file from elsewhere
# without editing the notebook.
LISTINGS_CSV = os.environ.get(
    "LISTINGS_CSV",
    r"C:\Users\billz\OneDrive-UWaterloo\OneDrive - University of Waterloo\Desktop\Waterloo\Data Open 2021\Datasets-20210712T225538Z-001\Datasets\listings.csv",
)
listings = pd.read_csv(LISTINGS_CSV)

In [None]:
# Listing ids grouped by the value of the "state" column (TX, LA, NC).
id_full_set_tx = set(listings.loc[listings["state"].eq("TX"), "id"])
id_full_set_la = set(listings.loc[listings["state"].eq("LA"), "id"])
id_full_set_nc = set(listings.loc[listings["state"].eq("NC"), "id"])

In [None]:
# Keep only the per-state ids that also appear in the "id" column of inc_df.
dtm_ids = set(inc_df["id"])
tx_id_set = id_full_set_tx & dtm_ids
la_id_set = id_full_set_la & dtm_ids
nc_id_set = id_full_set_nc & dtm_ids

In [None]:
# Split the full table into one id-indexed frame per state.
# `.loc` is given a sorted list rather than the raw set: pandas no longer
# accepts a set as an indexer, and sorting makes the row order reproducible.
# The id-indexed view is built once and reused for all three lookups.
inc_df_by_id = inc_df.set_index("id")
inc_df_tx = inc_df_by_id.loc[sorted(tx_id_set), :]
inc_df_la = inc_df_by_id.loc[sorted(la_id_set), :]
inc_df_nc = inc_df_by_id.loc[sorted(nc_id_set), :]

In [None]:
# Write each per-state frame to disk, moving the index back into a regular
# column first so it is stored as data rather than as a CSV index.
for state_frame, csv_path in (
    (inc_df_tx, "dtm_tx.csv"),
    (inc_df_la, "dtm_la.csv"),
    (inc_df_nc, "dtm_nc.csv"),
):
    state_frame.reset_index().to_csv(csv_path, index=False)

In [None]:
# Reload the per-state tables from disk, restoring "id" as the index.
# Without `index_col`, "id" comes back as an ordinary column and the index is
# positional. Later cells look ratings up by `.index` and treat every column
# except the trailing "rating" as a feature, so the id must live in the index.
inc_df_tx = pd.read_csv("dtm_tx.csv", index_col="id")
inc_df_la = pd.read_csv("dtm_la.csv", index_col="id")
inc_df_nc = pd.read_csv("dtm_nc.csv", index_col="id")

In [None]:
# For each state, list the columns whose column-wise total exceeds 1000.
column_totals_tx = inc_df_tx.sum()
column_totals_la = inc_df_la.sum()
column_totals_nc = inc_df_nc.sum()

col_names_tx = list(column_totals_tx[column_totals_tx > 1000].index)
col_names_la = list(column_totals_la[column_totals_la > 1000].index)
col_names_nc = list(column_totals_nc[column_totals_nc > 1000].index)

In [None]:
# Restrict each state's table to its high-total columns.
inc_df_tx_small = inc_df_tx.loc[:, col_names_tx]
inc_df_la_small = inc_df_la.loc[:, col_names_la]
inc_df_nc_small = inc_df_nc.loc[:, col_names_nc]

In [None]:
# Persist the reduced tables. The index is written only when it is a named
# index (for example listing ids), so those labels are not silently dropped.
# An unnamed positional index is left out, exactly as before.
for small_frame, csv_path in (
    (inc_df_tx_small, "dtm_tx_small.csv"),
    (inc_df_la_small, "dtm_la_small.csv"),
    (inc_df_nc_small, "dtm_nc_small.csv"),
):
    small_frame.to_csv(csv_path, index=small_frame.index.name is not None)

In [None]:
# Draw a random subset of rows from each reduced table. No seed is passed,
# so the selected rows differ from run to run.
SAMPLE_FRACTION = 0.2
inc_df_tx_small_sample = inc_df_tx_small.sample(frac=SAMPLE_FRACTION)
inc_df_la_small_sample = inc_df_la_small.sample(frac=SAMPLE_FRACTION)
inc_df_nc_small_sample = inc_df_nc_small.sample(frac=SAMPLE_FRACTION)

In [None]:
# Load the feedback scores and discard the first column of the file
# (presumably a saved positional index — confirm against the CSV).
target = pd.read_csv("feedback_score_3.csv").iloc[:, 1:]

In [None]:
# Attach a "rating" column to each sample by looking up "feedback_score" in
# `target` under the sample's index labels. `.values` strips the labels so
# the scores are assigned positionally, in the order of the lookup.
feedback_by_id = target.set_index("id")["feedback_score"]
inc_df_tx_small_sample["rating"] = feedback_by_id.loc[inc_df_tx_small_sample.index].values
inc_df_la_small_sample["rating"] = feedback_by_id.loc[inc_df_la_small_sample.index].values
inc_df_nc_small_sample["rating"] = feedback_by_id.loc[inc_df_nc_small_sample.index].values

In [None]:
# Discard every sampled row that contains at least one missing value.
inc_df_tx_small_sample = inc_df_tx_small_sample.dropna(axis=0, how="any")
inc_df_la_small_sample = inc_df_la_small_sample.dropna(axis=0, how="any")
inc_df_nc_small_sample = inc_df_nc_small_sample.dropna(axis=0, how="any")

In [None]:
# Build a square row-by-row matrix for each sample: entry (i, j) is the inner
# product of rows i and j over every column except the last one (the "rating"
# column appended to the samples earlier).
features_tx = inc_df_tx_small_sample.iloc[:, :-1].values
features_la = inc_df_la_small_sample.iloc[:, :-1].values
features_nc = inc_df_nc_small_sample.iloc[:, :-1].values

adj_mat_tx_small_sample = features_tx.dot(features_tx.T)
adj_mat_la_small_sample = features_la.dot(features_la.T)
adj_mat_nc_small_sample = features_nc.dot(features_nc.T)

In [None]:
# Turn each square matrix into a graph with one node per matrix row.
# NOTE(review): the matrices are row inner products, so their diagonals are
# likely nonzero, which would show up as self-loops — confirm this is intended.
G_tx = nx.from_numpy_array(adj_mat_tx_small_sample)
G_la = nx.from_numpy_array(adj_mat_la_small_sample)
G_nc = nx.from_numpy_array(adj_mat_nc_small_sample)

In [None]:
from networkx.algorithms.community import greedy_modularity_communities, asyn_fluidc

In [None]:
def _blank_network_figure(title, tooltips):
    """Return an empty figure with the tools and fixed axis ranges shared by both plots."""
    return figure(
        tooltips=tooltips,
        tools="pan,wheel_zoom,save,reset",
        active_scroll='wheel_zoom',
        x_range=Range1d(-10.1, 10.1),
        y_range=Range1d(-10.1, 10.1),
        title=title,
    )


def create_spectral_clustering(G, state_name, ratings=None):
    """Annotate ``G`` with communities and ratings, then save two Bokeh plots.

    Despite the name, the communities come from ``asyn_fluidc`` (five fluid
    communities); "spectral" refers to the ``nx.spectral_layout`` used to
    position the nodes. The function mutates ``G`` by setting the node
    attributes ``fluid_class``, ``fluid_color``, ``fluid_size``, ``degree``,
    ``index_rating`` and ``index_rating_size``.

    Two HTML files are written to the working directory:

    * ``"<title>.html"`` - nodes sized by rating and coloured by community.
    * ``"<title> (inverse).html"`` - nodes sized by degree and coloured by
      rating, with a colour bar.

    Args:
        G: networkx graph to annotate and draw.
        state_name: Label interpolated into the plot title and file names.
        ratings: Per-node ratings, paired with ``G.nodes`` in iteration order.
            When omitted, the ``"rating"`` column of the module-level
            ``inc_df_tx_small_sample`` is used. That is what this function
            always did before the parameter existed, so callers drawing any
            other graph should pass that graph's own ratings.

    Returns:
        None.
    """
    if ratings is None:
        # Backward-compatible default: the Texas sample.
        ratings = inc_df_tx_small_sample["rating"]

    # --- community attributes -------------------------------------------
    fluid_class = {}
    fluid_color = {}
    fluid_size = {}
    # RdYlBu5 holds exactly five colours, matching the five communities.
    for community_number, community in enumerate(asyn_fluidc(G, 5)):
        for name in community:
            fluid_class[name] = community_number
            fluid_color[name] = RdYlBu5[community_number]
            fluid_size[name] = community_number * 5
    nx.set_node_attributes(G, fluid_class, 'fluid_class')
    nx.set_node_attributes(G, fluid_color, 'fluid_color')
    nx.set_node_attributes(G, fluid_size, 'fluid_size')

    # --- degree and rating attributes -----------------------------------
    nx.set_node_attributes(G, name='degree', values=dict(nx.degree(G)))
    # zip stops at the shorter input, so surplus nodes or ratings are ignored.
    index_rating = dict(zip(G.nodes, ratings))
    index_rating_size = {node: rating * 20 for node, rating in index_rating.items()}
    nx.set_node_attributes(G, values=index_rating_size, name='index_rating_size')
    nx.set_node_attributes(G, values=index_rating, name='index_rating')

    title = f'Listings Graph by Comments Keyword Edges ({state_name})'
    hover_tooltips = [
        ("Sentiment Index", "@index_rating"),
        ("Fluid Class", "@fluid_class"),
        ("Fluid Color", "$color[swatch]:fluid_color"),
    ]

    # Both plots use the same node positions, so the layout is computed once
    # and handed to from_networkx as a node -> position mapping.
    # NOTE(review): weight=1 is kept from the original call. Graphs built with
    # nx.from_numpy_array store edge weights under 'weight', not under the key
    # 1 - confirm whether a weighted layout was intended.
    layout = nx.spectral_layout(G, scale=10, center=(0, 0), weight=1)

    # --- plot 1: size by rating, colour by community --------------------
    plot = _blank_network_figure(title, hover_tooltips)
    network_graph = from_networkx(G, layout)
    network_graph.node_renderer.glyph = Circle(
        size="index_rating_size",
        fill_color='fluid_color',
        # None draws no outline. The former "#ffffff00" literal was treated by
        # Bokeh as a column name and raised E-1001 (BAD_COLUMN_NAME).
        line_color=None,
        fill_alpha=0.7,
    )
    network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.5, line_width=0.05)
    plot.renderers.append(network_graph)
    save(plot, filename=f"{title}.html")

    # --- plot 2: size by degree, colour by rating -----------------------
    color_by_this_attribute = "index_rating"
    color_palette = Viridis256
    network_graph = from_networkx(G, layout)
    node_values = network_graph.node_renderer.data_source.data[color_by_this_attribute]
    minimum_value_color = min(node_values)
    maximum_value_color = max(node_values)
    # The colour bar spans the same range as the node colours so that it
    # describes the mapping actually drawn (it was fixed to 0..1 before).
    color_mapper = LinearColorMapper(
        palette=color_palette, low=minimum_value_color, high=maximum_value_color
    )
    color_bar = ColorBar(color_mapper=color_mapper, ticker=BasicTicker(), location=(0, 0))

    plot = _blank_network_figure(title, hover_tooltips)
    plot.add_layout(color_bar, 'right')
    network_graph.node_renderer.glyph = Circle(
        size="degree",
        fill_color=linear_cmap(
            color_by_this_attribute, color_palette, minimum_value_color, maximum_value_color
        ),
        line_color=None,
        fill_alpha=0.5,
    )
    network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.5, line_width=0.05)
    plot.renderers.append(network_graph)
    save(plot, filename=f"{title} (inverse).html")



In [None]:
# Draw and save the plots for the Texas graph, labelled "Austin" in the
# title and output file names.
create_spectral_clustering(G_tx, "Austin")

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : key "line_color" value "#ffffff00" [renderer: GlyphRenderer(id='40699', ...)]
ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : key "line_color" value "#ffffff00" [renderer: GlyphRenderer(id='40942', ...)]


In [None]:
# Draw and save the plots for the Louisiana graph, labelled "New Orleans".
# NOTE(review): no ratings are passed here, so the node ratings come from
# whatever create_spectral_clustering reads by default (the TX sample in its
# body) — confirm that is intended for the LA graph.
create_spectral_clustering(G_la, "New Orleans")

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : key "line_color" value "#ffffff00" [renderer: GlyphRenderer(id='38427', ...)]
ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : key "line_color" value "#ffffff00" [renderer: GlyphRenderer(id='38670', ...)]


In [None]:
# Draw and save the plots for the North Carolina graph. The label is spelled
# "Asheville" (it was misspelled "Ashville"), which also changes the title
# and the names of the saved HTML files.
# NOTE(review): no ratings are passed here, so the node ratings come from
# whatever create_spectral_clustering reads by default (the TX sample in its
# body) — confirm that is intended for the NC graph.
create_spectral_clustering(G_nc, "Asheville")

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : key "line_color" value "#ffffff00" [renderer: GlyphRenderer(id='41267', ...)]
ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name. This could either be due to a misspelling or typo, or due to an expected column being missing. : key "line_color" value "#ffffff00" [renderer: GlyphRenderer(id='41510', ...)]


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8e6d3dd7-a3ce-422e-8f25-57163d804f84' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>