In [None]:
import os
import gc
import shutil
import datetime
import prince

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import models.utils as ut

import ipywidgets as widgets
from IPython.display import display

from branca.colormap import linear

ut.cluster.hv.extension("plotly")

%load_ext autoreload
%autoreload 2

In [None]:
# Default size (width, height) applied to every matplotlib figure created below.
plt.rcParams["figure.figsize"] = (15, 5)

In [None]:
# Colour objects shared by the plots below
colors = ["red", "blue", "green", "orange", "black", "purple", "cyan"]

# Every scheme name registered on branca's `linear` object, mapped to the
# attribute of the same name (None when the attribute cannot be resolved).
colormaps = {name: getattr(linear, name, None) for name in linear.__dict__["_schemes"]}

# One branca colormap per cluster, in cluster order
clusterColormaps = [colormaps[name] for name in ("YlOrBr_03", "Blues_03", "Greens_08",
                                                 "RdPu_04", "Purples_04", "Reds_08",
                                                 "GnBu_03")]

# Colour scale names, one per cluster, in the same order as above
clusterColorscales = ["Oranges", "Blues", "Greens", "Magenta", "Purples", "Reds", "Teal"]

# Import data

In [None]:
# Build the path with os.path.join so the notebook also runs on non-Windows hosts
# (a raw "..\data\..." string is only a valid relative path on Windows).
filePath = os.path.join("..", "data", "NewYorkShootingCrimes.csv")

# The first row of the comma-separated file holds the column names
fullData = pd.read_csv(filePath, header=0, sep=",")
fullData.head(3)

In [None]:
# (latitude, longitude) used as the centre of every map
newYorkCoord = (40.7128, -74.0060)

# Load the geojson data. Paths are built with os.path.join so they resolve on
# every platform (the previous raw "..\data\..." strings only worked on Windows).
# NOTE(review): the second argument looks like the path to each feature's id — confirm in ut.geo.
precinctGeoJson = ut.geo.parseGeoJsonFile(os.path.join("..", "data", "NewYorkCityPolicePrecincts.geojson"),
                                          ["properties", "precinct"])
zipGeoJson = ut.geo.parseGeoJsonFile(os.path.join("..", "data", "NewYorkCityZipCodes.geojson"),
                                     ["properties", "ZIPCODE"])
neGeoJson = ut.geo.parseGeoJsonFile(os.path.join("..", "data", "NewYorkCityNeighborhoods.geojson"),
                                    ["properties", "ntaname"])

# Look up the geojson by the data frame column it describes
geoJsons = {"PRECINCT": precinctGeoJson, "postcode": zipGeoJson, "neighbourhood": neGeoJson}

In [None]:
def prependZeros(string, length):
    ''' Left-pad the given string with zeros up to the given length.

    Strings that are already `length` characters or longer are returned unchanged.
    '''
    return string.rjust(length, "0")

def getWeekDay(row: pd.Series):
    ''' Return the weekday label of the date stored in the given row.

    The row must provide integer OCCUR_YEAR, OCCUR_MONTH and OCCUR_DATE (day of month).
    The label is prefixed with the weekday index (Monday is 0) so that labels
    sort chronologically, e.g. "0_Monday".
    '''
    names = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
    occurredOn = datetime.datetime(row["OCCUR_YEAR"], row["OCCUR_MONTH"], row["OCCUR_DATE"])
    index = occurredOn.weekday()
    return f"{index}_{names[index]}"

In [None]:
# Remove the columns that are not used in this analysis
data = fullData.drop(columns=["INCIDENT_KEY", "X_COORD_CD", "Y_COORD_CD",
                              "house_number", "city", "residential", "building", "city_district",
                              "town", "village", "historic", "office", "hamlet", "county", "country",
                              "tourism", "shop", "leisure", "natural", "commercial", "retail", "industrial",
                              "road", "suburb", "state", "country_code", "amenity", "railway",
                              "man_made", "place", "aeroway", "municipality"])

# Add the weekday of each incident as a new column at position 6
data.insert(loc=6, column="WEEKDAY", value=data.apply(getWeekDay, axis=1))

# Store these columns as strings so that the algorithms treat them as categorical data
data = data.astype(dict.fromkeys(["JURISDICTION_CODE", "PRECINCT", "STATISTICAL_MURDER_FLAG",
                                  "OCCUR_MONTH", "OCCUR_DATE", "OCCUR_HOUR",
                                  "OCCUR_MINUTE", "OCCUR_SECOND", "postcode"], str))

# Zero-pad these values to two characters so that they sort correctly as strings
data = data.assign(**{name: data[name].map(lambda value: prependZeros(value, 2))
                      for name in ("OCCUR_MONTH", "OCCUR_DATE", "OCCUR_HOUR",
                                   "OCCUR_MINUTE", "OCCUR_SECOND", "JURISDICTION_CODE")})

print(data.shape)

In [None]:
# Print the remaining column names via the project helper
# (the 3 is presumably the number of display columns — confirm in ut.metrics).
ut.metrics.printCollectionInColumns(data.columns, 3)

---

In [None]:
# %%time
# def foo():
#     import traceback

#     addressDf = pd.DataFrame()
    
#     try:
#         print(f"Number of rows: {data.shape[0]}")
#         for i in range(data.shape[0]):
#             location = geolocator.reverse(f"{fullData.iloc[i].Latitude}, {fullData.iloc[i].Longitude}")
#             address = location.raw["address"]
#             print(f"[{i+1}] ---", location.address)
            
#             # Write to data frame
#             for addressPart, value in address.items():
#                 addressDf.loc[i, addressPart] = value
            
#             time.sleep(1)
#     except Exception as ex:
#         traceback.print_exc()

#     addressDf.to_csv(r"..\data\addresses.csv", index=False, mode="w")
#     return addressDf

# foo()

# Filter data by year(s)

In [None]:
# Hand-picked subset of column names
columns = ["OCCUR_MONTH", "OCCUR_HOUR", "PRECINCT", "WEEKDAY", "Latitude", "Longitude", "postcode"]

# Names of every categorical (object / bool / category) column of `data`
allCatColumns = list(data.select_dtypes(include=["object", "bool", "category"]).columns)

In [None]:
def getLocations(dataFrame: pd.DataFrame, lat: str, lon: str) -> [tuple]:
    ''' Return one (latitude, longitude) tuple per row of the data frame.

    `lat` and `lon` are the names of the columns holding the two coordinates.
    '''
    return list(map(tuple, dataFrame[[lat, lon]].to_numpy()))

In [None]:
# Notebook-level state filled in by `filterByYear`; everything stays None
# until the widget below has been run.
years = None
yearsStr = None
df = None
resultsPath = None

@widgets.interact_manual
def filterByYear(fromYear=sorted(data["OCCUR_YEAR"].unique()), toYear=sorted(data["OCCUR_YEAR"].unique())):
    ''' Keep only the incidents that occurred between fromYear and toYear (both inclusive).

    Sets the globals `years`, `yearsStr`, `df` (filtered rows without the
    OCCUR_YEAR column, re-indexed from 0) and `resultsPath` (created on disk
    under ../results/<yearsStr>).

    Returns the shape of the filtered data frame.
    Raises ValueError if fromYear is greater than toYear.
    '''
    global years
    global yearsStr
    global df
    global resultsPath

    # Equal years are valid (single-year selection), so only reject a reversed range
    if (fromYear > toYear):
        raise ValueError("fromYear must be less than or equal to toYear")

    years = np.arange(fromYear, toYear + 1, 1)
    yearsStr = f"{fromYear}" if (len(years) == 1) else f"{fromYear}-{toYear}"

    # Build a fresh frame instead of dropping in place on a filtered selection
    df = data[data["OCCUR_YEAR"].isin(years)].drop(columns="OCCUR_YEAR").reset_index(drop=True)

    # Construct results path; exist_ok avoids the check-then-create race
    resultsPath = os.path.abspath(os.path.join("..", "results", yearsStr))
    os.makedirs(resultsPath, exist_ok=True)
    print("Results will be stored in", resultsPath)

    return df.shape

In [None]:
@widgets.interact_manual
def plotHeatmapYear(saveMap=False):
    ''' Draw a heat map of the incident coordinates of the filtered data frame.

    When saveMap is set, the map is also written to "heatmap.html" inside the
    results folder. Returns the map object.
    '''
    worldMap = ut.geo.plotHeatmap(getLocations(df, "Latitude", "Longitude"),
                                  f"Year {yearsStr}", center=newYorkCoord)
    if (not saveMap):
        return worldMap

    mapFilePath = os.path.join(resultsPath, "heatmap.html")
    worldMap.save(mapFilePath)
    print("Saved to ", mapFilePath)
    return worldMap

## MCA

In [None]:
# Input of the MCA below: only the "neighbourhood" and "postcode" columns.
# NOTE(review): `df` is only set once the filterByYear widget has been run.
x = df[allCatColumns][["neighbourhood", "postcode"]]

In [None]:
# Fit a 10-component MCA on `x`; the fixed random_state keeps runs repeatable
mca = prince.MCA(
    n_components=10,
    n_iter=3,
    copy=True,
    check_input=True,
    engine="auto",
    random_state=42,
).fit(x)

In [None]:
# Leftover interactive lookup: prints the documentation of the plotting method.
help(mca.plot_coordinates)

In [None]:
# Plot the MCA coordinates of `x`: row points and all labels are switched off,
# only the column points are drawn.
ax = mca.plot_coordinates(X=x,
                          ax=None,
                          figsize=(10, 10),
                          show_row_points=False,
                          row_points_size=10,
                          show_row_labels=False,
                          show_column_points=True,
                          column_points_size=30,
                          show_column_labels=False,
                          legend_n_cols=2)

In [None]:
# Show the fitted model's total_inertia_ and explained_inertia_ attributes.
mca.total_inertia_, mca.explained_inertia_

# Data Exploration

In [None]:
# Folder that receives every figure of the exploration section.
# NOTE: `resultsPath` is only set once the filterByYear widget has been run.
explPath = os.path.join(resultsPath, "exploration")
# exist_ok replaces the check-then-create pattern, which can race with other processes
os.makedirs(explPath, exist_ok=True)
print(explPath)

In [None]:
# Scatter the filtered incidents on a map; the dict passes "color": "PRECINCT" and "zoom": 10 to the helper.
ut.geo.plotScatterOnMap(df, "Latitude", "Longitude", {"color": "PRECINCT", "zoom": 10})

### Cramers V & Theil Uncertainty

In [None]:
@widgets.interact_manual
def correlation(dataFrame=widgets.fixed(df), correlationMethod=["Cramers V", "Theil Uncertainty"],
                width=(5, 100, 1), height=(5, 100, 1), saveFigure=False):
    ''' Draw an annotated heat map of the pairwise correlation of the data frame columns.

    correlationMethod selects the measure: "Cramers V" maps to "cramersV",
    any other value to "theilU". width and height give the figure size.
    When saveFigure is set, the figure is written to the exploration folder
    as "<correlationMethod>.png".
    '''
    correlationFunc = "theilU" if (correlationMethod != "Cramers V") else "cramersV"
    plt.subplots(figsize=(width, height))

    # Compute the correlation values and draw them on the current axes
    sns.heatmap(ut.metrics.getCorrelationDf(dataFrame, correlationFunc), annot=True).set_title(yearsStr)

    if (not saveFigure):
        return

    correlationFilePath = os.path.join(explPath, f"{correlationMethod}.png")
    plt.savefig(correlationFilePath)
    print("Saved to", correlationFilePath)

### Heatmap

In [None]:
# Switch the plotting extension to "bokeh" for the heat maps below
# (`hv` is presumably holoviews — confirm in models.utils).
ut.cluster.hv.extension("bokeh")

In [None]:
@widgets.interact_manual
def heatmap(dataFrame=widgets.fixed(df), c1=allCatColumns, c2=allCatColumns, 
            width=(200, 1200, 50), height=(200, 1200, 50), saveFigure=False, **kwargs):
    ''' Build the frequency heat map of column c1 against column c2.

    The axis values are taken from the unfiltered `data` frame, the counts
    from `dataFrame`. When saveFigure is set, the layout is also saved as
    "Frequency_<c1>_by_<c2>.html" in the "heatmap_frequency" sub-folder of the
    exploration folder. Extra keyword arguments are accepted but not used.

    Returns the layout object.
    '''
    layout = ut.heatmap.freqHeatMap(dataFrame, c1, c2, data[c1].unique(), data[c2].unique(),
                                    title=f"Frequency {c1} by {c2}",
                                    width=width, height=height, cmap="magma", tools=["hover"])
    if (not saveFigure):
        return layout

    heatmapFolderPath = os.path.join(explPath, "heatmap_frequency")
    if (not os.path.exists(heatmapFolderPath)):
        os.makedirs(heatmapFolderPath)

    heatmapFilePath = os.path.join(heatmapFolderPath, f"Frequency_{c1}_by_{c2}.html")
    ut.heatmap.hv.save(layout, heatmapFilePath)
    print("Saved to", heatmapFilePath)
    return layout

In [None]:
def saveHeatmaps(heatmapFunc, fullDataFrame, dataFrame, columns,
                 width=600, height=600, saveFigure=False, fixed2ndColumns=None, **kwargs):
    ''' Call heatmapFunc once for every pair of the given columns.

    For each pair the order (1st column, 2nd column) is chosen as follows:
      * a column listed in fixed2ndColumns is always the 2nd column, whichever
        position it has in the pair (if both are listed, the one that comes
        first in `columns` is the 2nd column);
      * otherwise the column with more distinct values in fullDataFrame is the
        1st column (on a tie, the column that comes later in `columns`).

    heatmapFunc is called as
    heatmapFunc(dataFrame, c1=..., c2=..., width=..., height=..., saveFigure=..., **kwargs).
    '''
    from itertools import combinations
    ut.heatmap.hv.extension("bokeh")

    columns = list(columns)
    fixed = set(fixed2ndColumns) if (fixed2ndColumns is not None) else set()
    # Count the distinct values once per column instead of once per pair
    cardinality = {col: len(fullDataFrame[col].unique()) for col in columns}

    # Decide the column order of every pair
    combs = []
    for c1, c2 in combinations(columns, 2):
        if (c1 in fixed):
            combs.append((c2, c1))
        elif (c2 in fixed):
            # Previously this case fell through to the cardinality rule, which
            # could put a "fixed 2nd" column in the 1st position
            combs.append((c1, c2))
        elif (cardinality[c1] > cardinality[c2]):
            combs.append((c1, c2))
        else:
            combs.append((c2, c1))

    # Produce (and optionally save) one heatmap per ordered pair
    for c1, c2 in combs:
        heatmapFunc(dataFrame, c1=c1, c2=c2, width=width, height=height, 
                    saveFigure=saveFigure, **kwargs)

In [None]:
@widgets.interact_manual
def saveHeatmapsWidget():
    ''' Save the frequency heat map of every pair of categorical columns of the filtered data. '''
    alwaysSecond = ["OCCUR_HOUR", "OCCUR_MONTH", "OCCUR_DATE", "WEEKDAY"]
    saveHeatmaps(heatmap, data, df, allCatColumns,
                 width=1000, height=1500, saveFigure=True, fixed2ndColumns=alwaysSecond)

### Column Frequencies

In [None]:
@widgets.interact_manual
def barChart(dataFrame=widgets.fixed(df), hue=widgets.fixed(None), exportPath=widgets.fixed(explPath),
             column=allCatColumns, width=(5, 25, 1), height=(5, 25, 1), 
             sortBy=widgets.RadioButtons(options=["both", "label", "count"]), 
             saveFigure=False):
    ''' Draw the value frequencies of one column as a bar chart.

    sortBy selects the bar order: "label" (sorted labels), "count" (most
    frequent first) or "both" (two stacked charts, label order on top).
    When saveFigure is set, the figure is written to the "column_frequencies"
    sub-folder of exportPath as "<column>_frequency_cluster.png".
    '''
    sns.set(style="darkgrid")
    fig, axes = plt.subplots(2 if (sortBy == "both") else 1, 1, figsize=(width, height))

    byCount = dataFrame[column].value_counts().index
    byLabel = sorted(byCount)

    # Pair every axes object with the bar order it has to show
    if (sortBy == "label"):
        panels = [(axes, byLabel)]
    elif (sortBy == "count"):
        panels = [(axes, byCount)]
    else:
        panels = [(axes[0], byLabel), (axes[1], byCount)]

    for ax, order in panels:
        sns.countplot(x=dataFrame[column], hue=hue, order=order, ax=ax)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")

    fig.tight_layout()

    if (not saveFigure):
        return

    colFreqFolderPath = os.path.join(exportPath, "column_frequencies")
    if (not os.path.exists(colFreqFolderPath)):
        os.makedirs(colFreqFolderPath)

    colFreqFilePath = os.path.join(colFreqFolderPath, f"{column}_frequency_cluster.png")
    fig.savefig(colFreqFilePath)
    print("Saved to", colFreqFilePath)

In [None]:
def saveBarCharts(barChartFunc, columns, dataFrame, exportPath, hue=None, 
                  width=25, height=15, sortBy="both", saveFigure=False):
    ''' Call barChartFunc once per column, forwarding all other arguments by keyword. '''
    sharedArgs = dict(dataFrame=dataFrame, hue=hue, exportPath=exportPath,
                      width=width, height=height, sortBy=sortBy, saveFigure=saveFigure)
    for name in columns:
        barChartFunc(column=name, **sharedArgs)

In [None]:
@widgets.interact_manual
def saveFiguresWidget():
    ''' Save the frequency bar charts (both sort orders) of every categorical column. '''
    saveBarCharts(barChart, allCatColumns, df, explPath, hue=None,
                  width=50, height=25, sortBy="both", saveFigure=True)

### Choropleth

In [None]:
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Mapbox access token, read from the environment so it is not stored in the notebook
# (None when MAPBOX_TOKEN is not set).
mapboxToken = os.getenv("MAPBOX_TOKEN")

In [None]:
@widgets.interact_manual
def plotChoropleth(dataFrame=widgets.fixed(df),
                   colorscale=widgets.fixed("ylorrd"), 
                   colormap=widgets.fixed(clusterColormaps[0]),
                   feature=["PRECINCT", "neighbourhood", "postcode"],
                   value=widgets.Text(),
                   hours=widgets.SelectMultiple(options=sorted(df.OCCUR_HOUR.unique()), value=["00"]),
                   includeEmpty=False,
                   method=widgets.RadioButtons(options=["leaflet", "plotly"]), 
                   saveFigure=False):
    ''' Draw a choropleth of the number of incidents per region.

    dataFrame    -- incidents; only rows whose OCCUR_HOUR is in `hours` are counted
    colorscale   -- accepted for compatibility, currently not used
    colormap     -- colormap handed to the leaflet plot
    feature      -- column to aggregate by; also selects the geojson to draw
    value        -- optional comma-separated list of feature values to show
                    (surrounding blanks are ignored); empty string shows all
    includeEmpty -- forwarded to the plotting helper
    method       -- "leaflet", anything else uses plotly
    saveFigure   -- write "<feature>_choropleth.html" to the exploration folder
                    (plotly only)

    Returns the plot object.
    Raises ValueError if one of the requested values does not occur in the data.
    '''
    plotObj = None
    dataFrame = dataFrame[dataFrame.OCCUR_HOUR.isin(hours)]

    values = dict(dataFrame[feature].value_counts())
    geoJson = geoJsons[feature]

    # Only display the specified values
    if (value != ""):
        # Strip the tokens so that "47, 52" is accepted like "47,52"
        tokens = {token.strip() for token in value.split(",")}
        values = {key: count for key, count in values.items() if key in tokens}
        if (len(values) != len(tokens)):
            raise ValueError(f"Value {value} is invalid for feature {feature}.")

    if (method == "leaflet"):
        # The doubled braces keep "{0}" and "{1}" as placeholders in the label;
        # un-escaped they were rendered as the literal digits 0 and 1.
        # NOTE(review): assumes plotLeaflet fills the label via str.format — confirm.
        plotObj = ut.geo.plotLeaflet(geoJson, values, None, includeEmpty=includeEmpty,
                                     heatmapArgs={"name": f"{feature} Choropleth {yearsStr}", "colormap": colormap},
                                     mapArgs={"center": newYorkCoord, "zoom": 10,
                                              "basemap": ut.geo.lf.basemaps.CartoDB.DarkMatter},
                                     labelFormat=f"{feature}={{0}} - Count={{1}}")
    else:
        valuesDf = pd.DataFrame(values.items(), columns=[feature, "count"])
        plotObj = ut.geo.plotPlotly(geoJson, valuesDf, feature, "count", title=f"{feature} {yearsStr}",
                                    center={"lat": newYorkCoord[0], "lon": newYorkCoord[1]}, 
                                    zoom=9, labels={'count':'number of incidents'}, includeEmpty=includeEmpty)
    if (saveFigure):
        if (method == "leaflet"):
            print("Can only save with method \"plotly\"")
        else:
            choroplethFilePath = os.path.join(explPath, f"{feature}_choropleth.html")
            ut.geo.plotly.offline.plot(plotObj, filename=choroplethFilePath, auto_open=False)
            print("Saved to", choroplethFilePath)

    return plotObj

In [None]:
def getCoordinates(coordinates, coords):
    ''' Flatten a nested coordinate array into a list of position tuples.

    coordinates -- arbitrarily nested lists whose innermost lists are
                   positions, i.e. lists that start with a number
    coords      -- output list; one tuple per position is appended to it

    A list is treated as a position as soon as its first element is an int or
    a float, whatever its length, so positions with a third component are
    supported too. Empty lists are ignored.
    '''
    if (len(coordinates) == 0):
        return

    # The previous test `len == 2 and is float or is int` bound as
    # `(len == 2 and is float) or is int`, so longer float positions were
    # recursed into and failed on len(float)
    if (isinstance(coordinates[0], (int, float))):
        coords.append(tuple(coordinates))
        return

    for c in coordinates:
        getCoordinates(c, coords)

def findCentroidGeoJson(coordinates):
    ''' Return the (latitude, longitude) average of all positions of a nested coordinate array.

    Every position is read as (longitude, latitude, ...): element 0 feeds the
    longitude average and element 1 the latitude average.
    '''
    points = []
    getCoordinates(coordinates, points)

    latitudes = [point[1] for point in points]
    longitudes = [point[0] for point in points]

    latCentre = sum(latitudes) / len(latitudes)
    lonCentre = sum(longitudes) / len(longitudes)
    return latCentre, lonCentre
    
#findCentroid(precinctGeoJson["features"][0]["geometry"]["coordinates"])

In [None]:
def plotCentroids(geoJson):
    ''' Build a Scattermapbox trace with one marker per feature of the geojson.

    Each marker sits at the centroid of the feature's geometry coordinates and
    carries the feature's "id" as text.
    '''
    features = geoJson["features"]
    centres = [findCentroidGeoJson(feature["geometry"]["coordinates"]) for feature in features]

    return go.Scattermapbox(lat=[lat for lat, _ in centres],
                            lon=[lon for _, lon in centres],
                            marker=dict(color='white', opacity=1, size=25),
                            mode='markers+text',
                            showlegend=False,
                            hoverinfo='skip',
                            text=[feature["id"] for feature in features])

In [None]:
def selectPrecincts(df, precincts):
    ''' Show a map that highlights the given precincts.

    All precincts occurring in df.PRECINCT are drawn; those contained in
    `precincts` form the "Selected" layer (and get a centroid marker with
    their id), all others the "Non-Selected" layer.
    '''
    # Every precinct gets the same dummy count; only the layer's colour scale matters
    countsDf = pd.DataFrame({"precinct": df.PRECINCT.unique(), "count": 0})
    isSelected = countsDf["precinct"].isin(precincts)

    def _layer(name, frame, colorscale, **extra):
        ''' Choropleth layer of the given precinct rows. '''
        return go.Choroplethmapbox(name=name,
                                   z=frame['count'],
                                   locations=frame['precinct'],
                                   below=True,
                                   colorscale=colorscale,
                                   showscale=False,
                                   showlegend=True,
                                   geojson=precinctGeoJson,
                                   marker_line_width=0.1, marker_opacity=1,
                                   **extra)

    nonSelectedChoro = _layer("Non-Selected", countsDf[~isSelected], "OrRd_r",
                              hovertemplate="Precinct: %{location}")
    selectedChoro = _layer("Selected", countsDf[isSelected], "Greys")

    # Centroid markers only for the selected precincts
    selectedFeatures = [f for f in precinctGeoJson["features"] if f["id"] in precincts]
    centroids = plotCentroids({"features": selectedFeatures})

    layout = dict(mapbox_style="light",
                  mapbox_center={"lat": newYorkCoord[0], "lon": newYorkCoord[1]},
                  margin=dict(l=0, t=0, r=0, b=0, pad=0),
                  mapbox_zoom=9,
                  mapbox=dict(accesstoken=mapboxToken))

    go.Figure(data=[nonSelectedChoro, selectedChoro, centroids], layout=layout).show()

In [None]:
def pp(populationCsvPath=r"C:\Users\huybn\Downloads\nyc_precinct_population.csv"):
    ''' Show a choropleth of the population of every precinct, with the precinct ids as markers.

    populationCsvPath -- CSV file with the columns "precinct" and "population".
    NOTE(review): the default is an absolute path on one developer's machine,
    kept only for backward compatibility — move the file next to the other
    data files and pass its path here.
    '''
    precinctPopDf = pd.read_csv(populationCsvPath, header=0, sep=",")
    choropleth = go.Choroplethmapbox(z=precinctPopDf['population'],
                            locations = precinctPopDf['precinct'], 
                            colorscale="ylorrd",
                            below=True,
                            geojson = precinctGeoJson,
                            hoverinfo ='none',
                            marker_line_width=0.1, marker_opacity=1) 
    centroids = plotCentroids(precinctGeoJson)

    layout = dict(mapbox_style="light",
                  mapbox_center = {"lat": newYorkCoord[0], "lon": newYorkCoord[1]},
                  margin=dict(l=0, t=0, r=0, b=0, pad=0),
                  mapbox=dict(accesstoken=mapboxToken, zoom=9))
    fig = go.Figure(data=[choropleth, centroids], layout=layout)
    fig.show()
pp()

# Frequent Patterns Analysis

In [None]:
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import apriori

In [None]:
# Folder that receives the results of the frequent pattern analysis.
# NOTE: `resultsPath` is only set once the filterByYear widget has been run.
fpPath = os.path.join(resultsPath, "frequentPatterns")
# exist_ok replaces the check-then-create pattern, which can race with other processes
os.makedirs(fpPath, exist_ok=True)
print(fpPath)

In [None]:
# Result of the last `frequentPatterns` run; None until the widget has been run
itemsDf = None

def _parseTokenSet(text):
    ''' Split a comma-separated string into a set of stripped tokens; None for the empty string. '''
    if (text == ""):
        return None
    return {token.strip() for token in text.split(",")}

@widgets.interact_manual
def frequentPatterns(dataFrame=widgets.fixed(df), 
                     columns=widgets.SelectMultiple(options=allCatColumns),
                     minSupport="0.6",
                     frequentPatternMethod=["fpgrowth", "apriori"],
                     includeOnlyValues="",
                     excludeOnlyValues="",
                     minItemSetCount=widgets.IntSlider(min=0, max=20, step=1, value=0),
                     maxItemSetCount=widgets.IntSlider(min=0, max=20, step=1, value=0),
                     save=False):
    ''' Mine frequent item sets from the one-hot encoded data frame.

    columns               -- columns to encode; an empty selection uses all columns
    minSupport            -- minimum support as text, must parse to a float in [0, 1]
    frequentPatternMethod -- "fpgrowth", anything else uses apriori
    includeOnlyValues     -- comma-separated items; keep only patterns containing at least one
    excludeOnlyValues     -- comma-separated items; drop patterns containing any of them
    minItemSetCount /
    maxItemSetCount       -- inclusive bounds on the pattern size; ignored when both are 0
    save                  -- write the result as CSV into the frequent patterns folder

    The result is stored in the global `itemsDf` (one row per pattern with the
    columns support, size, numRows, item_1 ... item_n; items sorted within a
    row), displayed, and its shape printed.

    Raises ValueError for an out-of-range minSupport or when minItemSetCount
    is greater than maxItemSetCount.
    '''
    global itemsDf
    minSupport = float(minSupport)
    if (minSupport < 0.0 or minSupport > 1.0):
        raise ValueError("Minimum support must be between 0 and 1 inclusive.")

    if (minItemSetCount > maxItemSetCount):
        raise ValueError("maxItemSetCount must be greater than or equal to minItemSetCount")

    fqFunc = fpgrowth if (frequentPatternMethod == "fpgrowth") else apriori

    includeOnlyValues = _parseTokenSet(includeOnlyValues)
    excludeOnlyValues = _parseTokenSet(excludeOnlyValues)

    # Convert data frame to one-hot encoded data frame
    if (len(columns) > 0):
        dataFrame = dataFrame[list(columns)]
    dummiesDf = pd.get_dummies(dataFrame)

    # Run the frequent pattern algorithm, most frequent patterns first
    patternsDf = fqFunc(dummiesDf, min_support=minSupport, use_colnames=True)
    patternsDf = patternsDf.sort_values("support", ascending=False).reset_index(drop=True)

    # Filter by the number of items
    if (len(patternsDf["itemsets"]) > 0 and (minItemSetCount != 0 or maxItemSetCount != 0)):
        itemSetsLen = patternsDf["itemsets"].str.len()
        mask = (itemSetsLen >= minItemSetCount) & (itemSetsLen <= maxItemSetCount)
        patternsDf = patternsDf.loc[mask]

    # One record per pattern, built in a single pass. The former cell-by-cell
    # fill used chained assignment (itemsDf.loc[i][col] = ...), which is not
    # guaranteed to write into the frame.
    numRowsTotal = dataFrame.shape[0]
    records = []
    for support, itemSet in zip(patternsDf["support"], patternsDf["itemsets"]):
        items = sorted(itemSet)
        record = {"support": support, "size": len(items), "numRows": support * numRowsTotal}
        record.update({f"item_{pos}": item for pos, item in enumerate(items, start=1)})
        records.append(record)

    maxSize = max((record["size"] for record in records), default=0)
    itemColumns = ["support", "size", "numRows"] + [f"item_{i+1}" for i in range(maxSize)]
    itemsDf = pd.DataFrame(records, columns=itemColumns, index=patternsDf.index)

    # Keep only the patterns that contain at least one of the includeOnlyValues
    if (includeOnlyValues is not None):
        print(includeOnlyValues)
        itemsDf = itemsDf[itemsDf.isin(list(includeOnlyValues)).any(axis=1)]

    # Drop the patterns that contain any of the excludeOnlyValues
    if (excludeOnlyValues is not None):
        print(excludeOnlyValues)
        itemsDf = itemsDf[~itemsDf.isin(list(excludeOnlyValues)).any(axis=1)]

    # Drop the columns that are empty in every remaining row
    itemsDf = itemsDf.dropna(axis=1, how="all")

    if (save):
        fpFilePath = os.path.join(fpPath, f"{frequentPatternMethod}_{minSupport}_{len(itemsDf)}.csv")
        itemsDf.to_csv(fpFilePath, index=False)
        print("Saved to", fpFilePath)

    display(itemsDf)
    print(itemsDf.shape)
    gc.collect()

In [None]:
def getValuesFromItemsDf(itemsDf, column, prefix):
    ''' Return the values of one item column with the "<prefix>_" part split off.

    Example: "PRECINCT_47" with prefix "PRECINCT" yields "47".
    '''
    separator = f"{prefix}_"
    values = []
    for item in itemsDf[column]:
        values.append(item.split(separator)[1])
    return values
#getValuesFromItemsDf(itemsDf, "item_2", "PRECINCT")

In [None]:
def foo():
    ''' Frequency heat map of PRECINCT by WEEKDAY, limited to the precincts found in item_2 of itemsDf. '''
    selected = df[df.PRECINCT.isin(getValuesFromItemsDf(itemsDf, "item_2", "PRECINCT"))]
    return ut.heatmap.freqHeatMap(selected, "PRECINCT", "WEEKDAY",
                                  selected.PRECINCT.unique(), selected.WEEKDAY.unique(),
                                  width=800, height=800, tools=["hover"], colorbar=True, cmap="viridis")
foo()

In [None]:
def foo(itemsDf, df):
    ''' Return the rows of df that match at least one (hour, precinct, weekday) pattern of itemsDf.

    item_1 and item_2 contribute the text after their last underscore as hour
    and precinct, item_3 the text after its first underscore as weekday.
    '''
    def _clause(pattern):
        ''' Query clause selecting the incidents of one pattern. '''
        hour = pattern["item_1"].rsplit("_", 1)[-1]
        precinct = pattern["item_2"].rsplit("_", 1)[-1]
        weekday = pattern["item_3"].partition("_")[2]
        return f'(OCCUR_HOUR == "{hour}" and PRECINCT == "{precinct}" and WEEKDAY == "{weekday}")'

    clauses = [_clause(pattern) for _, pattern in itemsDf.iterrows()]
    return df.query(" or ".join(clauses))

In [None]:
def bar(itemsDf, df):
    ''' Return the rows of df whose WEEKDAY occurs in the item_3 column of itemsDf.

    The item_3 values are expected to look like "WEEKDAY_<weekday>".
    '''
    labels = itemsDf["item_3"].dropna().unique()
    weekdays = [label.split("WEEKDAY_")[1] for label in labels]
    return df[df.WEEKDAY.isin(weekdays)]

In [None]:
# Frequency heat map of WEEKDAY by OCCUR_HOUR for the incidents on the weekdays
# that occur in the mined patterns (see `bar`).
ut.heatmap.hv.extension("bokeh", logo=False)
ut.heatmap.freqHeatMap(bar(itemsDf, df), "WEEKDAY", "OCCUR_HOUR", 
                       width=800, height=800, tools=["hover"], colorbar=True, cmap="viridis")

In [None]:
# def plotChoropleth(dataFrame=widgets.fixed(df),
#                    colorscale=widgets.fixed("ylorrd"), 
#                    colormap=widgets.fixed(clusterColormaps[0]),
#                    feature=["PRECINCT", "neighbourhood", "postcode"],
#                    value=widgets.Text(),
#                    hours=widgets.SelectMultiple(options=sorted(df.OCCUR_HOUR.unique()), value=["00"]),
#                    includeEmpty=False,
#                    method=widgets.RadioButtons(options=["leaflet", "plotly"]), 
#                    saveFigure=False):

# plotChoropleth(dataFrame=df, colorscale="ylorrd", colormap=clusterColormaps[0], value="",
#                feature="PRECINCT", hours=df.OCCUR_HOUR.unique(), method="plotly")

# Precinct choropleth (plotly, all hours) of the incidents matching the mined patterns.
plotChoropleth(dataFrame=foo(itemsDf, df), colorscale="ylorrd", colormap=clusterColormaps[0], value="",
               feature="PRECINCT", hours=df.OCCUR_HOUR.unique(), method="plotly")

In [None]:
# Text of a list literal with the distinct item_2 suffixes (the part after the first "_"), sorted by label.
"[" + ",".join(['"'+i.split("_")[1]+'"' for i in itemsDf["item_2"].value_counts().sort_index().index]) + "]"

In [None]:
# Highlight on the map the precincts that occur in the item_2 column of the mined patterns.
selectPrecincts(df, getValuesFromItemsDf(itemsDf, "item_2", "PRECINCT"))

In [None]:
def foo():
    ''' Add the boolean column IS_WEEKEND to the global `df`.

    The value is True where WEEKDAY is "5_Saturday" or "6_Sunday". The column
    is written into `df` in place; re-running the cell recomputes it.
    '''
    global df
    # Vectorised membership test instead of a Python-level call per row;
    # it also yields a proper (empty) boolean column for an empty frame
    df["IS_WEEKEND"] = df["WEEKDAY"].isin(["5_Saturday", "6_Sunday"])
foo()

In [None]:
# Scatter the incidents on the map, coloured by the IS_WEEKEND column added above.
ut.geo.plotScatterOnMap(df, "Latitude", "Longitude", scatterKwargs={"color": "IS_WEEKEND"})

In [None]:
def findCentroid(coords):
    ''' Return the (latitude, longitude) average of a sequence of (latitude, longitude) pairs. '''
    count = len(coords)
    latTotal = sum([point[0] for point in coords])
    lonTotal = sum([point[1] for point in coords])
    return latTotal / count, lonTotal / count

In [None]:
def foo(df, *columns):
    ''' Plot the centroid of the incidents of every value combination of the given columns.

    df      -- incidents with numeric Latitude and Longitude columns
    columns -- one or more column names; every combination of their values
               that occurs in df yields one point, labelled with the values
               joined by "-"

    Returns whatever ut.geo.plotScatterOnMap returns.
    Raises ValueError if no column is given.
    '''
    if (len(columns) == 0):
        raise ValueError("At least one column is required")

    # Mean position per observed combination. Grouping only visits
    # combinations that have rows, so no empty selection is averaged, and it
    # replaces the hand-built query strings, which were only well-formed for
    # exactly two columns.
    centroidDf = df.groupby(list(columns), sort=False)[["Latitude", "Longitude"]].mean()

    # Collect plain records and build the frame once (DataFrame.append no
    # longer exists in pandas 2)
    records = []
    for key, centroid in centroidDf.iterrows():
        values = key if isinstance(key, tuple) else (key,)
        records.append({"lat": centroid["Latitude"], "lon": centroid["Longitude"],
                        "color": "-".join(str(value) for value in values)})
    tempDf = pd.DataFrame(records, columns=["lat", "lon", "color"])

    return ut.geo.plotScatterOnMap(tempDf, "lat", "lon", scatterKwargs={"color": "color"})

foo(df, "OCCUR_HOUR", "WEEKDAY")

In [None]:
# Number of weekend (True) and weekday (False) incidents within the listed precincts.
df[df.PRECINCT.isin(["47", "52", "49", "46", "48", "44", "42", "40", "43", "28", "23", "114"])].IS_WEEKEND.value_counts()

In [None]:
# Summing the per-value counts gives the total number of incidents (with a non-null IS_WEEKEND) in the listed precincts.
df[df.PRECINCT.isin(["90", "79", "81", "73", "75", "106", "67", "70"])].IS_WEEKEND.value_counts().sum()

In [None]:
# Heat map of upper and lower regions
def foo():
    ''' Frequency heat map of item_1 by item_3 for the patterns whose item_2 is not one of the listed precincts. '''
    excluded = ["PRECINCT_47", "PRECINCT_52", "PRECINCT_49", "PRECINCT_46",
                "PRECINCT_48", "PRECINCT_44", "PRECINCT_42", "PRECINCT_40",
                "PRECINCT_43", "PRECINCT_28", "PRECINCT_23", "PRECINCT_114"]
    remaining = itemsDf[~itemsDf.item_2.isin(excluded)]
    f = remaining.sort_values(["item_1", "item_3"]).drop("item_2", axis=1)

    ut.heatmap.hv.extension("bokeh", logo=False)
    # The axis values come from all patterns, not only from the remaining ones
    return ut.heatmap.freqHeatMap(f, "item_1", "item_3", itemsDf.item_1.unique(), itemsDf.item_3.unique(),
                                  width=1200, height=800, tools=["hover"], colorbar=True, cmap="viridis")
# ut.heatmap.hv.save(foo(), r'.\lower.html')

In [None]:
# Patterns whose item_2 is one of the listed precincts, sorted by item_1 then item_3, shown without item_2.
itemsDf[itemsDf.item_2.isin(["PRECINCT_47", "PRECINCT_52", "PRECINCT_49", "PRECINCT_46", 
                              "PRECINCT_48", "PRECINCT_44", "PRECINCT_42", "PRECINCT_40", 
                              "PRECINCT_43", "PRECINCT_28", "PRECINCT_23", "PRECINCT_114"])]\
  .sort_values(["item_1","item_3"]).drop("item_2", axis=1)

In [None]:
# Complement of the selection above: patterns whose item_2 is NOT one of the listed precincts.
itemsDf[~itemsDf.item_2.isin(["PRECINCT_47", "PRECINCT_52", "PRECINCT_49", "PRECINCT_46", 
                              "PRECINCT_48", "PRECINCT_44", "PRECINCT_42", "PRECINCT_40", 
                              "PRECINCT_43", "PRECINCT_28", "PRECINCT_23", "PRECINCT_114"])]\
  .sort_values(["item_1","item_3"]).drop("item_2", axis=1)

In [None]:
def foo():
    ''' Precinct choropleth (plotly, all hours) limited to the precincts that occur in item_2 of itemsDf. '''
    labels = itemsDf["item_2"].value_counts().sort_index().index
    v = ",".join(label.split("_")[1] for label in labels)
    return plotChoropleth(dataFrame=df, colorscale="ylorrd", colormap=clusterColormaps[0], value=v,
                          feature="PRECINCT", hours=df.OCCUR_HOUR.unique(), method="plotly")
foo()

In [None]:
# Bar chart of how many patterns contain each item_1 value, ordered by label.
itemsDf["item_1"].value_counts().sort_index().plot.bar()

# Cluster Analysis

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Spot-check three random rows of `data` before clustering.
data.sample(3)

In [None]:
# Spatial-only DBSCAN baseline. The haversine metric expects (lat, lon) in radians and measures
# distance in radians, so eps = 1.5 / 6371 corresponds to 1.5 km on an Earth of radius 6371 km.
dbscan = DBSCAN(eps=1.5/6371., min_samples=5, algorithm='ball_tree', metric='haversine')
# Only the first 1000 incidents are clustered; `l` holds one label per clustered row.
l = dbscan.fit_predict(np.radians(df[["Latitude", "Longitude"]])[:1000])

In [None]:
# Number of points assigned to each DBSCAN label, as a {label: count} dict.
u, c = np.unique(l, return_counts=True)
dict(zip(u, c))

In [None]:
# Plot the first 1000 incidents coloured by their DBSCAN label.
# .copy() gives an independent frame: adding a column to a slice of df is chained
# assignment and triggers pandas' SettingWithCopyWarning.
x = df[["Latitude", "Longitude"]][:1000].copy()
x["label"] = l
# String labels make the plot treat clusters as discrete categories
x = x.astype({"label": str})
ut.geo.plotScatterOnMap(x, "Latitude", "Longitude", {"color": "label"})

## ST-DBSCAN

In [None]:
from models.unsupervised.stDbscan import StDbscan
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Output folder for every ST-DBSCAN figure saved by the cells below.
stdbscanPath = os.path.join(resultsPath, "st-dbscan")
# exist_ok makes the cell idempotent without a separate (racy) existence check
os.makedirs(stdbscanPath, exist_ok=True)
print(stdbscanPath)

In [None]:
# Build a "time" column holding each incident's time of day as epoch seconds.
# .copy() gives an independent frame, so the column assignments below do not act on a slice of df.
timeDf = df[['OCCUR_MONTH', 'OCCUR_DATE', 'OCCUR_HOUR', 'OCCUR_MINUTE', 'OCCUR_SECOND']].copy()
# Zero-pad the clock components to two characters
for clockColumn in ["OCCUR_HOUR", "OCCUR_MINUTE", "OCCUR_SECOND"]:
    timeDf[clockColumn] = timeDf[clockColumn].apply(lambda x: prependZeros(str(x), 2))

# timeDf["time"] = timeDf.apply(lambda row: pd.to_datetime(f"2020-{row[0]}-{row[1]} {row[2]}:{row[3]}:{row[4]}"), axis=1)
# Columns are read by label: integer keys on a label-indexed row are deprecated positional access.
# The parse stays row-by-row on purpose, so each "HH:MM:SS" string is interpreted exactly as before.
timeDf["time"] = timeDf.apply(
    lambda row: pd.to_datetime(f"{row['OCCUR_HOUR']}:{row['OCCUR_MINUTE']}:{row['OCCUR_SECOND']}"),
    axis=1)

# Convert to second
timeDf["time"] = pd.to_datetime(timeDf['time']).astype(np.int64) / 10**9 

# To revert back, use line below
# pd.to_datetime(timeDf["time"], unit='s')

In [None]:
def convertToSeconds(value: float, unit: str) -> float:
    ''' Convert a duration expressed in `unit` into seconds.

    Args:
        value: The duration, measured in `unit`.
        unit: One of "day", "hour" or "minute".

    Returns:
        The duration in seconds.

    Raises:
        KeyError: If `unit` is not one of the supported units.
    '''
    minute = 60
    secondsByUnit = {
        "day": minute * 60 * 24,
        "hour": minute * 60,
        "minute": minute,
    }
    return secondsByUnit[unit] * value

def timeDistanceMetric(s1: list, s2: list) -> float:
    ''' Euclidean distance, in seconds, between two equally long sequences of timestamps.

    Args:
        s1: Sequence of pd.Timestamp values.
        s2: Sequence of pd.Timestamp values, same length as s1.

    Returns:
        sqrt(sum((s1[k] - s2[k]).total_seconds() ** 2)); 0.0 for two empty sequences.

    Raises:
        ValueError: If the sequences differ in length.
    '''
    if (len(s1) != len(s2)):
        raise ValueError("Lenght of s1 and s2 must be equal")
        
    import math

    squaredSum = sum(((i - j).total_seconds())**2 for i, j in zip(s1, s2))
    
    # sqrt(0) is 0.0, so the empty case needs no special handling and always yields a float
    return math.sqrt(squaredSum)

In [None]:
def getTimeDf(dataFrame: pd.DataFrame, timeFeatures: list, allTimeFeatures: list) -> pd.Series:
    ''' Build one timestamp per row from the selected time-component columns.

    Every timestamp is anchored in the year 2021. A component that is listed in
    allTimeFeatures but not selected in timeFeatures is replaced by a constant:
    "01" for OCCUR_MONTH / OCCUR_DATE and "00" for the hour and the remaining
    (minute/second style) components.

    Args:
        dataFrame: Frame containing every column named in allTimeFeatures. It is not modified.
        timeFeatures: The components to take from the data.
        allTimeFeatures: All components in timestamp order (month, date, hour, then
            the ":"-separated components).

    Returns:
        A Series of timestamps indexed like dataFrame.

    Raises:
        Exception: If the assembled strings cannot be parsed as timestamps; the
            underlying error is attached as the cause.
    '''
    timeStampFormat = "2021"
    indices = []
    
    for position, feature in enumerate(allTimeFeatures):
        if (feature == "OCCUR_MONTH" or feature == "OCCUR_DATE"):
            separator, constant = "-", "01"
        elif (feature == "OCCUR_HOUR"):
            separator, constant = " ", "00"
        else:
            separator, constant = ":", "00"
        
        if (feature in timeFeatures):
            # len(indices) is the next free placeholder number in the format string
            timeStampFormat += f"{separator}{{{len(indices)}}}"
            indices.append(position)
        else:
            timeStampFormat += separator + constant
    
    # astype(str) builds a new frame, so the caller's data is never written to.
    # zfill pads to two characters (identical to prependZeros for unsigned values — presumably
    # all these columns are non-negative; verify if that ever changes).
    timeDf = dataFrame[allTimeFeatures].astype(str).apply(lambda column: column.str.zfill(2))
    
    try:
        # .iloc: `indices` are column positions; plain [] with integer keys on a
        # label-indexed row is deprecated positional access
        timeSeries = timeDf.apply(
            lambda row: pd.to_datetime(timeStampFormat.format(*row.iloc[indices])), axis=1)
    except Exception as ex:
        raise Exception("Invalid selections of time feature") from ex
        
    return timeSeries

# getTimeDf(df, [ "OCCUR_HOUR"], ["OCCUR_MONTH", "OCCUR_DATE", "OCCUR_HOUR", "OCCUR_MINUTE", "OCCUR_SECOND"])

In [None]:
def sumTimeIntervalSeries(series, hoursPerInterval=6):
    ''' Sum a timestamp-indexed series of counts into consecutive hour intervals of one day.

    The day 2021-01-01 is split into intervals of hoursPerInterval hours, with a shorter
    final interval when 24 is not a multiple of hoursPerInterval. Each interval is
    half-open, [start, stop), so every instant of the day belongs to exactly one
    interval. (Closing an interval at HH:59:00 would lose the last 59 seconds of it.)
    Entries whose timestamp is not on 2021-01-01 are not counted in any interval.

    Args:
        series: Values indexed by pd.Timestamp, e.g. the value_counts() of a timestamp column.
        hoursPerInterval: Interval width in hours.

    Returns:
        A Series keyed "<startHour>-<stopHour>" (two-digit hours, e.g. "06-12"),
        sorted by key, holding the summed values of each interval.
    '''
    numHours = 24
    
    boundaries = np.arange(0, numHours+1, hoursPerInterval)
    # Add hour 24 if it's not included
    if (boundaries[-1] < numHours):
        boundaries = np.append(boundaries, numHours)
    
    dayStart = pd.Timestamp("2021-01-01 00:00:00")
    data = {}
    
    for startHour, stopHour in zip(boundaries[:-1], boundaries[1:]):
        start = dayStart + pd.Timedelta(hours=float(startHour))
        stop = dayStart + pd.Timedelta(hours=float(stopHour))
        inInterval = np.array([start <= timestamp < stop for timestamp in series.index], dtype=bool)
        
        key = f"{str(startHour).zfill(2)}-{str(stopHour).zfill(2)}"
        data[key] = series[inInterval].sum()
        
    return pd.Series(data).sort_index()

#sumTimeIntervalSeries(stDf[stDf["label"] == "30"]["timestamp"].value_counts(), 7)

In [None]:
def getTimeIntervalDf(stDf, hoursPerInterval=6):
    ''' Tabulate, per cluster, how many points fall into each hour interval.

    Args:
        stDf: Frame with a "label" column (cluster id) and a "timestamp" column.
        hoursPerInterval: Interval width in hours, forwarded to sumTimeIntervalSeries.

    Returns:
        A frame with one row per cluster label (index named "cluster") and one
        column per interval.
    '''
    countsPerCluster = {}
    for clusterLabel in stDf.label.unique():
        clusterRows = stDf[stDf["label"] == clusterLabel]
        timestampCounts = clusterRows["timestamp"].value_counts()
        countsPerCluster[clusterLabel] = sumTimeIntervalSeries(timestampCounts, hoursPerInterval)

    # Clusters arrive as columns; transpose so each cluster becomes a row
    intervalDf = pd.DataFrame(countsPerCluster).T
    intervalDf.index.name = "cluster"
    return intervalDf

In [None]:
# Globals published by runStDbscan: the fitted model and the labelled data frame.
st = None
stDf = None
# Earth radius in km; dividing a distance in km by it gives the distance in radians
EarthRadiusKm = 6371
# Time-component columns, in the order in which they form a timestamp
AllTimeFeatures = ["OCCUR_MONTH", "OCCUR_DATE", "OCCUR_HOUR", "OCCUR_MINUTE", "OCCUR_SECOND"]

# Defined widgets
eps2UnitWidget = widgets.Dropdown(options=["day", "hour", "minute"], description="Time unit")
eps2Widget = widgets.FloatSlider(min=0.01, value=20, max=200, step=0.1, description="Eps2 (day)")
timeFeaturesWidget = widgets.SelectMultiple(options=AllTimeFeatures, 
                                            description="Temporal features", 
                                            style={'description_width': 'initial'})
def updateEps2Widget(change):
    ''' Keep the Eps2 slider's caption in sync with the unit currently chosen in the dropdown. '''
    eps2Widget.description = f"Eps2 ({change['owner'].value})"
    
# The observer must be registered before the value is set so the caption follows the unit
eps2UnitWidget.observe(updateEps2Widget)
eps2UnitWidget.value = "day"

@widgets.interact_manual
def runStDbscan(dataFrame=widgets.fixed(df),
                eps1=widgets.FloatSlider(min=0.1, value=1.5, max=5, step=0.01, description="Eps1 (km)"),
                timeFeatures=timeFeaturesWidget,
                eps2Unit=eps2UnitWidget,
                eps2=eps2Widget,
                minSamples=(2,20,1),
                saveFigure=False):
    ''' Run ST-DBSCAN on the incidents and show the labelled points on a map.

    The fitted model and the labelled frame are published through the globals
    `st` and `stDf`; the function itself returns nothing.

    Args:
        dataFrame: Incidents with Latitude, Longitude, WEEKDAY and the time-component columns.
        eps1: Spatial threshold in km (converted to radians for the model).
        timeFeatures: Time components used to build each point's timestamp.
        eps2Unit: Unit of eps2: "day", "hour" or "minute".
        eps2: Temporal threshold, expressed in eps2Unit.
        minSamples: Forwarded to StDbscan as minSamples.
        saveFigure: When True, also write the map to an HTML file under stdbscanPath.
    '''
    global st
    global stDf
    
    timeFeatures = list(timeFeatures)
    eps2Seconds = convertToSeconds(eps2, eps2Unit)
    print(f"{eps2Unit}: {eps2Seconds}")
    
    # .copy(): the columns added below must not be written to a slice of the caller's frame
    stDf = dataFrame[["Latitude", "Longitude", "WEEKDAY"]].copy()
    stDf["Latitude_radian"] = np.radians(stDf.Latitude)
    stDf["Longitude_radian"] = np.radians(stDf.Longitude)
    stDf["timestamp"] = getTimeDf(dataFrame, timeFeatures, AllTimeFeatures)
    stDf["timestamp_str"] = stDf["timestamp"].apply(str)
    # Nanoseconds since the epoch -> seconds, the unit eps2 is expressed in
    stDf["timestamp_second"] = pd.to_datetime(stDf['timestamp']).astype(np.int64) / 10**9 
    
    st = StDbscan(eps1=eps1/EarthRadiusKm, 
                  eps2=eps2Seconds, 
                  minSamples=minSamples)
    
    labels = st.fitPredict(stDf, ["Latitude_radian", "Longitude_radian"], ["timestamp_second"])

    # Convert label to string to display on plot legend
    stDf["label"] = [prependZeros(str(label), 2) for label in labels]
    stDf.sort_values(by="label", ascending=True, inplace=True)
    
    # -1 marks noise. Count the real clusters explicitly: len(set(labels)) - 1 is one
    # too low whenever a run produces no noise at all.
    noiseCount = sum(1 for label in labels if label == -1)
    clusterCount = len({label for label in labels if label != -1})
    
    figTitle = f"ST-DBSCAN - {timeFeatures} - # clusters = {clusterCount} - # noise samples = {noiseCount}<br>" + \
               f"Eps1: {eps1} (km) - Eps2: {eps2} ({eps2Unit}) - MinSamples: {minSamples}"
               
    fig = ut.geo.plotScatterOnMap(stDf, "Latitude", "Longitude", 
                                  {"color": "label", "hover_data": ["timestamp_str"]},
                                  {"title_text": figTitle})
    
    display(fig)
    if (saveFigure):
        figFilePath = os.path.join(stdbscanPath, f"st-dbscan_{eps1}_{eps2}_{minSamples}.html")
        ut.geo.plotly.offline.plot(fig, filename=figFilePath, auto_open=False)
        print("Saved to", figFilePath)

In [None]:
# StDbscan is already imported earlier in this section; the import is repeated here.
from models.unsupervised.stDbscan import StDbscan
# Programmatic run: eps1 = 0.9 km, eps2 = 3.51 hours on the hour+minute timestamp,
# minSamples = 5, figure not saved. The result lands in the globals st / stDf.
runStDbscan(df, 0.9, ["OCCUR_HOUR", "OCCUR_MINUTE"], "hour", 3.51, 5, False)

In [None]:
@widgets.interact_manual
def plotTime(dataFrame=widgets.fixed(stDf), 
             labels=widgets.SelectMultiple(options=stDf["label"].unique()),
             saveFigure=False):
    ''' Plot, per cluster, how many points share each timestamp.

    Args:
        dataFrame: Labelled frame with "label" and "timestamp" columns.
        labels: Cluster labels to draw; every label is drawn when none is given.
        saveFigure: When True, also write the figure to "time_st-dbscan.html" under stdbscanPath.
    '''
    noSelection = labels is None or len(labels) == 0
    if (noSelection):
        labels = dataFrame["label"].unique()
    
    fig = go.Figure()
    for clusterLabel in sorted(labels):
        clusterTimestamps = dataFrame.loc[dataFrame["label"] == clusterLabel, "timestamp"]
        frequencies = clusterTimestamps.value_counts().sort_index()
        hoverTexts = [moment.strftime("%H:%M:%S") for moment in frequencies.index]
        fig.add_trace(go.Scatter(x=frequencies.index, y=frequencies.values,
                                 mode='lines+markers', name=clusterLabel,
                                 hovertemplate="<br>Hour: %{text}<br>Frequency: %{y}",
                                 text=hoverTexts))

    fig.update_layout({"dragmode": "pan"})
    fig.show(config={"scrollZoom": True})
    
    if (not saveFigure):
        return
    figFilePath = os.path.join(stdbscanPath, "time_st-dbscan.html")
    ut.geo.plotly.offline.plot(fig, filename=figFilePath, auto_open=False)
    print("Saved to", figFilePath)

In [None]:
# Timestamp-frequency lines for every cluster with at most 100 points; saveFigure=True also writes the HTML file.
plotTime(stDf, stDf.groupby(["label"]).filter(lambda x: len(x) <= 100).label.unique(), True)

In [None]:
@widgets.interact_manual
def plotBarTime(dataFrame=widgets.fixed(stDf),
                labels=widgets.SelectMultiple(options=stDf["label"].unique()),
                hoursPerInterval=(1,12,1),
                title="Interval Bar Chart",
                saveFigure=False,
                saveFigureName="time_bar_st-dbscan.html"):
    ''' Grouped bar chart of per-cluster point counts in each hour interval.

    Args:
        dataFrame: Labelled frame with "label" and "timestamp" columns.
        labels: Cluster labels to include; every label is included when none is given.
        hoursPerInterval: Interval width in hours.
        title: Figure title.
        saveFigure: When True, also write the figure to an HTML file under stdbscanPath.
        saveFigureName: File name used when saving.
    '''
    noSelection = labels is None or len(labels) == 0
    if (noSelection):
        labels = dataFrame["label"].unique()
    
    selectedDf = dataFrame[dataFrame["label"].isin(labels)]
    timeIntervalDf = getTimeIntervalDf(selectedDf, hoursPerInterval)
    intervalNames = [f"Interval {c}" for c in timeIntervalDf.columns]
    
    # One bar series per cluster row
    fig = go.Figure()
    for clusterName, intervalCounts in timeIntervalDf.iterrows():
        fig.add_trace(go.Bar(name=clusterName, x=intervalNames, y=intervalCounts))
    
    fig.update_layout(barmode='group', dragmode="pan", 
                      xaxis_title="Intervals", yaxis_title="Count",
                      title=title, legend_title_text="Clusters")
    fig.show(config={"scrollZoom": True})
    
    if (not saveFigure):
        return
    figFilePath = os.path.join(stdbscanPath, saveFigureName)
    ut.geo.plotly.offline.plot(fig, filename=figFilePath, auto_open=False)
    print("Saved to", figFilePath)

In [None]:
@widgets.interact_manual
def plotBarCol(dataFrame=widgets.fixed(stDf),
               column=allCatColumns,
               title="Interval Bar Chart",
               saveFigure=False,
               saveFigureName="time_bar_st-dbscan.html"):
    ''' Grouped bar chart of the value distribution of one df column within each cluster.

    Args:
        dataFrame: Labelled frame (a "label" column) whose index refers to rows of the global df.
        column: Column of df whose values are counted per cluster.
        title: Figure title.
        saveFigure: When True, also write the figure to an HTML file under stdbscanPath.
        saveFigureName: File name used when saving.
    '''
    fig = go.Figure()
    
    for label in dataFrame.label.unique():
        labelDf = dataFrame[dataFrame.label == label]
        # labelDf.index holds index *labels* of df, so they are resolved with .loc
        # (as the other cells do); .iloc would reinterpret them as row positions.
        series = df.loc[labelDf.index, column].value_counts()
        fig.add_trace(go.Bar(name=label, x=series.index, y=series.values))

    # The x axis shows the values of `column`, not time intervals
    fig.update_layout(barmode='group', dragmode="pan", 
                      xaxis_title=str(column), yaxis_title="Count",
                      title=title, legend_title_text="Clusters")
    fig.show(config={"scrollZoom": True})
    
    if (saveFigure):
        figFilePath = os.path.join(stdbscanPath, saveFigureName)
        ut.geo.plotly.offline.plot(fig, filename=figFilePath, auto_open=False)
        print("Saved to", figFilePath)

In [None]:
# WEEKDAY distribution per cluster, limited to clusters with at most 100 points.
plotBarCol(stDf.groupby(["label"]).filter(lambda x: len(x) <= 100), "WEEKDAY")

In [None]:
# Map only the points of the clusters labelled "17", "05" and "02".
ut.geo.plotScatterOnMap(stDf[stDf.label.isin(["17", "05", "02"])], "Latitude", "Longitude", 
                        {"color": "label", "hover_data": ["timestamp_str"]})

In [None]:
def foo():
    ''' Mean and standard deviation of the 3-hour interval counts over clusters with at most 100 points. '''
    smallClusters = stDf.groupby(["label"]).filter(lambda group: len(group) <= 100)
    smallLabels = smallClusters.label.unique()
    intervalCounts = getTimeIntervalDf(stDf[stDf["label"].isin(smallLabels)], 3)
    #display(intervalCounts)
    return intervalCounts.mean(), intervalCounts.std()
foo()

In [None]:
def foo():
    ''' Standard deviation of the 3-hour interval counts over clusters with more than 100 points. '''
    largeClusters = stDf.groupby(["label"]).filter(lambda group: len(group) > 100)
    largeLabels = largeClusters.label.unique()
    intervalCounts = getTimeIntervalDf(stDf[stDf["label"].isin(largeLabels)], 3)
    #display(intervalCounts)
    return intervalCounts.std()
foo()

In [None]:
# 3-hour interval bar chart for the clusters with at most 100 points.
plotBarTime(stDf,
            stDf.groupby(["label"]).filter(lambda x: len(x) <= 100).label.unique(), 
            3, "Clusters with count <= 100")

In [None]:
# 3-hour interval bar chart for the clusters with more than 100 points.
plotBarTime(stDf,
            stDf.groupby(["label"]).filter(lambda x: len(x) > 100).label.unique(), 
            3, "Clusters with count > 100")

In [None]:
@widgets.interact_manual
def intervalClusters(dataFrame=widgets.fixed(stDf), hoursPerInterval=(2,12,1), threshold=(0.5, 1.0, 0.05)):
    ''' Tag every cluster with its dominant hour interval and map the result.

    For each cluster except the noise label "-1", the interval holding the most
    points is found. When that interval holds at least `threshold` of the cluster's
    counted points the cluster is tagged with the interval name, otherwise with
    "DIVERSE". Points of clusters that are not processed keep "UNKNOWN".

    Args:
        dataFrame: Labelled frame with "label", "timestamp", "timestamp_str",
            "Latitude" and "Longitude" columns.
        hoursPerInterval: Interval width in hours.
        threshold: Minimum share of the dominant interval.

    Returns:
        The number of points per interval tag.
    '''
    resultDf = dataFrame.copy(True)
    resultDf["interval"] = "UNKNOWN"
    timeIntervalDf = getTimeIntervalDf(resultDf, hoursPerInterval)
    timeIntervalDf = timeIntervalDf[~timeIntervalDf.index.isin(["-1"])]
    
    ratios = []
    for clusterLabel, intervalCounts in timeIntervalDf.iterrows():
        largestItem = intervalCounts.nlargest(1)
        # .iloc[0]: the series is keyed by interval name, so [0] would be a deprecated positional lookup
        interval, count = str(largestItem.index[0]), largestItem.iloc[0]
        ratio = count / intervalCounts.sum()
        if (ratio < threshold):
            interval = "DIVERSE"
        
        # Each label occurs once in timeIntervalDf, so a single masked assignment tags
        # exactly the rows that still carry "UNKNOWN" for this cluster
        resultDf.loc[resultDf["label"] == clusterLabel, "interval"] = interval
        ratios.append(ratio)
        
    series = pd.Series(ratios)
    mean, std = series.mean(), series.std()
    
    points = go.Scatter(x=series.index, y=series.values, mode='markers', name="ratios")
    meanLine = go.Scatter(x=series.index, y=[mean] * len(series), mode='lines', name="mean")
    thresholdLine = go.Scatter(x=series.index, y=[threshold] * len(series), mode='lines', name="threshold")
    statFig = go.Figure([points, meanLine, thresholdLine])
    # Shade the bands of 1, 2 and 3 standard deviations around the mean
    for i in range(1, 4):
        plusStd, minusStd = mean+(i*std), mean-(i*std)
        statFig.add_hrect(y0=plusStd, y1=minusStd, 
                          annotation_text=f"std {i} +/-{i*std}", fillcolor=colors[i], 
                          opacity=0.25, line_width=0)
    statFig.update_layout(title=f"% of frequency of largest interval. Mean={mean}, Std={std}",
                          dragmode="pan",
                          xaxis_title="Clusters",
                          yaxis_title="Percentage")
    statFig.show(config={"scrollZoom": True})
    
    intervals = sorted(resultDf["interval"].unique())
    fig = ut.geo.plotScatterOnMap(resultDf, "Latitude", "Longitude", 
                                  {"color": "interval", "hover_data": ["timestamp_str", "label"], 
                                   "category_orders": {"interval": intervals}},
                                  {"title_text": "Interval ST-DBSCAN"})
    fig.show()
    return resultDf["interval"].value_counts()

In [None]:
@widgets.interact_manual
def characClusters(dataFrame=widgets.fixed(stDf), column=allCatColumns, threshold=(0.5, 1.0, 0.05)):
    ''' Tag every cluster with the dominant value of one df column and map the result.

    For each cluster except the noise label "-1", the most frequent value of
    `column` among the cluster's rows of the global df is found. When it covers at
    least `threshold` of the cluster, the cluster's points are tagged with that
    value, otherwise with "DIVERSE". Noise points keep "UNKNOWN".

    Args:
        dataFrame: Labelled frame whose index refers to rows of the global df.
        column: Column of df to characterise the clusters with.
        threshold: Minimum share of the dominant value.
    '''
    resultDf = dataFrame.copy(True)
    resultDf["largest_value"] = "UNKNOWN"
    
    ratios = []
    for label in resultDf.label.unique():
        if (label == "-1"): continue
        
        indexes = resultDf[resultDf.label == label].index
        # `indexes` are index labels shared with df, so use .loc (not positional .iloc)
        clusterDf = df.loc[indexes]
        
        valueCounts = clusterDf[column].value_counts()
        largestItem = valueCounts.nlargest(1)
        # .iloc[0]: the series is keyed by the column's values, so [0] would look up the *label* 0
        value, count = str(largestItem.index[0]), largestItem.iloc[0]
        ratio = count / valueCounts.sum()
        # .loc accepts a whole index for assignment; .at only addresses a single cell
        resultDf.loc[indexes, "largest_value"] = value if (ratio >= threshold) else "DIVERSE"
        
        ratios.append(ratio)
        
    series = pd.Series(ratios)
    mean, std = series.mean(), series.std()
    
    points = go.Scatter(x=series.index, y=series.values, mode='markers', name="ratios")
    meanLine = go.Scatter(x=series.index, y=[mean] * len(series), mode='lines', name="mean")
    thresholdLine = go.Scatter(x=series.index, y=[threshold] * len(series), mode='lines', name="threshold")
    statFig = go.Figure([points, meanLine, thresholdLine])
    # Shade the bands of 1, 2 and 3 standard deviations around the mean
    for i in range(1, 4):
        plusStd, minusStd = mean+(i*std), mean-(i*std)
        statFig.add_hrect(y0=plusStd, y1=minusStd, 
                          annotation_text=f"std {i}", fillcolor=colors[i], 
                          opacity=0.25, line_width=0)
    statFig.update_layout(title=f"% of frequency of largest value. Mean={mean}, Std={std}",
                          xaxis_title="Clusters",
                          yaxis_title="Percentage")
    statFig.show()
    
    values = sorted(resultDf["largest_value"].unique())
    fig = ut.geo.plotScatterOnMap(resultDf, "Latitude", "Longitude", 
                                  {"color": "largest_value", "hover_data": ["timestamp_str", "label"], 
                                   "category_orders": {"largest_value": values}},
                                  {"title_text": f"{column} ST-DBSCAN"})
    fig.show()

In [None]:
def foo():
    ''' Show the 6-hour interval table of the clusters with at most 100 points and print each interval's total. '''
    smallClusters = stDf.groupby(["label"]).filter(lambda group: len(group) <= 100)
    intervalCounts = getTimeIntervalDf(smallClusters, 6)
    display(intervalCounts)
    for intervalName in intervalCounts.columns:
        print(intervalName, intervalCounts[intervalName].sum())
foo()

In [None]:
def foo():
    ''' Heat map of point counts per (cluster, weekday) for the clusters with at most 100 points. '''
    smallClusters = stDf.groupby(["label"]).filter(lambda group: len(group) <= 100)
    groupedRows = smallClusters.groupby(["label", "WEEKDAY"]).groups

    countsPerCluster = {}
    for label in smallClusters.label.unique():
        # Collect this cluster's weekday -> number of points
        weekdayCounts = {}
        for (groupLabel, weekday), rows in groupedRows.items():
            if (groupLabel == label):
                weekdayCounts[weekday] = len(rows)
        
        countsPerCluster[label] = pd.Series(weekdayCounts, index=weekdayCounts.keys()).sort_index()
            
    # Clusters as rows, weekdays as columns; a weekday absent from a cluster counts as 0
    heatDf = pd.DataFrame(countsPerCluster).T
    heatDf.index.name = "cluster"
    heatDf.fillna(0, inplace=True)
        
    fig = px.imshow(heatDf,
                    labels=dict(x="Weekday", y="Clusters", color="Frequency"),
                    x=heatDf.columns,
                    y=[f"C {c}" for c in heatDf.index],
                    width=900, aspect="auto",
                    height=800,
                    color_continuous_scale='RdBu_r')
    fig.update_xaxes(side="top")
    fig.show()
    
foo()

In [None]:
def foo():
    ''' Heat map of point counts per (cluster, 2-hour interval) for the clusters with at most 100 points. '''
    smallClusters = stDf.groupby(["label"]).filter(lambda group: len(group) <= 100)
    intervalCounts = getTimeIntervalDf(smallClusters, 2)
    display(intervalCounts.shape)
    
    intervalNames = [f"H_{c}" for c in intervalCounts.columns]
    clusterNames = [f"C_{c}" for c in intervalCounts.index]
    fig = px.imshow(intervalCounts,
                    labels=dict(x="Intervals", y="Clusters", color="Frequency"),
                    x=intervalNames,
                    y=clusterNames,
                    width=900, aspect="auto",
                    height=800,
                    color_continuous_scale='RdBu_r')
    fig.update_xaxes(side="top")
    fig.show()
    
foo()

In [None]:
@widgets.interact_manual
def knnGraph(k=(2,10,1), feature=["spatial", "temporal"]):
    ''' Plot the sorted k-th nearest-neighbour distances (k-distance graph) to help choose eps.

    Args:
        k: Number of neighbours requested from NearestNeighbors. The query points
            are the training points themselves.
        feature: "spatial" measures haversine distance on latitude/longitude and
            reports km; anything else measures the distance between the
            hour+minute timestamps and reports hours.
    '''
    # DistanceMetric was imported here but never used; only NearestNeighbors is needed
    from sklearn.neighbors import NearestNeighbors
    
    timeFeatures = ["OCCUR_HOUR", "OCCUR_MINUTE"]
    # .copy(): the columns added below must not be written to a slice of df.
    # The frame is local to this function; the global stDf is left untouched.
    knnDf = df[["Latitude", "Longitude"]].copy()
    knnDf["Latitude_radian"] = np.radians(knnDf.Latitude)
    knnDf["Longitude_radian"] = np.radians(knnDf.Longitude)
    knnDf["timestamp"] = getTimeDf(df, timeFeatures, AllTimeFeatures)
    # Nanoseconds since the epoch -> seconds
    knnDf["timestamp_second"] = pd.to_datetime(knnDf['timestamp']).astype(np.int64) / 10**9 
    
    if (feature == "spatial"):
        x = knnDf[["Latitude_radian", "Longitude_radian"]]
        neigh = NearestNeighbors(n_neighbors=k, algorithm="ball_tree", metric="haversine")
    else:
        x = knnDf[["timestamp_second"]]
        neigh = NearestNeighbors(n_neighbors=k, algorithm="kd_tree", metric="minkowski")
    
    distances, indices = neigh.fit(x).kneighbors(x)
    # Sort every neighbour column ascending and keep the farthest (last) one
    distances = np.sort(distances, axis=0)[:, -1]
    
    if (feature == "spatial"):
        # Haversine distances are in radians; scale by the Earth radius to get km
        distances = [d * EarthRadiusKm for d in distances]
    else:
        # Seconds -> hours
        distances = [d / 3600 for d in distances]
        
    fig = px.line(distances)
    fig.show(config={"scrollZoom": True})

In [None]:
# Map five incidents picked by their index labels, with the timestamp shown on hover.
ut.geo.plotScatterOnMap(stDf.loc[[2668,797,2161,1743,966]], "Latitude", "Longitude", {"hover_data": ["timestamp_str"]})

In [None]:
# Print the size of every cluster, smallest first (one number per line, without the label).
for c in stDf.groupby(["label"])["label"].count().sort_values():
    print(c)

In [None]:
# Earliest and latest timestamp of each cluster.
stDf.groupby(["label"]).agg({"timestamp": ["min", "max"]})

In [None]:
# Points of the clusters with at most 100 points ...
x = stDf.groupby(["label"]).filter(lambda x: len(x) <= 100)
# ... restricted to those whose timestamp hour lies between 6 and 18 (both inclusive)
x[x.apply(lambda x: x["timestamp"].hour >= 6 and x["timestamp"].hour <= 18, axis=1)]

In [None]:
# runStDbscan has no return value (binding its result would leave x as None);
# it publishes the labelled frame through the global stDf, so read it from there.
runStDbscan(df, 1.7, ["OCCUR_DATE", "OCCUR_HOUR", "OCCUR_MINUTE"], "hour", 12, 4, False)
x = stDf

In [None]:
# Cluster under inspection; `l` is reused by the following cells.
l = "15"
# Index labels of that cluster's points, used to pull the full rows out of df
indices = x[x.label == l].index
# Distribution of OCCUR_DATE within the cluster
barChart(df.loc[indices], hue=None, exportPath="", column="OCCUR_DATE", width=10, height=5, sortBy="label")

In [None]:
# Line plot of how many points of cluster `l` share each timestamp.
x[x.label == l].timestamp.value_counts().plot()

In [None]:
# OCCUR_DATE x OCCUR_HOUR frequency heat map for cluster `l`; the axes list every
# date/hour value present in `data`, not only those of the cluster.
ut.heatmap.hv.extension("bokeh")
indices = x[x.label == l].index
ut.heatmap.freqHeatMap(df.loc[indices], "OCCUR_DATE", "OCCUR_HOUR", data["OCCUR_DATE"].unique(), data["OCCUR_HOUR"].unique())

In [None]:
import plotly.express as px

# 3-D view (latitude, longitude, hour of day) of cluster "22".
# .copy(): the filter returns a slice of x, and adding a column to a slice
# triggers pandas' SettingWithCopyWarning.
testDf = x[x.label == "22"].copy()
testDf['hour'] = testDf['timestamp'].apply(lambda timestamp: timestamp.hour)
fig = px.scatter_3d(testDf, x="Latitude", y="Longitude", z="hour", color="hour")
fig.show()

In [None]:
# Distribution of OCCUR_HOUR within cluster "02".
indices = x[x.label == "02"].index
barChart(df.loc[indices], hue=None, exportPath="", column="OCCUR_HOUR", width=10, height=7, sortBy="label")

In [None]:
# Experiment with a different ST-DBSCAN implementation on projected coordinates.
# NOTE(review): `utm` and `STDBSCAN` are not imported in the visible part of the notebook — confirm they are in scope.
# NOTE(review): this cell rebinds the globals x, st and l that earlier cells rely on.
# lats = np.radians(df.Latitude).values
# lons = np.radians(df.Longitude).values
# presumably the first two return values are the projected easting/northing — verify against the utm package
lats, lons, _, _ = utm.from_latlon(df.Latitude.values, df.Longitude.values)
# Only the hour is kept; each value is parsed from an "<hour>:00:00" string
time = df.OCCUR_HOUR.apply(lambda x: pd.to_datetime(f"{x}:00:00"))
x = pd.DataFrame({"lat": lats, "lon": lons, "time": time})


# NOTE(review): units of the thresholds depend on the STDBSCAN implementation — TODO confirm
st = STDBSCAN(spatial_threshold=500.0, temporal_threshold=60.0, min_neighbors=15)
l = st.fit_transform(x, "lat", "lon", "time")

## KModes Analysis

In [None]:
from kmodes.kmodes import KModes
import plotly.express as px

In [None]:
# Output folder for every KModes artefact (centroids, cluster assignments, figures).
kmodesPath = os.path.join(resultsPath, "kmodes")
# exist_ok makes the cell idempotent without a separate (racy) existence check
os.makedirs(kmodesPath, exist_ok=True)
print(kmodesPath)

In [None]:
# Features the KModes cells cluster on
kmodesColumns = ['OCCUR_MONTH',
                 'OCCUR_HOUR',
                 'postcode',
                 'WEEKDAY']
# Passed as n_init to every KModes run below
nInit = 15

### Elbow method

In [None]:
@widgets.interact_manual
def kmodesElbowPlot(dataFrame=widgets.fixed(df[kmodesColumns]),
                    numClusters=(1, 20, 1), saveFigure=False):
    ''' Overlay the elbow curves of KModes for the "Cao", "Huang" and "random" initialisations.

    Args:
        dataFrame: Categorical features to cluster.
        numClusters: Forwarded to ut.cluster.elbowMethod.
        saveFigure: When True, also write the overlay to "kmodes_elbowMethod.html" under kmodesPath.

    Returns:
        The overlaid elbow plot.
    '''
    def elbowFor(initMethod):
        # The initialisation name doubles as the curve's label
        return ut.cluster.elbowMethod(dataFrame, numClusters, KModes, initMethod,
                                      init=initMethod, n_init=nInit)

    caoCurve = elbowFor("Cao")
    huangCurve = elbowFor("Huang")
    randomCurve = elbowFor("random")
    
    overlay = caoCurve * huangCurve * randomCurve
    elbowPlots = overlay.opts(width=800, height=400, xlabel="Number of cluster",
                              ylabel="WCSS", title="Elbow Method")

    # Save the elbow method plot
    if (saveFigure):
        elbowPlotFilePath = os.path.join(kmodesPath, "kmodes_elbowMethod.html")
        ut.cluster.hv.save(elbowPlots, elbowPlotFilePath, toolbar=True)
        print(f"Saved to {elbowPlotFilePath}")
    
    return elbowPlots

### Run KModes

In [None]:
centroidsDf = None
combinedDf = None
clusterLabels = None

@widgets.interact_manual
def loadCentroids(dataFrame=widgets.fixed(df[kmodesColumns]), kmodeInit=["Huang", "Cao", "random"],
                  numClusters=(1, 10, 1), loadFromFile=True, saveDfs=False):
    ''' Load a saved KModes clustering, or fit a new one, and publish it through globals.

    Sets the globals centroidsDf (one row per centroid), combinedDf (the features
    plus a "cluster" column) and clusterLabels (the sorted centroid index).

    Args:
        dataFrame: Categorical features to cluster.
        kmodeInit: KModes initialisation method; ignored when loadFromFile is True.
        numClusters: Number of clusters to fit; ignored when loadFromFile is True.
        loadFromFile: Read "kmodes_centroids.csv" / "kmodes_clusters.csv" from
            kmodesPath instead of fitting.
        saveDfs: After fitting, write both CSV files to kmodesPath.

    Returns:
        centroidsDf.
    '''
    global centroidsDf
    global combinedDf
    global clusterLabels

    centroidsFilePath = os.path.join(kmodesPath, "kmodes_centroids.csv")
    combinedFilePath = os.path.join(kmodesPath, "kmodes_clusters.csv")
    
    if (loadFromFile):
        # Load data from saved files
        centroidsDf = pd.read_csv(centroidsFilePath, header=0, sep=",", dtype=str)
        centroidsDf["Number of Samples"] = pd.to_numeric(centroidsDf["Number of Samples"])

        clusterDf = pd.read_csv(combinedFilePath, index_col=0, header=0, sep=",")
        # .copy(): the "cluster" column must not be assigned on a selection of dataFrame
        combinedDf = dataFrame.loc[clusterDf.index].copy()
        combinedDf["cluster"] = clusterDf["cluster"]
    else:
        km = KModes(n_clusters=numClusters, init=kmodeInit, n_init=nInit, verbose=False)
        preds = km.fit_predict(dataFrame)
        centroidsDf = ut.cluster.createCentroidDf(km.cluster_centroids_, dataFrame.columns, preds)

        combinedDf = dataFrame.copy(True)
        combinedDf["cluster"] = preds
        
        if (saveDfs):
            centroidsDf.to_csv(centroidsFilePath, index=False)
            print(f"Saved centroid to {centroidsFilePath}")
            
            # Only the assignment is stored; the features are re-joined from dataFrame on load
            combinedDf.to_csv(combinedFilePath, columns=["cluster"])
            print(f"Saved clusters to {combinedFilePath}")

    clusterLabels = sorted(centroidsDf.index)
        
    return centroidsDf

### Frequent Patterns

In [None]:
# NOTE(review): the default clusterLabel=clusterLabels is evaluated when this cell runs;
# clusterLabels starts as None and is only filled by loadCentroids, so run that first.
@widgets.interact
def foo(clusterLabel=clusterLabels):
    # Mine frequent patterns inside the chosen cluster through a manually triggered widget
    clusterDf = combinedDf[combinedDf["cluster"] == clusterLabel]
    return widgets.interactive(frequentPatterns, {'manual': True}, dataFrame=widgets.fixed(clusterDf))

In [None]:
# Direct call on cluster 1. NOTE(review): `columns` is not defined in the visible part of the notebook — confirm it is in scope.
frequentPatterns(combinedDf[combinedDf["cluster"] == 1], columns, "0.16", "fpgrowth", 4)

### Cluster Cardinality vs. Magnitude

In [None]:
# NOTE(review): widgets.fixed(combinedDf) captures the value combinedDf has when this cell
# runs; combinedDf starts as None and is only filled by loadCentroids.
@widgets.interact_manual
def kmodesCardinalityVsMagnitude(dataFrame=widgets.fixed(combinedDf), saveFigure=False):
    ''' Draw cluster cardinality, cluster magnitude and their scatter plot in three stacked axes.

    Cardinality is the number of samples per cluster. Magnitude is the sum, over a
    cluster's samples, of ut.cluster.dissimilarDistance between the sample and the
    cluster's centroid on kmodesColumns.

    Args:
        dataFrame: Features plus a "cluster" column.
        saveFigure: When True, also write the figure to
            "kmodes_cardinality_vs_magnitude.png" under kmodesPath.
    '''
    fig, axes = plt.subplots(3, 1, figsize=(20, 18))

    # Cardinality
    cardinalities = dataFrame["cluster"].value_counts().sort_index()
    cardinalities.plot.bar(ax=axes[0])
    axes[0].set_xlabel("Cluster")
    axes[0].set_ylabel("Number of samples")
    axes[0].set_title("Cluster Cardinality")

    # Magnitude
    clusterMagnitudes = dict([(cluster, 0.0) for cluster in centroidsDf.index])
    for i, r in dataFrame.iterrows():
        cluster = r["cluster"]
        # The cluster id is used as the row position of its centroid
        clusterMagnitudes[cluster] += ut.cluster.dissimilarDistance(r[kmodesColumns], 
                                                                    centroidsDf.iloc[cluster][kmodesColumns])

    magnitudeDf = pd.DataFrame(clusterMagnitudes.items(), columns=["cluster", "magnitude"])
    magnitudes = magnitudeDf["magnitude"]

    magnitudeDf.plot.bar("cluster", color="green", ax=axes[1])
    axes[1].set_xlabel("Cluster")
    axes[1].set_ylabel("Cluster magnitude")
    axes[1].set_title("Cluster Magnitude")

    # Plot cardinality vs magnitude
    # Marker area follows the sample count stored with each centroid
    clusterSizes = [centroidsDf.iloc[i]["Number of Samples"] for i in range(len(centroidsDf))]

    # assumes cardinalities (sorted by cluster id) and magnitudes (in centroidsDf.index order)
    # list the clusters in the same order — TODO confirm
    axes[2].scatter(cardinalities, magnitudes, s=clusterSizes)
    axes[2].set_title("Cluster Cardinality vs Magnitude")
    axes[2].set_xlabel("Cardinality")
    axes[2].set_ylabel("Magnitude")

    # Plot best fit line
    x = np.unique(cardinalities)
    y = np.poly1d(np.polyfit(cardinalities, magnitudes, 1))(np.unique(cardinalities))
    axes[2].plot(x, y, label="Best fit line", color="blue")
    axes[2].legend()

    # Annotate each cluster point with its coordinate
    # s is a small offset so the text does not sit on top of the marker
    s = 1
    for i, cluster in enumerate(sorted(centroidsDf.index)):
        axes[2].annotate(cluster, xy=(cardinalities[i]+s, magnitudes[i]+s))
        
    # Save the figure
    if (saveFigure):
        cardMagFilePath = os.path.join(kmodesPath, "kmodes_cardinality_vs_magnitude.png")
        fig.savefig(cardMagFilePath)
        print(f"Saved to {cardMagFilePath}")

### Plot clusters on map

#### Plot points on map

In [None]:
@widgets.interact_manual
def kmodesPlotPointsOnMap(savePlot=False):
    ''' Scatter every clustered incident on a map, coloured by its KModes cluster.

    Args:
        savePlot: When True, also write the figure to "kmodes_points.html" under kmodesPath.
    '''
    # Coordinates come from df and are aligned with combinedDf on the index
    pointsDf = combinedDf.assign(Latitude=df.Latitude, Longitude=df.Longitude)
    pointsDf["cluster"] = pointsDf["cluster"].map(lambda clusterId: f"Cluster {clusterId}")
    
    fig = px.scatter_mapbox(pointsDf, lat="Latitude", lon="Longitude", zoom=10, color="cluster",
                            hover_name="cluster", hover_data=pointsDf.columns)
    layoutOptions = {"title_text": 'KModes Clusters',
                     "geo_scope": 'usa',
                     "margin": {"r":0,"l":0,"b":0},
                     "mapbox_style": "carto-darkmatter"}
    fig.update_layout(**layoutOptions)
    fig.show()
    
    if (not savePlot):
        return
    plotFilePath = os.path.join(kmodesPath, "kmodes_points.html")
    ut.geo.plotly.offline.plot(fig, filename=plotFilePath, auto_open=False)
    print(f"Saved to {plotFilePath}")

#### Plot precinct choropleths

In [None]:
# clusterDf = combinedDf[combinedDf["cluster"] == 1]

# # Get the precincts from the shooting crime data
# precincts = dict(clusterDf["PRECINCT"].value_counts())
# precinctsDf = pd.DataFrame(precincts.items(), columns=["precinct", "count"])

# f = px.choropleth_mapbox(precinctsDf, 
#                              geojson=geoJson, 
#                              locations="precinct", 
#                              color="count",
#                              mapbox_style="carto-darkmatter",
#                              center={"lat": newYorkCoord[0], "lon": newYorkCoord[1]})
# f.update_geos(fitbounds="locations", visible=True)
# f.update_layout(margin={"r":0,"l":0,"b":0}, title_text="Test")

# newFig.add_traces([f])



# newFig.add_choroplethmapbox(geojson=geoJson, locations=['73', '42'], z=[12, 45],
#                             coloraxis="coloraxis", hoverinfo="all", name="Cluster 2",
#                             showlegend=True,
#                             text="text param")
                            #featureidkey="properties.precinct")

# newFig.add_choropleth(geojson=geoJson, locations=['75', '47'], z=[12, 30],
#                             featureidkey="properties.precinct")

In [None]:
@widgets.interact_manual
def kmodesPrecinctChoropleth(dataFrame=widgets.fixed(combinedDf), saveFigure=False):
    ''' Draw one choropleth per KModes cluster showing the number of incidents per postcode.

    Args:
        dataFrame: Features plus a "cluster" column.
        saveFigure: When True, also write every choropleth to
            "choropleths/choropleth_<i>.html" under kmodesPath.
    '''
    figs = []
    # Despite the function name, the regions drawn are postcodes
    feature = "postcode"
    for cluster in clusterLabels:
        clusterDf = dataFrame[dataFrame["cluster"] == cluster]
        # Cycle through the palette: loadCentroids allows up to 10 clusters but only
        # 7 colour scales exist, so a plain index would raise IndexError
        colorscale = clusterColorscales[cluster % len(clusterColorscales)]

        precinctsDf = pd.DataFrame(clusterDf[feature].value_counts().items(), columns=[feature, "count"])
        fig = ut.geo.plotPlotly(geoJsons[feature], precinctsDf, feature, "count", title=f"Cluster {cluster}", 
                                center={"lat": newYorkCoord[0], "lon": newYorkCoord[1]}, 
                                zoom=9, colorscale=colorscale, labels={'count':'number of incidents'})
        
        display(fig)
        figs.append(fig)
        
    # Save each choropleth figure
    if (saveFigure):
        choroplethsFolderPath = os.path.join(kmodesPath, "choropleths")
        os.makedirs(choroplethsFolderPath, exist_ok=True)
            
        for i, fig in enumerate(figs):
            choroplethFilePath = os.path.join(choroplethsFolderPath, f"choropleth_{i}.html")
            ut.geo.plotly.offline.plot(fig, filename=choroplethFilePath, auto_open=False)
            print("Saved to", choroplethFilePath)     

### Cramers V & Theil Uncertainty

In [None]:
@widgets.interact_manual
def kmodessCorrelation(correlationMethod=["Cramers V", "Theil Uncertainty"], 
                      width=(5, 100, 1), height=(5, 100, 1), saveFigure=False):
    ''' Draw one annotated correlation heatmap per KModes cluster, stacked vertically.

    correlationMethod: "Cramers V" selects the "cramersV" function, anything
                       else selects "theilU"
    width, height:     size of the whole matplotlib figure
    saveFigure:        when True, write the figure to
                       <kmodesPath>/kmodes_<correlationMethod>.png
    '''
    correlationFunc = "cramersV" if (correlationMethod == "Cramers V") else "theilU"
    numPlots = len(clusterLabels)
    # squeeze=False always returns a 2D array of axes; without it a single
    # cluster yields a bare Axes object and indexing it fails
    fig, axes = plt.subplots(numPlots, 1, figsize=(width, height), squeeze=False)

    # Construct a matrix of correlation values for each cluster
    for axis, label in zip(axes[:, 0], clusterLabels):
        # Select rows by the cluster label itself so the data always matches the title
        clusterDf = combinedDf[combinedDf["cluster"] == label]
        clusterCatDf = df.loc[clusterDf.index, allCatColumns]
        
        correlationDf = ut.metrics.getCorrelationDf(clusterCatDf, correlationFunc)
        sns.heatmap(correlationDf, annot=True, ax=axis)
        axis.set_title(f"Cluster {label}")
        
    fig.tight_layout()
    
    if (saveFigure):
        correlationFilePath = os.path.join(kmodesPath, f"kmodes_{correlationMethod}.png")
        fig.savefig(correlationFilePath)
        print("Saved to", correlationFilePath)

### Frequency Heatmap

In [None]:
@widgets.interact_manual
def kmodesHeatmap(dataFrame=widgets.fixed(df), 
                  clustersSeries=widgets.fixed(combinedDf["cluster"]),
                  c1=allCatColumns, c2=allCatColumns, 
                  width=(200, 1200, 50), height=(200, 1500, 50), numCols=(1, 3, 1), saveFigure=False):
    ''' Build a layout with one c1-by-c2 frequency heatmap per KModes cluster.

    dataFrame:      data frame holding the columns c1 and c2 (left unmodified)
    clustersSeries: cluster label of every row, aligned on the index of dataFrame
    c1, c2:         the two columns passed to freqHeatMap
    width, height:  size passed to every single heatmap
    numCols:        number of columns of the resulting layout
    saveFigure:     when True, write the layout to
                    <kmodesPath>/heatmap_frequency/Frequency_<c1>_by_<c2>.html

    Returns the layout.
    '''
    # Attach the labels to a copy: assigning the column on dataFrame itself
    # would silently add a "cluster" column to the caller's frame
    labelledDf = dataFrame.assign(cluster=clustersSeries)
    
    # Value lists come from the global `data` frame and are shared by every cluster
    commonArgs = [c1, c2, data[c1].unique(), data[c2].unique()]
    commonKwArgs = {"width": width, "height": height, "cmap": "magma", "tools": ["hover"]}
    
    def clusterHeatmap(label):
        ''' Frequency heatmap of the rows that carry the given cluster label '''
        clusterDf = labelledDf[labelledDf["cluster"] == label].reset_index(drop=True)
        return ut.heatmap.freqHeatMap(clusterDf, title=f"Cluster {label}", *commonArgs, **commonKwArgs)
    
    # Add each cluster heatmap to layout object. The loop variable already is a
    # label, so it must not be used to index clusterLabels a second time
    layout = clusterHeatmap(clusterLabels[0])
    for label in clusterLabels[1:]:
        layout += clusterHeatmap(label)
     
    # Arrange the layout in the requested number of columns
    layout = layout.opts(title=f"Frequency {c1} by {c2}").cols(numCols)
    
    if (saveFigure):
        heatmapFolderPath = os.path.join(kmodesPath, "heatmap_frequency")
        os.makedirs(heatmapFolderPath, exist_ok=True)
            
        heatmapFilePath = os.path.join(heatmapFolderPath, f"Frequency_{c1}_by_{c2}.html")
        ut.heatmap.hv.save(layout, heatmapFilePath)
        print("Saved to", heatmapFilePath)

    return layout

In [None]:
@widgets.interact_manual
def saveHeatmapsWidget():
    ''' Hand kmodesHeatmap and a fixed set of plot settings to saveHeatmaps with saving enabled '''
    secondColumns = ["OCCUR_HOUR", "OCCUR_MONTH", "OCCUR_DATE", "WEEKDAY"]
    # Keyword arguments, listed in the order in which they are passed on
    heatmapOptions = {
        "columns": allCatColumns,
        "dataFrame": df,
        "clustersSeries": combinedDf["cluster"],
        "width": 700,
        "height": 1500,
        "saveFigure": True,
        "fixed2ndColumns": secondColumns,
        "numCols": 2,
    }
    saveHeatmaps(kmodesHeatmap, data, **heatmapOptions)

### Column Frequencies

In [None]:
@widgets.interact_manual
def kmodesColumnFreq(column=allCatColumns, width=(5, 25, 1), height=(5, 25, 1), saveFigure=False):
    ''' Call barChart for one column of df, using the KModes cluster labels as hue.

    column:        column handed to barChart
    width, height: figure size handed to barChart
    saveFigure:    forwarded to barChart together with kmodesPath as export path
    '''
    clusterSeries = combinedDf['cluster']
    # Copy of df that additionally carries the cluster label of every row
    labelledDf = df.assign(cluster=clusterSeries)
    barChart(labelledDf, hue=clusterSeries, exportPath=kmodesPath, column=column,
             width=width, height=height, sortBy="label", saveFigure=saveFigure)

In [None]:
@widgets.interact_manual
def saveKmodesColumnFreqs():
    ''' Run kmodesColumnFreq with saving enabled for every column in allCatColumns '''
    plotWidth, plotHeight = 50, 15
    for columnName in allCatColumns:
        kmodesColumnFreq(columnName, plotWidth, plotHeight, True)

### Decision Tree

In [None]:
from models.supervised import DecisionTree
from sklearn.model_selection import train_test_split

In [None]:
@widgets.interact_manual
def runDt(dataFrame=widgets.fixed(combinedDf),
          n=(1, 10, 1), depth=(0, 10, 1), splitFeatures=(1, combinedDf.shape[1]-1, 1),
          testSize=(0.1, 0.9, 0.1), saveFeatureImportance=False, saveGraph=False):
    ''' Train n decision trees that predict the "cluster" column and report each one.

    dataFrame:             data frame with the features and a "cluster" column
    n:                     number of trees, each trained on a fresh random split
    depth:                 passed to DecisionTree as maxDepth
    splitFeatures:         passed to DecisionTree as numberFeaturesToSplit
    testSize:              fraction of rows held out for testing
    saveFeatureImportance: when True, write all feature importances to
                           <kmodesPath>/dt/dt_feature_importance.csv
    saveGraph:             when True, save every tree as <kmodesPath>/dt/dt_<i>.png
    '''
    trainedTrees = []
    results = []
    for _ in range(n):
        # Create train and test samples
        xTrain, xTest, yTrain, yTest = train_test_split(dataFrame.drop("cluster", axis=1), 
                                                        dataFrame["cluster"], test_size=testSize)
        
        # Train and classify
        tree = DecisionTree(maxDepth=depth, numberFeaturesToSplit=splitFeatures)
        tree.train(xTrain, yTrain, quiet=True)
        predictions = tree.classify(xTest)
        errorPercent = ut.metrics.computeError(predictions['Prediction'], yTest) * 100
        
        display(tree.featureImportance)
        print(f"Misclassification error: {errorPercent:.2f}%")
        
        results.append((errorPercent, tree.featureImportance))
        trainedTrees.append(tree)
        
    dtFolderPath = os.path.join(kmodesPath, "dt")

    def ensureDtFolder():
        ''' Create the output folder unless it is already there '''
        if not os.path.exists(dtFolderPath):
            os.makedirs(dtFolderPath)

    # Save each feature importance in the same .csv file
    if saveFeatureImportance:
        ensureDtFolder()
            
        # Remove the existing file only if we want to save new data
        featImpFilePath = os.path.join(dtFolderPath, "dt_feature_importance.csv")
        if os.path.exists(featImpFilePath):
            os.remove(featImpFilePath)

        # Every run appends its table followed by its misclassification error
        for runError, importance in results:
            importance.to_csv(featImpFilePath, mode="a", index=False)
        
            with open(featImpFilePath, "a") as file:
                file.write(f"Misclassification error, {runError}%\n\n")

        print("Saved to", featImpFilePath)
        
    if saveGraph:
        ensureDtFolder()
            
        for index, tree in enumerate(trainedTrees):
            tree.save(os.path.join(dtFolderPath, f"dt_{index}.png"), "png")

## KMeans Analysis

In [None]:
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [None]:
# Folder that receives every KMeans result; exist_ok makes the cell safe to re-run
kmeansPath = os.path.join(resultsPath, "kmeans")
os.makedirs(kmeansPath, exist_ok=True)
print(kmeansPath)

### Transform the data frame into an unstacked data frame

In [None]:
# Columns from which the unstacked data frames are built
kmeansColumns = ["OCCUR_MONTH", "OCCUR_HOUR", "postcode", "WEEKDAY"]
# kmeansColumns = allCatColumns

In [None]:
# One grouped-and-unstacked data frame per KMeans column, keyed by column name
unstackDfs = {
    column: ut.metrics.groupByAndUnstack(column, df[kmeansColumns])
    for column in kmeansColumns
}

In [None]:
# Column whose unstacked data frame is used for the rest of the KMeans analysis
udfColumn = "postcode"
udf = unstackDfs[udfColumn]

# Save to file
udfFilePath = os.path.join(kmeansPath, "udf.csv")
udf.to_csv(udfFilePath, index=False)
print("Saved to", udfFilePath)

print(udf.shape)
udf.head(3)

### PCA

In [None]:
@widgets.interact_manual
def plotPca(saveFigure=False):
    ''' Display the PCA variance ratio plot of udf and the result of
    getPcByVariancePercentInterval for the interval 0.90 - 0.97.

    saveFigure: when True, write the plot to
                <kmeansPath>/kmeans_pca_variance_ratios.html
    '''
    # Both helpers receive min(rows, columns) of udf as the number of components
    componentCount = min(udf.shape)
    variancePlot = ut.metrics.plotPcaVarianceRatio(componentCount, udf)
    selectedPcs = ut.metrics.getPcByVariancePercentInterval(0.90, 0.97, componentCount, udf)
    
    if saveFigure:
        pcaFilePath = os.path.join(kmeansPath, "kmeans_pca_variance_ratios.html")
        ut.metrics.hv.save(variancePlot, pcaFilePath, toolbar=True)
        print("Saved to", pcaFilePath)
        
    display(variancePlot)
    display(selectedPcs)

In [None]:
# PCA
# Keep at most 20 principal components. PCA cannot extract more components
# than min(rows, columns), so cap the request for small unstacked frames
nComps = min(20, *udf.shape)
pca = PCA(n_components=nComps)
pcDf = pd.DataFrame(pca.fit_transform(udf), index=udf.index,
                    columns=[f"PC_{i+1}" for i in range(len(pca.components_))])

# Cumulative share of the variance explained by the first 1..nComps components
print([f"{cs*100:.2f}%" for cs in np.cumsum(pca.explained_variance_ratio_)])
print(pcDf.shape)
pcDf.head(3)

In [None]:
# For every principal component, record the udf column with the largest
# absolute loading, together with the sign of that loading when it is non-zero
importantColumns = []
for componentIndex in range(nComps):
    loadings = pca.components_[componentIndex]
    strongestIndex = np.abs(loadings).argmax()
    strongestLoading = loadings[strongestIndex]
    columnName = udf.columns[strongestIndex]
    
    sign = "+" if (strongestLoading > 0) else ("-" if (strongestLoading < 0) else None)
    importantColumns.append((columnName,) if (sign is None) else (columnName, sign))
    
importantColumns

### Run KMeans

In [None]:
from sklearn.cluster import KMeans

#### Elbow Method

In [None]:
@widgets.interact_manual
def kmeansElbowPlot(numClusters=(1, 20, 1), nInit=(1,20,1), saveFigure=False):
    ''' Return the elbow method plot of KMeans on the principal component data frame.

    numClusters: cluster count handed to ut.cluster.elbowMethod
    nInit:       forwarded to KMeans as n_init
    saveFigure:  when True, write the plot to <kmeansPath>/kmeans_elbowMethod.html
    '''
    plot = ut.cluster.elbowMethod(pcDf, numClusters, KMeans, n_init=nInit)
    if not saveFigure:
        return plot
    
    # Save the elbow method plot
    plotFilePath = os.path.join(kmeansPath, "kmeans_elbowMethod.html")
    ut.cluster.hv.save(plot, plotFilePath, toolbar=True)
    print(f"Saved to {plotFilePath}")
    return plot

#### Silhouette Scores

In [None]:
@widgets.interact_manual
def kmeansSilScores(numClusters=(2, 10, 1), nInit=(1,20,1),
                    width=(5, 30, 1), height=(5, 30, 1), 
                    saveFigure=False):
    ''' Plot the silhouette scores of KMeans on pcDf for 2 up to numClusters clusters.

    numClusters:   largest number of clusters to try; one subplot per count
    nInit:         forwarded to KMeans as n_init
    width, height: size of the whole matplotlib figure
    saveFigure:    when True, write the figure to
                   <kmeansPath>/kmeans_silhouette_scores.png
    '''
    clusterCounts = list(range(2, numClusters + 1))
    fig, axes = plt.subplots(len(clusterCounts), 1, figsize=(width, height))
    # With a single row plt.subplots hands back the Axes itself, not an array
    singlePlot = (len(clusterCounts) == 1)
    
    for row, clusterCount in enumerate(clusterCounts):
        model = KMeans(n_clusters=clusterCount, n_init=nInit)
        labels = model.fit_predict(pcDf)
        
        axis = axes if singlePlot else axes[row]
        axis.set_title(f"Number of clusters {clusterCount}")
        ut.cluster.plotSilhouetteScores(clusterCount, pcDf, labels, ax=axis, xRange=[-0.3, 1])

    # Save the silhouette scores figure
    if saveFigure:
        figFilePath = os.path.join(kmeansPath, "kmeans_silhouette_scores.png")
        fig.savefig(figFilePath)
        print(f"Saved to {figFilePath}")

#### Cluster PCA Data Frame

In [None]:
# Select the "plotly" extension of ut.cluster.hv before the cluster plots below
# NOTE(review): hv is presumably holoviews - confirm in models.utils
ut.cluster.hv.extension("plotly")

In [None]:
@widgets.interact_manual
def kmeansPcaClusters(dataFrame=widgets.fixed(pcDf), 
                      c1=pcDf.columns, c2=pcDf.columns, c3=pcDf.columns,
                      numClusters=(1, len(colors), 1), nInit=(1,20,1), saveFigure=False):
    ''' Plot the KMeans clusters of up to three selected columns of dataFrame.

    dataFrame:   data frame to cluster and plot
    c1, c2, c3:  columns to plot; only as many as dataFrame has columns are used
    numClusters: number of clusters, also the number of colors listed in the title
    nInit:       forwarded to plotKmeansClusters as n_init
    saveFigure:  when True, write the plot to
                 <kmeansPath>/kmeans_cluster_<numClusters>.html

    Raises ValueError when the same column is selected more than once.
    Returns the plot.
    '''
    # Only the columns that are actually plotted have to be distinct
    cols = [c1, c2, c3][:dataFrame.shape[1]]
    if (len(set(cols)) != len(cols)):
        raise ValueError("All columns must be uniquely different")
    
    # Include the cluster label and its associated color to the title
    # because there is no way to show legends in plotly via holoviews
    colorLabels = [f"{i}: {colors[i]}" for i in range(max(numClusters, 1))]
    title = f"{udfColumn} KMeans - [" + ", ".join(colorLabels) + "]"
        
    # Plot the frame that was passed in rather than the global pcDf
    kmeansClusterPlot = ut.cluster.plotKmeansClusters(dataFrame[cols], numClusters, colors, title, n_init=nInit)
    
    if (saveFigure):
        kmeansClusterFilePath = os.path.join(kmeansPath, f"kmeans_cluster_{numClusters}.html")
        ut.cluster.hv.save(kmeansClusterPlot, kmeansClusterFilePath)
        print("Saved to", kmeansClusterFilePath)

    return kmeansClusterPlot

In [None]:
def saveKmeansPcaClusters(numCombs, dataFrame, numClusters):
    ''' Save a KMeans cluster plot for every combination of numCombs columns.

    numCombs:    number of columns per plot, between 1 and 3
    dataFrame:   data frame whose columns are combined
    numClusters: number of clusters of every plot

    Every plot is written to
    <kmeansPath>/clusters/<numClusters>_clusters_<columns joined by "-">.html

    Raises ValueError when numCombs is outside 1..3.
    '''
    if (numCombs > 3 or numCombs < 1):
        raise ValueError("numCombs must be between 1 and 3 inclusively")
        
    from itertools import combinations
    
    # Title and output folder are the same for every combination: build them once.
    # Include the cluster label and its associated color to the title
    # because there is no way to show legends in plotly via holoviews
    title = f"{udfColumn} KMeans - [0: {colors[0]}"
    for i in range(1, numClusters):
        title += f", {i}: {colors[i]}"
    title += "]"

    kmeansClusterFolderPath = os.path.join(kmeansPath, "clusters")
    os.makedirs(kmeansClusterFolderPath, exist_ok=True)
    
    # Generate different combinations
    for comb in combinations(dataFrame.columns, numCombs):
        kmeansClusterPlot = ut.cluster.plotKmeansClusters(dataFrame[list(comb)], numClusters, 
                                                          colors, title, n_init=20)
            
        kmeansClusterFilePath = os.path.join(kmeansClusterFolderPath, f"{numClusters}_clusters_{'-'.join(comb)}.html")
        ut.cluster.hv.save(kmeansClusterPlot, kmeansClusterFilePath)
        print("Saved to", kmeansClusterFilePath)
    gc.collect()

In [None]:
@widgets.interact_manual
def saveClusters(dataFrame=widgets.fixed(pcDf), numCombs=(1, 3, 1), numClusters=(1, len(colors), 1)):
    ''' Widget front end that forwards its three inputs to saveKmeansPcaClusters '''
    saveKmeansPcaClusters(numCombs=numCombs, dataFrame=dataFrame, numClusters=numClusters)

#### Scatter Matrix

In [None]:
@widgets.interact_manual
def scatterMatrix(dataFrame=widgets.fixed(pcDf), width=(5, 30, 1), height=(5, 30, 1), saveFigure=False):
    ''' Plot the scatter matrix of dataFrame with KDE plots on the diagonal.

    width, height: size of the matplotlib figure
    saveFigure:    when True, write the current figure to
                   <kmeansPath>/scatter_matrix.png
    '''
    figureSize = (width, height)
    pd.plotting.scatter_matrix(dataFrame, alpha=1, figsize=figureSize, diagonal="kde")
    plt.suptitle("Scatter Matrix")
    
    if not saveFigure:
        return

    filePath = os.path.join(kmeansPath, "scatter_matrix.png")
    plt.savefig(filePath)
    print("Saved to", filePath)

#### KMeans Outputs

In [None]:
# Run KMeans
nClusters = 2
# Fixed seed: every cell below keys off these cluster ids, so they have to stay
# the same when the notebook is re-run on a fresh kernel
kmeans = KMeans(n_clusters=nClusters, random_state=0)
preds = kmeans.fit_predict(pcDf)

# Construct the prediction data frame: a copy of pcDf plus the cluster of every row
clustersDf = pcDf.assign(cluster=preds)

print(clustersDf.shape)
display(clustersDf.head(3))

# Get mapping from cluster to the index values of the rows assigned to it
clustersToColValues = {
    cluster: clustersDf[clustersDf["cluster"] == cluster].index.values
    for cluster in range(nClusters)
}
clustersToColValues

##### Plot precinct choropleths

In [None]:
@widgets.interact_manual
def kmeansPrecinctChoropleth(feature=["PRECINCT", "postcode", "neighbourhood"], saveFigure=False):
    ''' Draw the incident counts of every KMeans cluster as choropleths on one Leaflet map.

    feature:    column whose values are counted; also the key into geoJsons
    saveFigure: when True, additionally write one Plotly choropleth per cluster to
                <kmeansPath>/choropleths/choropleth_<feature>_<cluster>.html

    Returns the Leaflet map.
    '''
    m = None
    clusterPrecinctsDfs = []
    for i in range(nClusters):
        colValues = clustersToColValues[i]
        tempDf = df[df[udfColumn].isin(colValues)]
        clusterPrecincts = dict(tempDf[feature].value_counts())
        clusterPrecinctsDf = pd.DataFrame(clusterPrecincts.items(), columns=[feature, "count"])

        # Doubled braces keep "{0}" and "{1}" as literal placeholders; with single
        # braces the f-string renders the constants 0 and 1 into every label.
        # NOTE(review): presumably plotLeaflet fills them in via str.format - confirm
        labelFormat = f"{feature}={{0}} - Count={{1}}"

        # Display precinct choropleths on the same map layer
        m = ut.geo.plotLeaflet(geoJsons[feature], clusterPrecincts, m, includeEmpty=False, 
                               heatmapArgs={"name": f"Cluster {i}", "colormap": clusterColormaps[i]},
                               mapArgs={"center": newYorkCoord, "zoom": 10, 
                                        "basemap": ut.geo.lf.basemaps.CartoDB.DarkMatter},
                               labelFormat=labelFormat)
        
        # Store these data frames so that they may be used during the "save" operation
        clusterPrecinctsDfs.append(clusterPrecinctsDf)

    # Save each figure
    if (saveFigure):
        choroplethsFolderPath = os.path.join(kmeansPath, "choropleths")
        os.makedirs(choroplethsFolderPath, exist_ok=True)
            
        for i in range(nClusters):
            # Save each choropleth separately
            colorscale = clusterColorscales[i]
            fig = ut.geo.plotPlotly(geoJsons[feature], clusterPrecinctsDfs[i], feature, "count", title=f"Cluster {i}", 
                                    center={"lat": newYorkCoord[0], "lon": newYorkCoord[1]}, 
                                    zoom=9, colorscale=colorscale, labels={'count':'number of incidents'})
            
            choroplethFilePath = os.path.join(choroplethsFolderPath, f"choropleth_{feature}_{i}.html")
            ut.geo.plotly.offline.plot(fig, filename=choroplethFilePath, auto_open=False)
            print("Saved to", choroplethFilePath)    

    return m

##### Cluster Cramers V & Theil Uncertainty

In [None]:
@widgets.interact_manual
def kmeansCorrelation(dataFrame=widgets.fixed(df[allCatColumns]),
                      correlationMethod=["Cramers V", "Theil Uncertainty"],
                      width=(5, 100, 1), height=(5, 100, 1), saveFigure=False):
    ''' Draw one annotated correlation heatmap per KMeans cluster, stacked vertically.

    dataFrame:         data frame with the columns to correlate, including udfColumn
    correlationMethod: "Cramers V" selects the "cramersV" function, anything
                       else selects "theilU"
    width, height:     size of the whole matplotlib figure
    saveFigure:        when True, write the figure to
                       <kmeansPath>/kmeans_<correlationMethod>.png
    '''
    correlationFunc = "cramersV" if (correlationMethod == "Cramers V") else "theilU"
    # squeeze=False always returns a 2D array of axes; without it a single
    # cluster yields a bare Axes object and indexing it fails
    fig, axes = plt.subplots(nClusters, 1, figsize=(width, height), squeeze=False)
    
    # Construct a matrix of correlation values for each cluster
    for i, axis in enumerate(axes[:, 0]):
        # Rows whose udfColumn value was assigned to cluster i
        colValues = clustersToColValues[i]
        tempDf = dataFrame[dataFrame[udfColumn].isin(colValues)]
        
        correlationDf = ut.metrics.getCorrelationDf(tempDf, correlationFunc)
        sns.heatmap(correlationDf, annot=True, ax=axis)
        axis.set_title(f"Cluster {i}")
    
    fig.tight_layout()
    
    if (saveFigure):
        correlationFilePath = os.path.join(kmeansPath, f"kmeans_{correlationMethod}.png")
        fig.savefig(correlationFilePath)
        print("Saved to", correlationFilePath)

##### Plot clusters heatmap frequency

In [None]:
@widgets.interact_manual
def kmeansHeatmap(dataFrame=widgets.fixed(df[allCatColumns]), 
                  clustersToColValues=widgets.fixed(clustersToColValues), 
                  c1=allCatColumns, c2=allCatColumns, width=(200, 1200, 50), height=(200, 1200, 50), 
                  numCols=(1, 3, 1), saveFigure=False):
    ''' Build a layout with one c1-by-c2 frequency heatmap per KMeans cluster.

    dataFrame:           data frame holding c1, c2 and udfColumn
    clustersToColValues: mapping from cluster id to the udfColumn values of that cluster
    c1, c2:              the two columns passed to freqHeatMap
    width, height:       size passed to every single heatmap
    numCols:             number of columns of the resulting layout
    saveFigure:          when True, write the layout to
                         <kmeansPath>/heatmap_frequency/Frequency_<c1>_by_<c2>.html

    Returns the layout.
    '''
    ut.heatmap.hv.extension("bokeh")
    
    # Value lists come from the global `data` frame and are shared by every cluster
    positional = [c1, c2, data[c1].unique(), data[c2].unique()]
    options = {"width": width, "height": height, "cmap": "magma", "tools": ["hover"]}
    
    def clusterHeatmap(clusterId):
        ''' Frequency heatmap of the rows whose udfColumn value belongs to the cluster '''
        members = clustersToColValues[clusterId]
        clusterRows = dataFrame[dataFrame[udfColumn].isin(members)]
        return ut.heatmap.freqHeatMap(clusterRows, title=f"{udfColumn} - Cluster {clusterId}", 
                                      *positional, **options)
    
    # Cluster 0 starts the layout, the remaining clusters are appended to it
    layout = clusterHeatmap(0)
    for clusterId in range(1, nClusters):
        layout += clusterHeatmap(clusterId)
    layout = layout.cols(numCols)
    
    if saveFigure:
        heatmapFolderPath = os.path.join(kmeansPath, "heatmap_frequency")
        if not os.path.exists(heatmapFolderPath):
            os.makedirs(heatmapFolderPath)
            
        heatmapFilePath = os.path.join(heatmapFolderPath, f"Frequency_{c1}_by_{c2}.html")
        ut.heatmap.hv.save(layout, heatmapFilePath)
        print("Saved to", heatmapFilePath)        

    return layout

In [None]:
@widgets.interact_manual
def saveHeatmapsWidget():
    ''' Hand kmeansHeatmap and a fixed set of plot settings to saveHeatmaps with saving enabled.

    NOTE: a function of the same name is defined earlier in this notebook for
    KModes; running this cell rebinds the name to this KMeans version.
    '''
    secondColumns = ["OCCUR_HOUR", "OCCUR_MONTH", "OCCUR_DATE", "WEEKDAY"]
    # Keyword arguments, listed in the order in which they are passed on
    heatmapOptions = {
        "columns": allCatColumns,
        "dataFrame": df[allCatColumns],
        "width": 650,
        "height": 450,
        "saveFigure": True,
        "clustersToColValues": clustersToColValues,
        "numCols": 2,
        "fixed2ndColumns": secondColumns,
    }
    saveHeatmaps(kmeansHeatmap, data, **heatmapOptions)

##### Column Frequencies

In [None]:
@widgets.interact_manual
def kmeansColumnFreq(dataFrame=widgets.fixed(df[allCatColumns]),
                     column=allCatColumns, width=(5, 25, 1), height=(5, 25, 1), saveFigure=False):
    ''' Call barChart for one column, using the KMeans cluster of every row as hue.

    dataFrame:     data frame holding the column and udfColumn (left unmodified)
    column:        column handed to barChart
    width, height: figure size handed to barChart
    saveFigure:    forwarded to barChart together with kmeansPath as export path
    '''
    # Explicit copy: dataFrame is usually a column slice of df, and writing into
    # such a slice triggers pandas' SettingWithCopyWarning
    tempDf = dataFrame.copy()

    # Add cluster label column; rows matching no cluster keep the label 0
    tempDf["cluster"] = 0
    for cluster, values in clustersToColValues.items():
        tempDf.loc[tempDf[udfColumn].isin(values), "cluster"] = cluster
        
    barChart(tempDf, hue=tempDf['cluster'], exportPath=kmeansPath, column=column,
             width=width, height=height, sortBy="label", saveFigure=saveFigure)

In [None]:
@widgets.interact_manual
def saveKmeansColumnFreqs():
    ''' Run kmeansColumnFreq with saving enabled for every column in allCatColumns '''
    plotWidth, plotHeight = 25, 15
    for columnName in allCatColumns:
        kmeansColumnFreq(df[allCatColumns], columnName, plotWidth, plotHeight, True)

In [None]:
def nMostCount(series, n):
    ''' Return a series describing the n most frequent values of the given series.

    series: values to count
    n:      number of values to keep

    Returns a new series of strings "<value> - Count: <count>", ordered from the
    most to the least frequent value.
    '''
    topCounts = series.value_counts(ascending=False).head(n)
    # Series.iteritems() was removed in pandas 2.0; items() works on every version
    labels = [f"{value} - Count: {count}" for value, count in topCounts.items()]
    
    return pd.Series(labels)

In [None]:
@widgets.interact_manual
def mostCountDf(n=(1, 10, 1)):
    ''' For every KMeans cluster, display the n most frequent values of each column of df '''
    def topValues(column):
        ''' The n most frequent values of one column '''
        return nMostCount(column, n)

    for clusterId in range(nClusters):
        display(f"Cluster {clusterId}")
        # Rows whose udfColumn value was assigned to this cluster
        clusterRows = df[df[udfColumn].isin(clustersToColValues[clusterId])]
        display(clusterRows.apply(topValues))

## KMeans with one-hot encodings

In [None]:
from sklearn.cluster import KMeans
import plotly.express as px
from sklearn.decomposition import PCA

In [None]:
# Folder that receives every one-hot KMeans result; exist_ok makes the cell safe to re-run
kmeansOhePath = os.path.join(resultsPath, "kmeans_ohe")
os.makedirs(kmeansOhePath, exist_ok=True)
print(kmeansOhePath)

In [None]:
# Columns of df used for the one-hot encoded KMeans analysis
kmeansOheColumns = ["OCCUR_MONTH", "OCCUR_HOUR", "WEEKDAY", "Latitude", "Longitude"]

### PCA

In [None]:
@widgets.interact_manual
def kmeansOhePlotPca(dataFrame=widgets.fixed(df[kmeansOheColumns]), saveFigure=False):
    ''' Display the PCA variance ratio plot of the one-hot encoded data frame and
    the result of getPcByVariancePercentInterval for the interval 0.90 - 0.97.

    dataFrame:  data frame that is one-hot encoded with pd.get_dummies
    saveFigure: when True, write the plot to
                <kmeansOhePath>/kmeans_ohe_pca_variance_ratios.html
    '''
    encodedDf = pd.get_dummies(dataFrame)
    # Show the columns before and after the encoding
    print(dataFrame.columns)
    print(encodedDf.columns)

    # Both helpers receive min(rows, columns) of the encoded frame as the number of components
    componentCount = min(encodedDf.shape)
    variancePlot = ut.metrics.plotPcaVarianceRatio(componentCount, encodedDf)
    selectedPcs = ut.metrics.getPcByVariancePercentInterval(0.90, 0.97, componentCount, encodedDf)
    
    if saveFigure:
        pcaFilePath = os.path.join(kmeansOhePath, "kmeans_ohe_pca_variance_ratios.html")
        ut.metrics.hv.save(variancePlot, pcaFilePath, toolbar=True)
        print("Saved to", pcaFilePath)
        
    display(variancePlot)
    display(selectedPcs)

### Elbow Method

In [None]:
def foo():
    ''' Scratch check of a distance that treats the hour of the day as circular.

    Prints the pairwise distance between the first three rows of df, computed
    on Latitude, Longitude and OCCUR_HOUR, and plots those rows on a map.
    '''
    import math
    
    def bar(hour1, hour2):
        ''' Shortest distance between two hours on a 24 hour clock (23 and 0 are 1 apart).

        Computed arithmetically: searching an endless cycle of the hours 0..23
        never terminates for an hour outside that range.
        '''
        diff = abs(hour1 - hour2) % 24
        return min(diff, 24 - diff)
    
    def euclidean(x, y):
        ''' Euclidean distance in which the third coordinate (the hour) uses bar '''
        total = 0
        for index, (i, j) in enumerate(zip(x, y)):
            delta = bar(i, j) if (index == 2) else (i - j)
            total += delta ** 2
        return math.sqrt(total)
    
    tempDf = df[["Latitude", "Longitude", "OCCUR_HOUR"]][:3].astype({"OCCUR_HOUR": int})
    display(tempDf)

    # Distance of every pair of rows, next to the plain hour distance
    for i in range(tempDf.shape[0]):
        for j in range(i+1, tempDf.shape[0]):
            x, y = tempDf.iloc[i], tempDf.iloc[j]
            d = euclidean(x, y)
            print(f"{i} <---> {j} = {d} - {bar(x.OCCUR_HOUR, y.OCCUR_HOUR)}")
            
    fig = px.scatter_mapbox(tempDf, lat="Latitude", lon="Longitude", hover_name=tempDf.index, hover_data=["OCCUR_HOUR"])
    fig.update_layout(
            title_text = 'Shootings',
            geo_scope='usa',
            margin={"r":0,"l":0,"b":0},
            mapbox_style = "carto-darkmatter")
    fig.show()

foo()

In [None]:
@widgets.interact_manual
def kmeansOheElbowPlot(dataFrame=widgets.fixed(df[kmeansOheColumns]), 
                       numClusters=(1, 20, 1), nInit=(1,20,1), saveFigure=False):
    ''' Return the elbow method plot of KMeans on the one-hot encoded data frame.

    dataFrame:   data frame that is one-hot encoded with pd.get_dummies
    numClusters: cluster count handed to ut.cluster.elbowMethod
    nInit:       forwarded to KMeans as n_init
    saveFigure:  when True, write the plot to
                 <kmeansOhePath>/kmeans_ohe_elbowMethod.html
    '''
    dummiesDf = pd.get_dummies(dataFrame)
    # Honour the slider value; a hard-coded 15 made the numClusters widget a no-op
    elbowPlot = ut.cluster.elbowMethod(dummiesDf, numClusters, KMeans, printInfo=False, n_init=nInit)
    
    # Save the elbow method plot
    if (saveFigure):
        elbowPlotFilePath = os.path.join(kmeansOhePath, "kmeans_ohe_elbowMethod.html")
        ut.cluster.hv.save(elbowPlot, elbowPlotFilePath)
        print(f"Saved to {elbowPlotFilePath}")
    
    return elbowPlot

### Silhouette Score

In [None]:
@widgets.interact_manual
def kmeansOheSilScores(dataFrame=widgets.fixed(df[kmeansOheColumns]), numClusters=(2,20,1), nInit=(1,20,1), 
                       width=(5, 30, 1), height=(5, 30, 1), saveFigure=False):
    ''' Plot the silhouette scores of KMeans on the one-hot encoded data frame
    for 2 up to numClusters clusters.

    dataFrame:     data frame that is one-hot encoded with pd.get_dummies
    numClusters:   largest number of clusters to try; one subplot per count
    nInit:         forwarded to KMeans as n_init
    width, height: size of the whole matplotlib figure
    saveFigure:    when True, write the figure to
                   <kmeansOhePath>/kmeans_ohe_silhouette_scores.png
    '''
    encodedDf = pd.get_dummies(dataFrame)
    clusterCounts = list(range(2, numClusters + 1))
    fig, axes = plt.subplots(len(clusterCounts), 1, figsize=(width, height))
    # With a single row plt.subplots hands back the Axes itself, not an array
    singlePlot = (len(clusterCounts) == 1)
    
    for row, clusterCount in enumerate(clusterCounts):
        model = KMeans(n_clusters=clusterCount, n_init=nInit)
        labels = model.fit_predict(encodedDf)
        
        axis = axes if singlePlot else axes[row]
        axis.set_title(f"Number of clusters {clusterCount}")
        ut.cluster.plotSilhouetteScores(clusterCount, encodedDf, labels, ax=axis, xRange=[-0.2, 1])
    
    fig.tight_layout()
    
    # Save sil scores as picture
    if saveFigure:
        figFilePath = os.path.join(kmeansOhePath, "kmeans_ohe_silhouette_scores.png")
        fig.savefig(figFilePath)
        print(f"Saved to {figFilePath}")

### Plot Clusters

In [None]:
# Module-level placeholder; kmeansOhePlotClusters declares this name as global
kmeansOheClusters = None

In [None]:
@widgets.interact_manual
def kmeansOhePlotClusters(dataFrame=widgets.fixed(df[kmeansOheColumns]), 
                          numClusters=(1,15,1), nInit=(1,20,1), saveFigure=False):
    ''' Cluster the one-hot encoded data frame with KMeans and plot the rows on a map.

    dataFrame:   data frame with "Latitude" and "Longitude" columns; it is one-hot
                 encoded with pd.get_dummies before clustering
    numClusters: number of KMeans clusters
    nInit:       forwarded to KMeans as n_init
    saveFigure:  when True, write the map to <kmeansOhePath>/kmeans_ohe_clusters.html
                 and the labels to <kmeansOhePath>/kmeans_ohe_clusters.csv

    Side effect: stores the "Cluster <id>" label of every row in the global
    kmeansOheClusters.
    '''
    global kmeansOheClusters

    dummiesDf = pd.get_dummies(dataFrame)
    print(dummiesDf.shape)

    # Run KMeans
    kmeans = KMeans(n_clusters=numClusters, n_init=nInit)
    preds = kmeans.fit_predict(dummiesDf)
    dataFrame = dataFrame.assign(cluster=[f"Cluster {p}" for p in preds])
    display(dataFrame["cluster"].value_counts())
    
    # The original statement named the global without assigning anything to it.
    # NOTE(review): presumably the labels were meant to be kept for later cells - confirm
    kmeansOheClusters = dataFrame["cluster"]
    
    # Plot on map
    fig = px.scatter_mapbox(dataFrame, lat="Latitude", lon="Longitude", 
                            zoom=10, color="cluster", hover_data=kmeansOheColumns)
    fig.update_layout(
            title_text = 'Shootings',
            geo_scope='usa',
            margin={"r":0,"l":0,"b":0},
            mapbox_style = "carto-darkmatter")

    fig.show()
    
    if (saveFigure):
        plotFilePath = os.path.join(kmeansOhePath, "kmeans_ohe_clusters.html")
        ut.geo.plotly.offline.plot(fig, filename=plotFilePath, auto_open=False)
        print(f"Saved to {plotFilePath}")
        
        clusterFilePath = os.path.join(kmeansOhePath, "kmeans_ohe_clusters.csv")
        dataFrame.to_csv(clusterFilePath, columns=["cluster"])
        print(f"Saved to {clusterFilePath}")

### Heatmap Frequency

## KPrototype

In [None]:
from kmodes.kprototypes import KPrototypes

In [None]:
@widgets.interact_manual
def kPrototypeElbow(prototypeDf=widgets.fixed(df[["OCCUR_MONTH", "OCCUR_HOUR", "PRECINCT", "WEEKDAY", "Latitude", "Longitude"]]),
                    numClusters=(1, 20, 1), 
                    nInit=(1, 20, 1), 
                    saveFigure=False):
    ''' Overlay the KPrototypes elbow method plots of the "Cao", "Huang" and
    "random" initialisations.

    prototypeDf: data frame to cluster; its object, bool and category columns
                 are passed to KPrototypes as the categorical columns
    numClusters: cluster count handed to ut.cluster.elbowMethod
    nInit:       forwarded to KPrototypes as n_init
    saveFigure:  when True, write the overlay to
                 <resultsPath>/kprototype/kprototype_elbowMethod.html

    Returns the overlay.
    '''
    # Positions of the categorical columns inside prototypeDf
    catColumns = prototypeDf.select_dtypes(include=["object", "bool", "category"]).columns
    catFeatureIndices = [prototypeDf.columns.get_loc(column) for column in catColumns]

    # One elbow curve per initialisation method, overlaid in this order
    elbowPlots = None
    for initMethod in ("Cao", "Huang", "random"):
        elbowPlot = ut.cluster.elbowMethod(prototypeDf, numClusters, KPrototypes, initMethod, 
                                           categorical=catFeatureIndices, init=initMethod, n_init=nInit)
        elbowPlots = elbowPlot if (elbowPlots is None) else (elbowPlots * elbowPlot)
    
    elbowPlots = elbowPlots.opts(width=800, height=400, xlabel="Number of cluster", 
                                 ylabel="WCSS", title="Elbow Method")
    
    # Save the elbow method plot
    if (saveFigure):
        # The single-argument os.path.join used before saved into the working
        # directory; keep the file under resultsPath like the other analyses
        kprototypePath = os.path.join(resultsPath, "kprototype")
        os.makedirs(kprototypePath, exist_ok=True)

        elbowPlotFilePath = os.path.join(kprototypePath, "kprototype_elbowMethod.html")
        ut.cluster.hv.save(elbowPlots, elbowPlotFilePath, toolbar=True)
        print(f"Saved to {elbowPlotFilePath}")
    
    return elbowPlots

In [None]:
@widgets.interact_manual
def kprototype(prototypeDf=widgets.fixed(df[["OCCUR_MONTH", "OCCUR_HOUR", "PRECINCT", "WEEKDAY", "Latitude", "Longitude"]]),
               numClusters=(1,10,1), 
               init=["Cao", "Huang", "random"], 
               nInit=(1,20,1),
               verbose=(0,3,1)):
    ''' Run KPrototypes and display the centroid markers and the centroid data frame.

    prototypeDf: data frame to cluster; its object, bool and category columns
                 are passed to KPrototypes as the categorical columns
    numClusters: number of clusters
    init:        initialisation method of KPrototypes
    nInit:       forwarded to KPrototypes as n_init
    verbose:     forwarded to KPrototypes as verbose
    '''
    kproto = KPrototypes(n_clusters=numClusters, init=init, n_init=nInit, verbose=verbose)

    # Positions of the categorical columns inside prototypeDf
    catColumns = prototypeDf.select_dtypes(include=["object", "bool", "category"]).columns
    catFeatureIndices = [prototypeDf.columns.get_loc(column) for column in catColumns]
    clusters = kproto.fit_predict(prototypeDf, categorical=catFeatureIndices)
    
    # Plot centroid coordinates
    # NOTE(review): assumes cluster_centroids_[0] holds one (lat, lon) pair per cluster - confirm
    m = ut.geo.plotMarker(*kproto.cluster_centroids_[0][0], draggable=False, title="0")
    # Number the remaining markers from 1 so that no two markers share the title "0"
    for i, (lat, lon) in enumerate(kproto.cluster_centroids_[0][1:], start=1):
        ut.geo.plotMarker(lat, lon, m, draggable=False, title=f"{i}")
        
#     for i in range(numClusters):
#         locations = [tuple(r) for _, r in prototypeDf[clusters == i][["Latitude", "Longitude"]].iterrows()]
#         m = ut.geo.plotHeatmap(locations, f"Heatmap - Cluster {i}", m,
#                                gradient={1.0: colors[i]}, min_opacity=1)
    display(m)
    
    # Create centroids df
    clusterDf = ut.cluster.createCentroidDf(kproto.cluster_centroids_[1], prototypeDf.iloc[:, catFeatureIndices].columns, clusters)
    display(clusterDf)