In [None]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old aliases; use whichever name this installation
# actually provides so the cell runs on both old and new versions.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
import numpy as np
import seaborn as sns


# Read the two input tables; the income file is decoded as Latin-1.
gun_data_with_zipcode = pd.read_csv("GunData_withZipcode")
income_data = pd.read_csv("income_database.csv", encoding='latin1')

# Show the incident-table shape before and after de-duplication.  keep=False
# discards every row whose incident_id occurs more than once, so no copy of a
# repeated incident is retained.
print(gun_data_with_zipcode.shape)
gun_data_with_zipcode = gun_data_with_zipcode.drop_duplicates(
    subset=['incident_id'], keep=False
)
print(gun_data_with_zipcode.shape)

# Give the income table the same zip-code column name as the incident table.
income_data = income_data.rename(columns={'Zip_Code': 'Zipcode'})

In [None]:
# Keep a single income row per zip code (the last one listed) so the left join
# below cannot multiply incident rows.
print(income_data.shape)
income_data = income_data.drop_duplicates(subset=['Zipcode'], keep="last")
print(income_data.shape)

# Attach the income columns to every incident, then keep only the rows whose
# 'sum_w' is a finite number (incidents without an income match come out of the
# left join with NaN there).  Note the surviving rows keep their original index
# labels, so the index of `result` has gaps.
result = pd.merge(gun_data_with_zipcode, income_data, on='Zipcode', how='left')
result = result[np.isfinite(result['sum_w'])]
# result.to_csv("Joined_dataset.csv")

# Parse the dates BEFORE sorting so that newest-first is a chronological order
# instead of a lexicographic order over the raw date strings.  `result` itself
# keeps its original 'date' column.
sorted_on_date = (
    result
    .assign(date=pd.to_datetime(result['date']))
    .sort_values(by="date", ascending=False)
)

In [None]:
import numpy as np

def find_distance(a, b):
    """Return the Euclidean distance between coordinate sequences ``a`` and ``b``.

    Positions ``0 .. len(a) - 1`` are read from both arguments, so ``b`` must
    be at least as long as ``a``.  The result comes from ``np.sqrt``.
    """
    squared_total = 0.0
    for position in range(len(a)):
        gap = a[position] - b[position]
        squared_total += gap * gap
    return np.sqrt(squared_total)


def hierarchical(samples, k):
    """Agglomeratively merge ``samples`` until at most ``k`` centers remain.

    Every sample starts as its own cluster.  While more than ``k`` clusters
    exist, the two closest centers (Euclidean distance) are replaced by their
    midpoint.  The midpoint is the plain average of the two centers; it is not
    weighted by how many samples each side has already absorbed.  On ties the
    pair that comes first in row-major order of the original sample positions
    is merged.

    Parameters
    ----------
    samples : sequence of equal-length numeric coordinate sequences
        The points to cluster.  The argument is copied, never modified.
    k : int
        Number of centers to stop at; must be at least 1.

    Returns
    -------
    list of list of float
        The surviving centers, in the order of the sample each one grew from.
        If ``samples`` has ``k`` or fewer entries they are returned (as floats)
        unchanged; an empty input gives an empty list.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, if ``samples`` is not a 2-D collection of
        coordinates, or if any coordinate is NaN or infinite.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if len(samples) == 0:
        return []

    # Work on a private float copy so the caller's samples stay untouched.
    centers = np.array(samples, dtype=float)
    if centers.ndim != 2:
        raise ValueError("samples must be a sequence of equal-length coordinate sequences")
    if not np.isfinite(centers).all():
        raise ValueError("samples must contain only finite coordinates")

    # Pairwise distances live in the strict upper triangle; everything else is
    # +inf so that argmin can only ever return a pair (first, second) with
    # first < second.
    count = len(centers)
    distances = np.full((count, count), np.inf)
    for row in range(count - 1):
        distances[row, row + 1:] = np.linalg.norm(centers[row + 1:] - centers[row], axis=1)

    alive = np.ones(count, dtype=bool)
    remaining = count
    while remaining > k:
        first, second = np.unravel_index(np.argmin(distances), distances.shape)

        # Merge the pair into `first` and retire `second`.
        centers[first] = (centers[first] + centers[second]) / 2
        alive[second] = False
        distances[second, :] = np.inf
        distances[:, second] = np.inf

        # Only distances involving the moved center changed; refresh just those
        # instead of rebuilding the whole matrix on every merge.
        refreshed = np.linalg.norm(centers - centers[first], axis=1)
        refreshed[~alive] = np.inf
        distances[first, first + 1:] = refreshed[first + 1:]
        distances[:first, first] = refreshed[:first]
        remaining -= 1

    return centers[alive].tolist()



def cure_clustering(centroids, input_data, alpha=0.2, num_reps=3):
    """Pick representative points for each cluster and shrink them toward its center.

    Parameters
    ----------
    centroids : sequence
        One entry per cluster.  ``entry[0]`` is the collection of positions (into
        ``input_data``) of the cluster's members and ``entry[1]`` is the
        cluster's center coordinates.
    input_data : sequence indexable by integer position
        ``input_data[i]`` must be the coordinate sequence of point ``i``.
    alpha : float, optional
        Fraction of the way each representative is moved toward the center
        (default ``0.2``).
    num_reps : int, optional
        Maximum number of representatives per cluster (default ``3``).

    Returns
    -------
    list
        One ``[cluster_number, representatives]`` entry per cluster, numbered
        from 1 in input order.  ``representatives`` is a list of
        ``[shrunk_coordinates, point_position]`` pairs holding at most
        ``num_reps`` items -- fewer when the cluster has fewer members, none
        when it is empty.

    Notes
    -----
    Neither ``centroids`` nor ``input_data`` is modified; the shrunk
    coordinates are fresh lists of floats.
    """
    shrunk_reps = []
    for cluster_no, cluster in enumerate(centroids, start=1):
        members, center = cluster[0], cluster[1]

        # The first representative is the member farthest from the center; each
        # further one is the not-yet-chosen member farthest from the previous
        # pick.  Ties go to the member listed first.
        # NOTE(review): canonical CURE takes the point farthest from ALL picks
        # made so far; this keeps the notebook's simpler chain -- confirm.
        chosen = []
        anchor = center
        for _ in range(num_reps):
            candidates = [idx for idx in members if idx not in chosen]
            if not candidates:
                break
            farthest = max(
                candidates,
                key=lambda idx: find_distance(input_data[idx], anchor),
            )
            chosen.append(farthest)
            anchor = input_data[farthest]

        # Move a copy of every representative `alpha` of the way to the center,
        # one coordinate at a time, for however many dimensions the data has.
        representatives = []
        for idx in chosen:
            shrunk = [
                float(value) + alpha * (float(target) - float(value))
                for value, target in zip(input_data[idx], center)
            ]
            representatives.append([shrunk, idx])

        shrunk_reps.append([cluster_no, representatives])

    return shrunk_reps


def generate_samples(input_data, seed=None):
    """Return a random 1% sample of the rows of ``input_data``.

    ``len(input_data) // 100`` distinct row positions are drawn uniformly at
    random (position 0 included) and the matching rows are returned in their
    original order.  Inputs with fewer than 100 rows therefore give an empty
    sample.

    Parameters
    ----------
    input_data : sequence indexable by integer position
        ``input_data[i]`` must return row ``i`` (e.g. a list or a NumPy array).
    seed : optional
        Seed for the random draw; pass a value to make the sample repeatable.
        ``None`` (the default) seeds from the system as usual.

    Returns
    -------
    list
        The sampled rows.
    """
    import random

    total = len(input_data)
    sample_size = total // 100
    # random.sample draws without replacement, so exactly `sample_size`
    # distinct positions come back.
    positions = sorted(random.Random(seed).sample(range(total), sample_size))
    return [input_data[position] for position in positions]


def _group_by_nearest_center(points, centers):
    """Pair every center with the row positions of the points closest to it."""
    centers = np.asarray(centers, dtype=float)
    if len(centers) == 0:
        return []
    # gaps[p, c] is the Euclidean distance from point p to center c.
    gaps = np.linalg.norm(points[:, None, :] - centers[None, :, :], axis=2)
    owner = gaps.argmin(axis=1)
    return [
        [np.flatnonzero(owner == position).tolist(), centers[position].tolist()]
        for position in range(len(centers))
    ]


def cure(clusters, input_data):
    """Run the CURE pipeline and return the shrunk representatives per cluster.

    Steps: draw a random sample of the rows, merge the sample down to
    ``clusters`` centers with ``hierarchical``, attach every row to its nearest
    center, then let ``cure_clustering`` choose and shrink the representatives.

    Parameters
    ----------
    clusters : int
        Number of clusters to build.
    input_data : array-like of shape (n_rows, n_features)
        Numeric data, e.g. a DataFrame, NumPy array or list of rows.  It is
        converted to a float array once so that every later step can address
        rows by position.

    Returns
    -------
    list
        Whatever ``cure_clustering`` returns: one
        ``[cluster_number, representatives]`` entry per cluster.
    """
    points = np.asarray(input_data, dtype=float)
    samples = generate_samples(points)
    centers = hierarchical(samples, clusters)
    # cure_clustering expects [member_positions, center] pairs, while
    # hierarchical hands back bare centers; build the membership here.
    centroids = _group_by_nearest_center(points, centers)
    return cure_clustering(centroids, points)


In [None]:
X = result.loc[:, ['latitude', 'longitude']]
id_n = 10
cure_pts = cure(id_n, X)

# cure() yields one [cluster_number, representatives] entry per cluster, each
# representative being [coordinates, row_position].  The plots need one 0-based
# cluster label per row of X, so label every row with the cluster that owns its
# nearest representative.
row_coords = X.to_numpy(dtype=float)
rep_gap = np.full((len(row_coords), len(cure_pts)), np.inf)
for cluster_pos, cluster_entry in enumerate(cure_pts):
    rep_coords = np.array([rep[0] for rep in cluster_entry[1]], dtype=float)
    if len(rep_coords):
        rep_gap[:, cluster_pos] = np.linalg.norm(
            row_coords[:, None, :] - rep_coords[None, :, :], axis=2
        ).min(axis=1)
# -1 marks "no cluster" when cure() produced nothing to assign rows to.
id_label = rep_gap.argmin(axis=1) if len(cure_pts) else np.full(len(row_coords), -1)

In [None]:
#plot result
# One marker style per cluster label; styles are reused if id_n exceeds the
# number of styles listed here.
ptsymb = np.array(['b.','r.','m.','g.','c.','k.','b*','r*','m*','r^'])
plt.figure(figsize=(12,12))
plt.ylabel('Longitude', fontsize=14)
plt.xlabel('Latitude', fontsize=14)
for i in range(id_n):
    # np.where gives row POSITIONS, so select with .iloc: X carries the gapped
    # index left behind when `result` was filtered, and a label-based lookup
    # would pick the wrong rows or raise KeyError.
    cluster = np.where(id_label == i)[0]
    plt.plot(X.latitude.iloc[cluster].values,
             X.longitude.iloc[cluster].values,
             ptsymb[i % len(ptsymb)])
plt.show()


In [None]:
#plot result
# One marker style per cluster label; styles are reused if id_n exceeds the
# number of styles listed here.
ptsymb = np.array(['b.','r.','m.','g.','c.','k.','b*','r*','m*','r^'])
plt.figure(figsize=(12,12))
plt.ylabel('Longitude', fontsize=14)
plt.xlabel('Latitude', fontsize=14)
for i in range(id_n):
    # np.where gives row POSITIONS, so select with .iloc: X carries the gapped
    # index left behind when `result` was filtered, and a label-based lookup
    # would pick the wrong rows or raise KeyError.
    cluster = np.where(id_label == i)[0]
    plt.plot(X.latitude.iloc[cluster].values,
             X.longitude.iloc[cluster].values,
             ptsymb[i % len(ptsymb)])
plt.show()

In [None]:
# NOTE(review): the three columns are clustered unscaled; if 'Median' spans a
# much wider numeric range than the coordinates it will dominate every
# distance -- confirm whether the features should be standardised first.
X = result.loc[:, ['latitude', 'longitude', 'Median']]
id_n = 10
cure_pts = cure(id_n, X)

# cure() yields one [cluster_number, representatives] entry per cluster, each
# representative being [coordinates, row_position].  The plots need one 0-based
# cluster label per row of X, so label every row with the cluster that owns its
# nearest representative.
row_coords = X.to_numpy(dtype=float)
rep_gap = np.full((len(row_coords), len(cure_pts)), np.inf)
for cluster_pos, cluster_entry in enumerate(cure_pts):
    rep_coords = np.array([rep[0] for rep in cluster_entry[1]], dtype=float)
    if len(rep_coords):
        rep_gap[:, cluster_pos] = np.linalg.norm(
            row_coords[:, None, :] - rep_coords[None, :, :], axis=2
        ).min(axis=1)
# -1 marks "no cluster" when cure() produced nothing to assign rows to.
id_label = rep_gap.argmin(axis=1) if len(cure_pts) else np.full(len(row_coords), -1)

In [None]:
# Plot the clustering on the latitude/longitude plane.  One marker style per
# cluster label; styles are reused if id_n exceeds the number listed here.
ptsymb = np.array(['b.','r.','m.','g.','c.','k.','b*','r*','m*','r^'])
plt.figure(figsize=(12,12))
plt.ylabel('Longitude', fontsize=14)
plt.xlabel('Latitude', fontsize=14)
for i in range(id_n):
    # np.where gives row POSITIONS, so select with .iloc: X carries the gapped
    # index left behind when `result` was filtered, and a label-based lookup
    # would pick the wrong rows or raise KeyError.
    cluster = np.where(id_label == i)[0]
    plt.plot(X.latitude.iloc[cluster].values,
             X.longitude.iloc[cluster].values,
             ptsymb[i % len(ptsymb)])
plt.show()