In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import sklearn.metrics as metrics
import scipy as sp
%matplotlib inline
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.cluster import DBSCAN
import random
from kmodes.kmodes import KModes

In [2]:
# IMPORT DATA
# lines=True makes pandas parse the file as one JSON object per line.

df = pd.read_json(
    'yelp_dataset/yelp_academic_dataset_business.json',
    lines=True,
)
print(df)

                   business_id                     name               address  \
0       6iYb2HFDywm3zjuRg0shjw      Oskar Blues Taproom          921 Pearl St   
1       tCbdrRPZA0oiIYSmHG3J0w  Flying Elephants at PDX   7000 NE Airport Way   
2       bvN78flM8NLprQ1a1y5dRg           The Reclaimory    4720 Hawthorne Ave   
3       oaepsyvc0J17qwi8cfrOWg              Great Clips    2566 Enterprise Rd   
4       PE9uqAjdw0E4-8mjGl3wVA        Crossfit Terminus   1046 Memorial Dr SE   
...                        ...                      ...                   ...   
160580  D2mHoIDXx9N8mS1pGoKV9Q       Eleven Oaks Realty                         
160581  bQX-kwVTyZgcdZGEPzce6Q             Webb's Honey    21777 State Rd 520   
160582  wvFZ06nmPmQ2-IVoPqVYLA    Painting with a Twist  2164 S Chickasaw Trl   
160583  GB75wPibj3IjNauaoCxyGA              Havana Cafe        910 NW 14th St   
160584  ngmLL5Y5OT-bYHKU0kKrYA              Zora Grille   1370 E Altamonte Dr   

                     city s

In [3]:
# CLEAN DATA

# List the n states with the most businesses.
n = 10
print(list(df['state'].value_counts().index[:n]))

# Keep the Massachusetts rows only, then derive two column subsets from them:
#   df_cluster  - id, coordinates and categories
#   df_filtered - descriptive business fields
# DataFrame.filter(items=...) is used on purpose: it ignores names that are absent.
df_filtered = df.loc[df['state'] == 'MA']
df_cluster = df_filtered.filter(
    items=['business_id', 'latitude', 'longitude', 'categories']
)
df_filtered = df_filtered.filter(
    items=[
        'business_id', 'name', 'address', 'city', 'state',
        'stars', 'review_count', 'is_open', 'attributes', 'categories',
    ]
)

print(df_filtered)
print(len(df_filtered))




['MA', 'OR', 'TX', 'FL', 'GA', 'BC', 'OH', 'CO', 'WA', 'CA']
                   business_id                                 name  \
11      hCABMnKtwo4Y9alQDxh2kw         Star Kreations Salon and Spa   
12      HPA_qyMEddpAEtFof02ixg                  Mr G's Pizza & Subs   
18      6fT0lYr_UgWSCZs_w1PBTQ                        Salter School   
26      hcRxdDg7DYryCxCoI8ySQA                    Longwood Galleria   
29      jGennaZUr2MsJyRhijNBfA                      Legal Sea Foods   
...                        ...                                  ...   
160556  87f7kR7nTz8WHnmtLM_S6w                                 O Ya   
160563  yQL8SrSETbbCI1U5esVJQw                  Ciao! Pizza & Pasta   
160568  hX-wc8LoHWSWjBwZ84ANcg          Chestnut Green Dental Group   
160569  _-nynGfhsMVVWWbAZ6YhTw                        The Jury Room   
160579  d6iEdtgLKelWKtb-2UmE2A  Master Finish Mobile Auto Detailing   

                       address     city state  stars  review_count  is_open  \
11     

## Cluster based on location

In [4]:
# Display the clustering input: business id, coordinates and categories.
df_cluster

Unnamed: 0,business_id,latitude,longitude,categories
11,hCABMnKtwo4Y9alQDxh2kw,42.534248,-70.990948,"Wigs, Hair Extensions, Hair Salons, Blow Dry/O..."
12,HPA_qyMEddpAEtFof02ixg,42.541155,-70.973438,"Food, Pizza, Restaurants"
18,6fT0lYr_UgWSCZs_w1PBTQ,42.427889,-71.073475,"Specialty Schools, Massage Schools, Middle Sch..."
26,hcRxdDg7DYryCxCoI8ySQA,42.338544,-71.106842,"Restaurants, Shopping, Shopping Centers"
29,jGennaZUr2MsJyRhijNBfA,42.363442,-71.025781,"Sandwiches, Food, Restaurants, Breakfast & Bru..."
...,...,...,...,...
160556,87f7kR7nTz8WHnmtLM_S6w,42.351408,-71.056867,"Japanese, Sushi Bars, Restaurants"
160563,yQL8SrSETbbCI1U5esVJQw,42.389221,-71.040882,"Restaurants, Pizza, Italian"
160568,hX-wc8LoHWSWjBwZ84ANcg,42.550054,-70.944788,"Health & Medical, Dentists, General Dentistry"
160569,_-nynGfhsMVVWWbAZ6YhTw,42.249118,-71.001071,"Restaurants, American (New), Nightlife"


In [5]:
# Keep only the coordinate columns; these are what the DBSCAN step is fitted on.
df_location = df_cluster.filter(items= ['latitude','longitude'])
df_location

Unnamed: 0,latitude,longitude
11,42.534248,-70.990948
12,42.541155,-70.973438
18,42.427889,-71.073475
26,42.338544,-71.106842
29,42.363442,-71.025781
...,...,...
160556,42.351408,-71.056867
160563,42.389221,-71.040882
160568,42.550054,-70.944788
160569,42.249118,-71.001071


In [6]:
# Density-based clustering of the business coordinates.
# The haversine metric works on [latitude, longitude] pairs expressed in radians,
# so the coordinates are converted first and eps is an angular distance in radians:
# 0.00005 rad * ~6371 km (Earth radius) is roughly 0.32 km.
db = DBSCAN(
    eps=0.00005,
    min_samples=3,
    algorithm='ball_tree',
    metric='haversine',
).fit(np.radians(df_location))

In [7]:
# Cluster id assigned to each row of df_location; DBSCAN labels noise points -1.
labels = db.labels_

In [8]:
# Inspect the label array.
labels

array([ 0,  1,  2, ..., 19,  6, -1], dtype=int64)

In [9]:
# One Spectral colour per distinct label (the noise label -1 counts as a label if present).
unique_labels = set(labels)
colors = list(map(plt.cm.Spectral, np.linspace(0, 1, len(unique_labels))))

In [10]:
# Check whether DBSCAN marked any point as noise (label -1).
-1 in labels

True

In [11]:
# Spot-check: latitude of the row at positional index 2.
df_location['latitude'].iloc[2]

42.427889

In [12]:
# Number of palette entries (one per distinct label).
len(colors)

444

In [13]:
# Shuffle the palette in place, presumably so that clusters with neighbouring ids
# do not end up with near-identical Spectral colours.
# A dedicated, seeded generator keeps the colour assignment reproducible across
# kernel restarts and leaves the global `random` state untouched.
# (The stray `labels[1]` expression was removed: it was evaluated but never shown or used.)
random.Random(0).shuffle(colors)

Plotting for location clusters

In [None]:
# Scatter every business, coloured by its DBSCAN cluster.
# Noise points (label -1) are drawn in opaque black. Previously the black colour
# was assigned and then immediately overwritten by colors[-1], so noise was
# indistinguishable from the cluster that happened to own the last palette entry.
noise_color = (0, 0, 0, 1)
point_colors = [noise_color if label == -1 else colors[label] for label in labels]

# A single vectorised scatter call replaces one plt.plot call per point.
# Latitude is on the x-axis and longitude on the y-axis, matching the limits below.
plt.scatter(
    df_location['latitude'],
    df_location['longitude'],
    c=point_colors,
    edgecolors='k',
)
plt.xlim(42, 42.7)
plt.ylim(-71.6, -70.5)
plt.xlabel('latitude')
plt.ylabel('longitude')
plt.show()

Generate a list of DataFrames, each containing one location cluster

In [14]:
# Group the rows of df_cluster by DBSCAN label, dropping noise points (-1).
# cluster_list[k] is the list of row Series of every business in cluster k.
#
# The list is sized from the largest label instead of len(set(labels)):
# the latter also counts the noise label, which left a trailing None entry that
# later became an empty DataFrame without a 'categories' column.
n_location_clusters = int(max(labels, default=-1)) + 1
cluster_list = [[] for _ in range(n_location_clusters)]
for position, label in enumerate(labels):
    if label != -1:
        cluster_list[label].append(df_cluster.iloc[position])

In [15]:
# Turn every location cluster (a list of row Series) into a DataFrame.
for i in range(0, len(cluster_list)):
    cluster_list[i] = pd.DataFrame(cluster_list[i])

# ---- Trial run: cluster the businesses of location cluster 0 by category ----

# Split each comma-separated category string into a list of trimmed names.
# Businesses without categories (None) get the single token 'None' instead of
# crashing on None.split.
t = np.array(cluster_list[0]['categories']).tolist()
for i in range(0, len(t)):
    text = t[i]
    if text is None:
        text = 'None'
    t[i] = [x.strip() for x in text.split(',')]

# One column per distinct category; sorted so the column order is stable across runs.
unique_categories = sorted(set(name for names in t for name in names))

# Categorical feature table for k-modes: a cell holds the category name when the
# business has that category, otherwise a placeholder that is unique to the cell.
# Unique placeholders mean two businesses never "match" on a category they both
# lack. This replaces the earlier loop that filled one NaN per column per pass
# with a random string (same effect, but quadratic in the number of rows).
df_test = pd.DataFrame(
    [
        [
            name if name in present else '__missing_{}_{}'.format(row, col)
            for col, name in enumerate(unique_categories)
        ]
        for row, present in enumerate(map(set, t))
    ],
    index=range(0, len(t)),
    columns=unique_categories,
)

# k-modes cannot be asked for more clusters than there are rows.
km = KModes(n_clusters=min(50, len(t)), init='Huang', n_init=5, verbose=1)
clusters = km.fit_predict(df_test)

# l[k] collects the rows of location cluster 0 assigned to category cluster k.
# Sized from the largest id so a gap in the ids cannot cause an IndexError.
l = [[] for _ in range(int(max(clusters, default=-1)) + 1)]
for i in range(0, len(clusters)):
    l[clusters[i]].append(cluster_list[0].iloc[i])


## Clustering for each location cluster

The result is a list with one element per location cluster; each element is itself a list of category clusters.

In [None]:
# For every location cluster, cluster its businesses by category with k-modes.
# res[i] is a list of category clusters for cluster_list[i]; each category
# cluster is a list of row Series.
res = []
for i in range(0, len(cluster_list)):
    # An empty frame (or one without a 'categories' column) has nothing to
    # cluster; keep res aligned with cluster_list by storing an empty result.
    if cluster_list[i].empty or 'categories' not in cluster_list[i]:
        res.append([])
        continue

    # Split each comma-separated category string into trimmed names;
    # a missing value (None) becomes the single token 'None'.
    t = np.array(cluster_list[i]['categories']).tolist()
    for j in range(0, len(t)):
        text = t[j]
        if text is None:
            text = 'None'
        t[j] = [x.strip() for x in text.split(',')]

    # One column per distinct category, in a stable (sorted) order.
    unique_categories = sorted(set(name for names in t for name in names))

    # A cell holds the category name when the business has that category,
    # otherwise a placeholder unique to the cell, so that two businesses never
    # match on a category they both lack. This replaces the one-NaN-per-pass
    # fillna loop, which had the same effect but was quadratic in the row count.
    df_test = pd.DataFrame(
        [
            [
                name if name in present else '__missing_{}_{}'.format(row, col)
                for col, name in enumerate(unique_categories)
            ]
            for row, present in enumerate(map(set, t))
        ],
        index=range(0, len(t)),
        columns=unique_categories,
    )

    # Roughly one category cluster per three businesses, capped at 50 and never
    # below 1 (the previous expression produced 0 for fewer than three rows).
    n_category_clusters = max(1, min(len(t) // 3, 50))
    km = KModes(n_clusters=n_category_clusters, init='Huang', n_init=5, verbose=1)
    clusters = km.fit_predict(df_test)

    # Collect the rows per category cluster; sized from the largest id so a gap
    # in the ids cannot cause an IndexError.
    l = [[] for _ in range(int(max(clusters, default=-1)) + 1)]
    for j in range(0, len(clusters)):
        l[clusters[j]].append(cluster_list[i].iloc[j])
    res.append(l)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 2, cost: 19821.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 1, cost: 19822.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 0, cost: 19839.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 0, cost: 19839.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 0, cost: 19831.0
Best run was number 1
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 247.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 245.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...

In [None]:
# Inspect the location cluster stored at index 2.
cluster_list[2]
