In [1]:
import pickle
import time
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def fuzzy_latlon(df, epsilon=0.0001):
    """Assign a group id (``g_id``) to rows whose ``lat_lon`` values nearly coincide.

    A copy of ``df`` is sorted by ``lat`` then ``lon`` and re-indexed from 0.
    Walking the sorted rows, each row that is not yet grouped opens a new
    group and becomes its anchor; the rows immediately following it join the
    group for as long as ``similar`` reports them within ``epsilon`` of the
    anchor.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``lat``, ``lon`` and ``lat_lon`` columns. Each
        ``lat_lon`` value is handed to ``similar``, which indexes it at
        positions 0 and 1.
    epsilon : float
        Tolerance forwarded to ``similar``.

    Returns
    -------
    pandas.DataFrame
        The sorted copy with a ``g_id`` column numbered 0, 1, 2, ... in sorted
        order (an existing ``g_id`` column is overwritten). ``df`` itself is
        not modified.
    """
    # drop=True discards the old index directly, so this also works when the
    # index is named or when the frame already has a column called "index".
    df_new = df.sort_values(by=['lat', 'lon']).reset_index(drop=True)
    df_new['g_id'] = -1

    # Pull the coordinates out once instead of building a row Series per lookup.
    coords = df_new['lat_lon'].tolist()
    n_rows = len(coords)

    g_id = 0
    start = 0
    while start < n_rows:
        anchor = coords[start]
        stop = start + 1
        # The bounds check keeps the scan from running past the last row when
        # the final group extends to the end of the frame.
        while stop < n_rows and similar(coords[stop], anchor, epsilon):
            stop += 1
        # .loc slices by label and is inclusive; after the reset the labels
        # are 0..n_rows-1, so this covers positions start..stop-1.
        df_new.loc[start:stop - 1, 'g_id'] = g_id
        g_id += 1
        # Resume at the first row that did not join this group, so every row
        # (including the last one) ends up with a group id.
        start = stop
    return df_new

def similar(t1, t2, epsilon):
    """Report whether two pairs agree component-wise within ``epsilon``.

    Only positions 0 and 1 of ``t1`` and ``t2`` are compared. Returns True
    when both absolute differences are at most ``epsilon`` (the bound is
    inclusive), otherwise False.
    """
    first_gap = abs(t1[0] - t2[0])
    second_gap = abs(t1[1] - t2[1])
    return bool(first_gap <= epsilon and second_gap <= epsilon)

In [2]:
# Pickle file holding the input DataFrame; the path is relative to the
# notebook's working directory.
infile = "ca_reviews_with_norm_id.p"
# NOTE: unpickling can execute arbitrary code, so only load files that come
# from a trusted source.
df = pd.read_pickle(infile)

In [4]:
# Display the column labels of the loaded frame.
df.columns

Index(['address', 'name', 'range_days', 'review', 'rubmaps', 'yelp', 'lat_lon',
       'HT_bi_0_count', 'HT_bi_1_count', 'HT_bi_2_count', 'HT_tri_0_count',
       'HT_tri_1_count', 'HT_tri_2_count', 'HT_uni_0_count', 'HT_uni_1_count',
       'HT_uni_2_count', 'sex_bi_0_count', 'sex_bi_1_count', 'sex_bi_2_count',
       'sex_tri_0_count', 'sex_tri_1_count', 'sex_tri_2_count',
       'sex_uni_0_count', 'sex_uni_1_count', 'sex_uni_2_count', 'g_id', 'lat',
       'lon', 'norm_id'],
      dtype='object')

In [11]:
# Count columns, listed once and reused for both the aggregation spec and the
# clustering feature matrix so the two cannot drift apart.
count_cols = [
    'HT_bi_0_count', 'HT_bi_1_count', 'HT_bi_2_count',
    'HT_tri_0_count', 'HT_tri_1_count', 'HT_tri_2_count',
    'HT_uni_0_count', 'HT_uni_1_count', 'HT_uni_2_count',
    'sex_bi_0_count', 'sex_bi_1_count', 'sex_bi_2_count',
    'sex_tri_0_count', 'sex_tri_1_count', 'sex_tri_2_count',
    'sex_uni_0_count', 'sex_uni_1_count', 'sex_uni_2_count',
]
# Columns fed to the clusterer, in the same order as before.
feature_cols = ['yelp', 'rubmaps'] + count_cols

# One row per norm_id: yelp/rubmaps are summed, names and addresses are
# collapsed to their distinct values, and every count column takes its max.
agg_spec = {
    "yelp": "sum",
    "rubmaps": "sum",
    "name": lambda x: list(set(x)),
    "address": lambda x: list(set(x)),
}
agg_spec.update({col: "max" for col in count_cols})
grouped = df.groupby(['norm_id']).agg(agg_spec)
print('grouped')

# Number of distinct names / addresses seen per norm_id.
grouped['num_names'] = grouped['name'].apply(len)
grouped['num_addresses'] = grouped['address'].apply(len)

# Standardize the features so no single column dominates the distances.
X = grouped.loc[:, feature_cols]
scaler = StandardScaler()
X = scaler.fit_transform(X)
print('standardized')

# random_state pins the centroid initialisation: without it the partition and
# the meaning of labels 0/1 can change from run to run, while later cells
# filter on `cluster == 1`. n_init is set explicitly because its default
# differs between scikit-learn versions.
cl = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
grouped['cluster'] = cl.labels_

# Keep only the cluster label (indexed by norm_id); every other column of
# `grouped` is dropped here, exactly as before.
grouped = grouped[['cluster']]

grouped
standardized


In [15]:
# Attach each row's cluster label via norm_id (inner join), then remove the
# g_id and lat_lon columns from the result.
df_with_clusters = (
    df.merge(grouped, how='inner', on='norm_id')
      .drop(columns=['g_id', 'lat_lon'])
)

In [16]:
# Write df_with_clusters to a pickle file, path relative to the working directory.
df_with_clusters.to_pickle('ca_features_1.p')

In [9]:
# Summary statistics for the rows carrying cluster label 1.
# NOTE(review): the saved output below lists the feature columns plus
# num_names/num_addresses, yet the clustering cell leaves only `cluster` in
# `grouped`. Together with the out-of-order execution count, this suggests
# the output came from an earlier state of `grouped` -- confirm with
# Restart Kernel -> Run All.
grouped.loc[grouped['cluster'].eq(1)].describe()

Unnamed: 0,yelp,rubmaps,HT_bi_0_count,HT_bi_1_count,HT_bi_2_count,HT_tri_0_count,HT_tri_1_count,HT_tri_2_count,HT_uni_0_count,HT_uni_1_count,...,sex_bi_2_count,sex_tri_0_count,sex_tri_1_count,sex_tri_2_count,sex_uni_0_count,sex_uni_1_count,sex_uni_2_count,num_names,num_addresses,cluster
count,2277.0,2277.0,2277.0,2277.0,2277.0,2277.0,2277.0,2277.0,2277.0,2277.0,...,2277.0,2277.0,2277.0,2277.0,2277.0,2277.0,2277.0,2277.0,2277.0,2277.0
mean,2.208169,40.942029,4.874835,0.208608,0.145806,1.635485,0.002635,0.002635,29.657444,2.90953,...,1.216513,0.522178,0.598595,0.43083,21.070707,9.346509,6.496267,1.106719,1.103645,1.0
std,8.022194,31.586091,1.885255,0.432587,0.377062,1.037405,0.051276,0.051276,12.140051,1.460879,...,1.240475,0.614746,0.730071,0.852051,8.412363,4.879101,3.567773,0.43114,0.375864,0.0
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,20.0,4.0,0.0,0.0,1.0,0.0,0.0,21.0,2.0,...,0.0,0.0,0.0,0.0,15.0,6.0,4.0,1.0,1.0,1.0
50%,0.0,35.0,4.0,0.0,0.0,1.0,0.0,0.0,27.0,3.0,...,1.0,0.0,0.0,0.0,19.0,8.0,6.0,1.0,1.0,1.0
75%,0.0,56.0,6.0,0.0,0.0,2.0,0.0,0.0,35.0,4.0,...,2.0,1.0,1.0,1.0,25.0,11.0,8.0,1.0,1.0,1.0
max,120.0,358.0,17.0,4.0,3.0,10.0,1.0,1.0,110.0,17.0,...,12.0,4.0,5.0,9.0,79.0,55.0,28.0,7.0,8.0,1.0
