In [22]:
from collections import defaultdict, Counter
from geopy.distance import vincenty
from itertools import  product, groupby, combinations
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import json
from time import time
import sys, traceback
import math

# Data

In [2]:
# Raw place types that are folded into a coarser amenity category before mining.
# Written as (category, member types) rows and flattened into {raw type: category}.
groups = {
    rawType: category
    for category, rawTypes in (
        ('religious_centers', ('hindu_temple', 'mosque', 'place_of_worship', 'synagogue', 'church')),
        ('restaurant', ('meal_delivery', 'food', 'meal_takeaway')),
        ('finance', ('bank',)),
        ('construction_contractor', ('roofing_contractor', 'electrician', 'plumber', 'painter',
                                     'general_contractor')),
        ('doctor', ('health',)),
        ('hotel_and_lodging', ('lodging',)),
    )
    for rawType in rawTypes
}

In [3]:
# Read the amenity catalogue; each entry carries the amenity name under 'Label'.
with open('chicago/amenities_list.json') as amenitiesFile:
    amenitiesOldInfo = json.load(amenitiesFile)

# Ordered amenity labels and the reverse lookup label -> position in that order.
amenitiesList = [entry['Label'] for entry in amenitiesOldInfo]
amenitiesIndices = {label: position for position, label in enumerate(amenitiesList)}
amenities = amenitiesList

In [6]:
import os
# Load every per-city result file into `cities`, keyed by the file name up to the
# first dot. Files that cannot be read or do not have the expected five columns
# are reported and skipped (best effort).
resultDir = 'Boston/revised/data/processing_data/result/'
cities = {}
for file in os.listdir(resultDir):
    try:
        data = pd.read_csv(resultDir + file)
        data.columns = ['latitude','longitude','intensity','type','clusterId']

        # Fold fine-grained place types into their coarser category.
        data['type'] = data['type'].replace(groups)

        # Keep only known amenities. Work on an explicit copy so that adding `id`
        # never writes into a view of `data` (avoids SettingWithCopyWarning).
        filteredData = data[data['type'].isin(amenitiesList)].copy()
        # The original row index doubles as a stable location identifier.
        filteredData['id'] = filteredData.index
        filteredData = filteredData.drop('intensity', axis = 1)
        cities[file.split('.')[0]] = filteredData
    except Exception as e:
        # Name the offending file so a skipped city can be diagnosed.
        print('Skipping', file, ':', e)

# Mining

In [8]:
def initializeContainers(minLat, minLng, maxLat, maxLng, cellSize=2.0, neighborRadius=0.05):
    """Build the grid and the empty bucket table for one city's bounding box.

    Parameters
    ----------
    minLat, minLng, maxLat, maxLng : float
        Bounding box of the city, in degrees.
    cellSize : float, optional
        Upper bound for the edge of a grid cell, in kilometres (default 2.0).
    neighborRadius : float, optional
        Neighbourhood radius in kilometres (default 0.05) that is converted to
        degrees for the two returned epsilons.

    Returns
    -------
    cityCoords : numpy.ndarray, shape (latCells, lngCells, 2)
        (latitude, longitude) of the minimum corner of every cell.
    hashTable : numpy.ndarray of object, shape (latCells, lngCells, len(amenitiesList))
        Bucket table, initialised with zeros (falsy = empty bucket).
    latitudeEps : float
        ``neighborRadius`` expressed in degrees of latitude (derived from the
        latitude span and the north-south extent of the box).
    longitudeEps : float
        ``neighborRadius`` expressed in degrees of longitude (derived from the
        longitude span and the east-west extent of the box).

    NOTE: the third element is the *latitude* epsilon and the fourth the
    *longitude* epsilon; the values and their positions are unchanged.
    """
    # Physical extent of the bounding box along each axis.
    northSouthKm = vincenty((minLat, minLng), (maxLat, minLng)).kilometers
    eastWestKm = vincenty((minLat, minLng), (minLat, maxLng)).kilometers

    # degrees-per-kilometre along each axis, scaled to the neighbourhood radius.
    latitudeEps = (maxLat - minLat)/northSouthKm*neighborRadius
    longitudeEps = (maxLng - minLng)/eastWestKm*neighborRadius

    # One extra cell per axis, so every cell is strictly smaller than cellSize.
    latCells = int(math.ceil(northSouthKm/cellSize) + 1)
    lngCells = int(math.ceil(eastWestKm/cellSize) + 1)

    latStep = (maxLat - minLat)/latCells
    lngStep = (maxLng - minLng)/lngCells

    cityCoords = np.array([(minLat + latId*latStep, minLng + lngId*lngStep)
                           for latId in range(latCells)
                           for lngId in range(lngCells)]).reshape([latCells, lngCells, 2])
    hashTable = np.zeros([latCells, lngCells, len(amenitiesList)], dtype=object)
    return cityCoords, hashTable, latitudeEps, longitudeEps

In [12]:
def hashData(hashTable, data, minLat, minLng, latCells, lngCells, latStep, lngStep, latEps, lngEps):
    """Insert every row of ``data`` into the grid buckets of ``hashTable``.

    A location is stored in its own cell and additionally in any cell reached by
    shifting it by ``latEps`` / ``lngEps`` in either direction, so that locations
    close to a cell border are visible from the adjacent cell as well. Each bucket
    ``hashTable[latId, lngId, amenityIndex]`` is a list of ``(lat, lng, id)``
    tuples sorted by latitude.

    Parameters
    ----------
    hashTable : numpy.ndarray of object
        Bucket table to fill (modified in place and returned).
    data : pandas.DataFrame
        Rows with ``latitude``, ``longitude``, ``type`` and ``id`` columns.
    minLat, minLng : float
        Origin of the grid.
    latCells, lngCells : int
        Number of cells along each axis.
    latStep, lngStep : float
        Cell size in degrees along each axis.
    latEps, lngEps : float
        Border tolerance in degrees along each axis.

    Returns
    -------
    (hashTable, elapsedSeconds)
    """
    start = time()

    def touchedCells(value, origin, step, eps, cellCount):
        # Cell of the value itself plus the cells of value +/- eps, clamped to the
        # valid index range. Clamping replaces the previous behaviour of relying on
        # a blanket `except` to discard out-of-range indices (e.g. for the location
        # sitting exactly on the maximum border).
        rawIds = (int((value - origin)/step),
                  int((value + eps - origin)/step),
                  int((value - eps - origin)/step))
        return {min(max(cellId, 0), cellCount - 1) for cellId in rawIds}

    touchedBuckets = set()
    for _, location in data.iterrows():
        amenityId = amenitiesIndices.get(location.type)
        if amenityId is None:
            # Unknown amenity type: skipped, as before, but now explicitly.
            continue

        lat, lng = location.latitude, location.longitude
        entry = (lat, lng, location.id)
        for latId in touchedCells(lat, minLat, latStep, latEps, latCells):
            for lngId in touchedCells(lng, minLng, lngStep, lngEps, lngCells):
                if hashTable[latId, lngId, amenityId]:
                    hashTable[latId, lngId, amenityId].append(entry)
                else:
                    # Empty buckets hold a falsy placeholder; start a new list.
                    hashTable[latId, lngId, amenityId] = [entry]
                touchedBuckets.add((latId, lngId, amenityId))

    # Sort each modified bucket once by latitude instead of after every insert.
    # The sort is stable, so the final order is the same as with repeated sorting.
    for bucketKey in touchedBuckets:
        hashTable[bucketKey].sort(key = lambda x : x[0])

    print('Super fast hasing took', time()-start, 'to execute')
    return hashTable, time()-start

In [7]:
def minePatterns(hashTable, cellsCoords, latCells, lngCells, latStep, lngStep, lngEps):
    """Collect, for every location, its neighbours of later amenity types (star instances).

    For each cell and each amenity type, every location whose coordinates fall
    inside the cell is treated as a star centre. Its neighbours are the locations
    in the same cell bucket table that belong to an amenity type listed *after*
    the centre's type in ``amenitiesList`` and lie within 50 metres.

    Parameters
    ----------
    hashTable : numpy.ndarray of object
        Bucket table; each non-empty bucket is a list of ``(lat, lng, id)``
        tuples sorted by latitude.
    cellsCoords : numpy.ndarray
        Minimum (lat, lng) corner of every cell.
    latCells, lngCells : int
        Grid dimensions.
    latStep, lngStep : float
        Cell size in degrees.
    lngEps : float
        Tolerance that is compared against *latitude* differences to prune
        candidates before the exact distance is computed (the parameter name is
        kept for compatibility with existing callers).

    Returns
    -------
    (elapsedSeconds, neighborsByAmenity) where ``neighborsByAmenity`` maps an
    amenity type to a list of ``((centerId, centerType), [(neighborId, neighborType), ...])``.
    """
    neighborsByAmenity = {amenityType: [] for amenityType in amenitiesList}
    start = time()
    for lat in range(latCells):
        for lng in range(lngCells):
            currentCell = hashTable[lat, lng]
            cellCoords = cellsCoords[lat, lng]
            for amenityType in amenitiesList:
                amenityId = amenitiesIndices[amenityType]
                # No locations of this type in the cell.
                if not currentCell[amenityId]:
                    continue

                # Later amenity types that actually occur in this cell; the cell
                # content does not change while mining, so this is computed once.
                neighborTypes = [candidateType
                                 for candidateType in amenitiesList[amenityId + 1:]
                                 if currentCell[amenitiesIndices[candidateType]]
                                 and candidateType != amenityType]

                amenityNeighbors = []
                for location in currentCell[amenityId]:
                    # Only locations that truly lie inside this cell act as
                    # centres; border copies from adjacent cells are skipped here.
                    insideCell = (location[0] >= cellCoords[0]
                                  and location[0] < cellCoords[0] + latStep
                                  and location[1] >= cellCoords[1]
                                  and location[1] < cellCoords[1] + lngStep)
                    if not insideCell:
                        continue

                    locationNeighbors = []
                    for neighborAmenityType in neighborTypes:
                        for neighborLocation in currentCell[amenitiesIndices[neighborAmenityType]]:
                            latDiff = neighborLocation[0] - location[0]
                            if latDiff < -lngEps:
                                # Too far south; later entries have larger latitudes.
                                continue
                            if latDiff > lngEps:
                                # Buckets are sorted by latitude, so every remaining
                                # entry is too far north as well. (The previous
                                # condition repeated the `continue` test and could
                                # never trigger.)
                                break
                            if vincenty(location[:2], neighborLocation[:2]).meters <= 50:
                                locationNeighbors.append((neighborLocation[2],
                                                          neighborAmenityType))
                    if len(locationNeighbors):
                        amenityNeighbors.append(((location[2], amenityType),
                                                 locationNeighbors))
                neighborsByAmenity[amenityType].extend(amenityNeighbors)
    return time()-start, neighborsByAmenity

In [17]:
def mineCityStarPatterns(cityName):
    """Hash one city's locations into the grid and extract its star neighbourhoods.

    Parameters
    ----------
    cityName : str
        Key into the module-level ``cities`` dictionary.

    Returns
    -------
    (cityName, rowCount, neighborsByAmenity, elapsedSeconds) where ``rowCount``
    is the number of rows after dropping duplicate (latitude, longitude, type)
    triples and ``neighborsByAmenity`` maps every amenity to a list of
    ``(set of neighbour types, ((centerId, centerType), [(neighborId, neighborType), ...]))``.
    """
    data = cities[cityName].drop_duplicates(['latitude','longitude','type'])
    print(cityName)
    print(len(data))

    start = time()

    minLat, minLng = data.latitude.min(), data.longitude.min()
    maxLat, maxLng = data.latitude.max(), data.longitude.max()

    # initializeContainers derives its third return value from the latitude span
    # and the north-south distance, and its fourth from the longitude span and the
    # east-west distance. They are therefore the latitude and the longitude
    # epsilon, in that order; unpacking them the other way round made hashData
    # pad latitudes with the longitude tolerance and vice versa.
    cellsCoords, hashTable, latEps, lngEps = initializeContainers(minLat, minLng, maxLat, maxLng)

    latCells, lngCells, _ = cellsCoords.shape
    latStep = (maxLat - minLat)/latCells
    lngStep = (maxLng - minLng)/lngCells

    hashTable, hasingTime = hashData(hashTable, data, minLat, minLng, latCells, lngCells,
                                     latStep, lngStep, latEps, lngEps)
    # minePatterns compares its last argument with latitude differences, so it
    # receives the latitude epsilon (the same numeric value as before this fix).
    materTime, neighborsByAmenity = minePatterns(hashTable, cellsCoords, latCells, lngCells,
                                                 latStep, lngStep, latEps)

    # Attach to every star instance the set of neighbour types it contains, so
    # candidate patterns can be screened without walking the neighbour list.
    neighborsByAmenity = {amenity:[(set([location[1] for location in instance[1]]), instance)
                                    for instance in neighborsByAmenity[amenity]]
                                    for amenity in amenities}

    print('Mining algorithm took', time() - start, 'to execute')
    return cityName, len(data), neighborsByAmenity, time() - start

In [18]:
city, dataLen, neighborsByAmenity, materTime = mineCityStarPatterns('Providence')

# Count the instances of every amenity in the city that was just mined. Binding
# `data` here matters: otherwise it is whatever frame an earlier cell left behind
# (the last CSV read by the loading loop), not this city.
# NOTE(review): the miner drops duplicate (latitude, longitude, type) rows; the
# counts below use the un-deduplicated frame, as the batch loop does - confirm
# which denominator is intended.
data = cities[city]
typeCounts = data['type'].value_counts()
amenitiesCounts = defaultdict(int)
for amenity in amenities:
    amenitiesCounts[amenity] = int(typeCounts.get(amenity, 0))

Providence
8104
Super fast hasing took 1.3787920475006104 to execute
Mining algorithm took 35.83417296409607 to execute


In [21]:
# Minimum prevalence a pattern must reach to be kept.
th = 0.05

def generateCandidates(colocations, amenities, patternsPrevs, k0):
    """Extend every pattern of size ``k0`` by one amenity and keep viable candidates.

    A pattern is extended only with amenities that come after its last member in
    ``amenities``. A candidate is kept when each of its ``k0``-sized sub-patterns
    is present in ``colocations`` and, for ``k0 > 1``, has a recorded prevalence
    of at least ``th``.

    Parameters
    ----------
    colocations : iterable of str
        Dot-joined patterns of size ``k0`` (a list or the keys of a dict).
    amenities : list of str
        Ordered amenity labels.
    patternsPrevs : dict
        ``patternsPrevs[size][pattern]`` is the prevalence of ``pattern``.
    k0 : int
        Size of the patterns in ``colocations``.

    Returns
    -------
    list of str
        Dot-joined candidate patterns of size ``k0 + 1``.
    """
    # Position of every amenity in the ordered list. Computed from the argument,
    # so the function no longer depends on an `amenitiesIndex` global that this
    # notebook never defines.
    amenityPosition = {amenity: position for position, amenity in enumerate(amenities)}
    # Constant-time membership test (colocations is a list for size 1).
    knownColocations = set(colocations)

    candidates = []
    for colocation in tqdm(colocations):
        members = colocation.split('.')
        for amenity in amenities[amenityPosition[members[-1]] + 1:]:
            isCandidate = True
            for subPattern in combinations(members + [amenity], k0):
                subP = '.'.join(subPattern)
                if subP not in knownColocations:
                    isCandidate = False
                    break
                if k0 > 1 and patternsPrevs[k0][subP] < th:
                    isCandidate = False
                    break
            if isCandidate:
                candidates.append(colocation + '.' + amenity)
    return candidates

def genSecondPatterns(colocationSplit, instances):
    """Turn star instances into the instances of a size-2 pattern.

    Every neighbour whose type equals the second member of ``colocationSplit``
    yields one ``(centerId, neighborId)`` pair.

    Parameters
    ----------
    colocationSplit : sequence of str
        The two amenity types of the pattern.
    instances : iterable
        Star instances shaped ``((centerId, centerType), [(neighborId, neighborType), ...])``.

    Returns
    -------
    list of tuple
    """
    wantedType = colocationSplit[1]
    return [(star[0][0], neighbor[0])
            for star in instances
            for neighbor in star[1]
            if neighbor[1] == wantedType]

def genMorePatterns(colocationSplit, instances, patternsInstances):
    """Turn star instances into clique instances of a pattern with three or more members.

    For every star, one neighbour per required type is combined; a combination is
    accepted when it is itself a known instance of the pattern formed by the
    non-centre members (``colocationSplit[1:]``).

    Parameters
    ----------
    colocationSplit : sequence of str
        Amenity types of the pattern; the first one is the star centre.
    instances : list
        Star instances shaped ``((centerId, centerType), [(neighborId, neighborType), ...])``.
        Neighbours of the same type are expected to be adjacent in the list,
        because they are grouped with ``itertools.groupby``.
    patternsInstances : dict
        ``patternsInstances[size][pattern]`` is the list of instances (id tuples)
        of an already mined pattern.

    Returns
    -------
    list of tuple
        ``(centerId, neighborId, ...)`` for every accepted combination.
    """
    if not instances:
        return []

    requiredTypes = colocationSplit[1:]
    # The reference pattern has one member less than this one. Its size is taken
    # from the pattern itself rather than from a global loop counter `k`.
    referenceStars = patternsInstances[len(colocationSplit) - 1]['.'.join(requiredTypes)]
    # Set for constant-time lookups; tuple() also covers instances stored as lists.
    referenceSet = set(tuple(star) for star in referenceStars)

    cliqueInstances = []
    for instance in instances:
        center = (instance[0][0],)
        potentialStarNeighbors = [location for location in instance[1]
                                  if location[1] in requiredTypes]
        grouping = [[location[0] for location in group]
                    for key, group in groupby(potentialStarNeighbors, lambda x: x[1])]
        cliqueInstances.extend(center + star
                               for star in product(*grouping)
                               if star in referenceSet)
    return cliqueInstances

def checkStar(instanceTypes, colocation):
    """Return True when every non-centre member of ``colocation`` is in ``instanceTypes``.

    Parameters
    ----------
    instanceTypes : container of str
        Neighbour types present in a star instance.
    colocation : sequence of str
        Pattern members; the first one (the centre) is not checked.
    """
    return all(amenity in instanceTypes for amenity in colocation[1:])

def calculcatePatternPrevalence(colocationSplit, instances, amenitiesCounts):
    """Compute the prevalence of a pattern from its instances.

    For every amenity type of the pattern, the number of distinct locations of
    that type taking part in at least one instance is divided by
    ``amenitiesCounts[type]``; the smallest of these ratios is returned.

    Parameters
    ----------
    colocationSplit : sequence of str
        Amenity types of the pattern, aligned with the ids inside each instance.
    instances : iterable of sequence
        Pattern instances, each a sequence of location ids.
    amenitiesCounts : mapping
        Total number of locations per amenity type.

    Returns
    -------
    float
        The minimum ratio, or ``0.0`` when there are no instances (previously
        this case raised ``ValueError`` from ``min`` on an empty list).
    """
    # Distinct (location id, amenity type) pairs that take part in any instance.
    participants = {(instanceID, instanceType)
                    for instance in instances
                    for instanceID, instanceType in zip(instance, colocationSplit)}
    if not participants:
        return 0.0

    participantsPerType = Counter(instanceType for _, instanceType in participants)
    return min(float(count)/amenitiesCounts[instanceType]
               for instanceType, count in participantsPerType.items())

In [24]:
city, dataLen, neighborsByAmenity, materTime = mineCityStarPatterns('Providence')

# Count the instances of every amenity in the city that was just mined. `data`
# is bound explicitly; otherwise it is whatever frame an earlier cell left behind.
# NOTE(review): the miner drops duplicate (latitude, longitude, type) rows while
# these counts use the un-deduplicated frame - confirm the intended denominator.
data = cities[city]
typeCounts = data['type'].value_counts()
amenitiesCounts = defaultdict(int)
for amenity in amenities:
    amenitiesCounts[amenity] = int(typeCounts.get(amenity, 0))

start = time()
# Level 1: every single amenity is a pattern with prevalence 1.
patterns = {1: list(amenities)}
patternsInstances = {}
patternsPrevs = {1: {amenity:1 for amenity in amenities}}

# Level-wise mining: grow patterns by one amenity until no level survives.
k = 2
while len(patterns[k-1]):

    print('Length:', k)
    print('Generating')
    candidates = generateCandidates(patterns[k-1], amenities, patternsPrevs, k-1)
    if not len(candidates):
        break
    print('Candiates: ', len(candidates))

    # Star instances of the centre amenity that contain all other pattern members.
    starInstances = {}
    print('Getting Stars')
    for colocation in tqdm(candidates):
        colocationSplit = colocation.split('.')
        starInstances[colocation] = (colocationSplit,
                                     [instance
                                      for types, instance in neighborsByAmenity[colocationSplit[0]]
                                      if checkStar(types, colocationSplit)])

    patternsInstances[k] = {}
    patternsPrevs[k] = {}
    print('Checking Stars')
    for colocation, (colocationSplit, instances) in tqdm(starInstances.items()):
        if not len(instances):
            continue
        # Size-2 stars are cliques already; larger ones are checked against the
        # instances of the previous level.
        if k == 2:
            cliqueInstances = genSecondPatterns(colocationSplit, instances)
        else:
            cliqueInstances = genMorePatterns(colocationSplit, instances, patternsInstances)
        if not len(cliqueInstances):
            continue
        patternPrev = calculcatePatternPrevalence(colocationSplit, cliqueInstances, amenitiesCounts)
        patternsPrevs[k][colocation] = patternPrev
        if patternPrev >= th:
            patternsInstances[k][colocation] = cliqueInstances

    patterns[k] = {clique:len(instances) for clique, instances in patternsInstances[k].items()}
    print('Patterns: ', len(patterns[k]))
    print('')
    k += 1

miningTime = time()-start
print('Mining algorithm took',miningTime , 'to execute')

Providence
8104
Super fast hasing took 1.3688080310821533 to execute
Mining algorithm took 37.01934003829956 to execute
Length: 2
Generating


100%|██████████| 74/74 [00:00<00:00, 9884.04it/s]


Candiates:  2701
Getting Stars


100%|██████████| 2701/2701 [00:00<00:00, 21495.17it/s]


Checking Stars


100%|██████████| 2701/2701 [00:00<00:00, 41530.20it/s]


Patterns:  44

Length: 3
Generating


100%|██████████| 44/44 [00:00<00:00, 19927.59it/s]


Candiates:  16
Getting Stars


100%|██████████| 16/16 [00:00<00:00, 5718.21it/s]


Checking Stars


100%|██████████| 16/16 [00:00<00:00, 420.29it/s]


Patterns:  0

Mining algorithm took 0.2683429718017578 to execute


In [None]:
# Run the complete pipeline for the loaded cities and collect one record per city:
# [city, row count, materialization time, mining time, total time,
#  patterns, patternsInstances, patternsPrevs].
allResults = []
for city, data in cities.items():
    try:
        city, dataLen, neighborsByAmenity, materTime = mineCityStarPatterns(city)
        amenitiesCounts = defaultdict(lambda: 0)
        for amenity in amenities:
            amenitiesCounts[amenity] = len(data[data.type == amenity])

        start = time()
        # Level 1: every single amenity is a pattern with prevalence 1.
        patterns = {1: list(amenities)}
        patternsInstances = {}
        patternsPrevs = {1: {amenity:1 for amenity in amenities}}

        # Level-wise mining: grow patterns by one amenity until no level survives.
        k = 2
        while len(patterns[k-1]):

            print('Length:', k)
            print('Generating')
            candidates = generateCandidates(patterns[k-1], amenities, patternsPrevs, k-1)
            if not len(candidates):
                break
            print('Candiates: ', len(candidates))

            # Star instances of the centre amenity containing all other members.
            starInstances = {}
            print('Getting Stars')
            for colocation in tqdm(candidates):
                colocationSplit = colocation.split('.')
                starInstances[colocation] = (colocationSplit,
                                             [instance
                                              for types, instance in neighborsByAmenity[colocationSplit[0]]
                                              if checkStar(types, colocationSplit)])

            patternsInstances[k] = {}
            patternsPrevs[k] = {}
            print('Checking Stars')
            for colocation, (colocationSplit, instances) in tqdm(starInstances.items()):
                if not len(instances):
                    continue
                if k == 2:
                    cliqueInstances = genSecondPatterns(colocationSplit, instances)
                else:
                    cliqueInstances = genMorePatterns(colocationSplit, instances, patternsInstances)
                if not len(cliqueInstances):
                    continue
                patternPrev = calculcatePatternPrevalence(colocationSplit, cliqueInstances, amenitiesCounts)
                patternsPrevs[k][colocation] = patternPrev
                if patternPrev >= th:
                    patternsInstances[k][colocation] = cliqueInstances

            patterns[k] = {clique:len(instances) for clique, instances in patternsInstances[k].items()}
            print('Patterns: ', len(patterns[k]))
            print('')
            k += 1

        miningTime = time()-start
        print('Mining algorithm took',miningTime , 'to execute')
        allResults.append([city, dataLen, materTime, miningTime, materTime+miningTime, patterns, patternsInstances, patternsPrevs])
        # NOTE(review): this stops after the first city that succeeds, so
        # `allResults` never holds more than one record. It looks like a leftover
        # from debugging - remove it to process every city.
        break
    except Exception as e:
        # Keep going with the next city, but say which one failed and where.
        print('Failed to mine', city, ':', e)
        traceback.print_exc()

In [31]:
# Load the mining results that were saved to disk earlier.
with open('Patterns/patternsAll0-05.json') as resultsFile:
    resultsHuge = json.load(resultsFile)

In [25]:
#Analysis

In [32]:
# Number of entries in the loaded results list.
len(resultsHuge)

37

In [35]:
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode()

In [41]:
# Scatter of result field 2 against data set size (field 1, in thousands),
# one labelled marker per city (field 0).
# NOTE(review): field 2 is presumably the materialization time, matching the
# record layout built by the batch loop - confirm for the saved file.
scatter = go.Scatter(
    mode='markers+text',
    text=[cityResult[0] for cityResult in resultsHuge],
    x=[cityResult[1]/1000.0 for cityResult in resultsHuge],
    y=[cityResult[2] for cityResult in resultsHuge],
)
layout = go.Layout(
    title='Materialization time',
    hovermode='closest',
    xaxis={'title': 'Locations, Thousands'},
    yaxis={'title': 'Time, Seconds'},
)
fig = go.Figure(data=[scatter], layout=layout)
iplot(fig)

In [42]:
# Scatter of result field 3 against data set size (field 1, in thousands),
# one labelled marker per city (field 0).
# NOTE(review): field 3 is presumably the mining time, matching the record
# layout built by the batch loop - confirm for the saved file.
scatter = go.Scatter(
    mode='markers+text',
    text=[cityResult[0] for cityResult in resultsHuge],
    x=[cityResult[1]/1000.0 for cityResult in resultsHuge],
    y=[cityResult[3] for cityResult in resultsHuge],
)
layout = go.Layout(
    title='Mining time',
    hovermode='closest',
    xaxis={'title': 'Locations, Thousands'},
    yaxis={'title': 'Time, Seconds'},
)
fig = go.Figure(data=[scatter], layout=layout)
iplot(fig)

# Recommendation

In [49]:
city = 'Providence'
# Saved record of the selected city; fields 5, 6 and 7 hold the patterns, their
# instances and their prevalences.
cityResult = [result for result in resultsHuge if result[0] == city][0]
patterns = cityResult[5]
patternsInstances = cityResult[6]
patternsPrevs = cityResult[7]

# Locations of the selected city. Looked up explicitly instead of relying on
# whatever frame an earlier cell left in the global `data`.
cityData = cities[city]

# One frame per pattern instance (clique), tagged with its pattern, pattern size
# and a running instance id. The frames are concatenated once at the end:
# DataFrame.append inside the loops was quadratic and no longer exists in
# pandas 2.x.
cliqueFrames = []
instanceId = 0
stopRequested = False

for size, patternss in patternsInstances.items():
    for pattern, instances in tqdm(patternss.items()):
        for instance in instances:
            instanceId += 1
            clique = cityData[cityData.id.isin(instance)]
            cliqueFrames.append(clique.assign(pattern=pattern, size=size, instanceId=instanceId))
            # Creating a file named stopIt.json aborts the run. Everything
            # collected so far stays consistent, because the instance id is
            # stored with each frame instead of in a separate list.
            if os.path.exists('stopIt.json'):
                stopRequested = True
                break
        if stopRequested:
            break
    if stopRequested:
        break

if cliqueFrames:
    cliques = pd.concat(cliqueFrames)
else:
    cliques = pd.DataFrame(columns=list(cityData.columns) + ['pattern', 'size', 'instanceId'])

100%|██████████| 98/98 [00:37<00:00,  2.82it/s]
100%|██████████| 7/7 [00:07<00:00,  1.18s/it]
100%|██████████| 380/380 [00:43<00:00,  6.50it/s]


In [53]:
def getPotentialPatterns(amenity, patterns, patternsPrevs):
    """Find the patterns of size three or more that start with ``amenity``.

    Parameters
    ----------
    amenity : str
        Amenity type that has to be the first member of a pattern.
    patterns : dict
        ``patterns[size]`` is an iterable of dot-joined patterns; ``size`` may be
        an int or a numeric string.
    patternsPrevs : dict
        ``patternsPrevs[size][pattern]`` is the prevalence of ``pattern``.

    Returns
    -------
    dict
        Maps the remainder of each matching pattern (its members without the
        leading ``amenity``, dot-joined) to the prevalence of the full pattern.
    """
    possiblePatterns = {}
    for size, patternTypes in patterns.items():
        if int(size) <= 2:
            continue
        for pattern in patternTypes:
            # Compare the whole first member. A plain string prefix test would
            # also accept e.g. 'parking.x.y' when asked for 'park'.
            leadingAmenity, _, remainder = pattern.partition('.')
            if leadingAmenity == amenity:
                possiblePatterns[remainder] = patternsPrevs[size][pattern]
    return possiblePatterns

def scorePotentialPattens(potantialPatterns, clusterCliques):
    """Select the clique rows of the given patterns and attach a score to them.

    For every pattern that occurs in ``clusterCliques`` the score of its rows is
    the pattern's prevalence divided by the number of distinct ``instanceId``
    values the pattern has in ``clusterCliques``.

    Parameters
    ----------
    potantialPatterns : dict
        Maps a pattern string to its prevalence.
    clusterCliques : pandas.DataFrame
        Clique rows with at least the columns ``pattern`` and ``instanceId``.

    Returns
    -------
    pandas.DataFrame
        The matching rows with an additional ``score`` column; empty (but with a
        ``score`` column) when no pattern occurs.
    """
    scoredFrames = []
    for pattern, prevalence in potantialPatterns.items():
        print(pattern)
        patternsInCluster = clusterCliques.loc[clusterCliques.pattern == pattern]
        patternsCount = patternsInCluster['instanceId'].nunique()
        if patternsCount:
            # The score is stored with the rows it belongs to, so rows and scores
            # cannot get out of step.
            scoredFrames.append(patternsInCluster.assign(score=prevalence/patternsCount))

    if not scoredFrames:
        return clusterCliques.iloc[:0].assign(score=pd.Series(dtype=float))
    # Single concatenation; DataFrame.append no longer exists in pandas 2.x.
    return pd.concat(scoredFrames)

In [65]:
# Score the clique rows of one cluster against the patterns that start with the
# chosen amenity, best score first.
amenity = 'cafe'
clusterId = 62
clusterCliques = cliques.loc[cliques['clusterId'] == clusterId]
potantialPatterns = getPotentialPatterns(amenity, patterns, patternsPrevs)
potantialPatternsInCluster = scorePotentialPattens(
    potantialPatterns, clusterCliques
).sort_values(by='score', ascending=False)

clothing_store.shoe_store
finance.restaurant
finance.lawyer.real_estate_agency
clothing_store.electronics_store
finance.real_estate_agency
clothing_store.restaurant
lawyer.real_estate_agency
clothing_store.jewelry_store
finance.lawyer
