In [122]:
from collections import defaultdict, Counter
from geopy.distance import vincenty
from itertools import  product, groupby, combinations
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import json
from time import time
import sys, traceback
import math
import geopy as gp

# Data

In [2]:
groups = {'hindu_temple':'religious_centers',
    'mosque':'religious_centers',
    'place_of_worship' : 'religious_centers',
    'synagogue' : 'religious_centers',
    'church' : 'religious_centers',
    'meal_delivery' : 'restaurant',
    'food' : 'restaurant',
    'meal_takeaway' : 'restaurant',
    'bank' : 'finance',
    'roofing_contractor' : 'construction_contractor',
    'electrician' : 'construction_contractor',
    'plumber' : 'construction_contractor',
    'painter' : 'construction_contractor',
    'general_contractor' : 'construction_contractor',
    'health' : 'doctor',
    'lodging' : 'hotel_and_lodging'
}

In [3]:
with open('chicago/amenities_list.json') as outfile:
    amenitiesOldInfo = json.load(outfile)
    
amenitiesList = [amenity['Label'] for amenity in amenitiesOldInfo]
amenitiesIndices = {v: k for k, v in zip(range(len(amenitiesList)), amenitiesList)}
amenities = amenitiesList

In [261]:
import os
cities = {}
for file in os.listdir('Boston/revised/data/processing_data/result/'):
    try:
        data = pd.read_csv('Boston/revised/data/processing_data/result/' + file)
        data.columns = ['latitude','longitude','intensity','type','clusterId']

        for old, new in groups.items():
            data.loc[data.type == old,'type'] = new
        filteredData = data[data.type.isin(amenitiesList)]
        filteredData['id'] = filteredData.index
        filteredData.drop('intensity', axis = 1, inplace = True)
        filteredData = filteredData.drop_duplicates(['latitude','longitude','type'])
        cities[file.split('.')[0]] = filteredData
    except Exception as e:
        print(e)

In [265]:
sum([len(item) for item in cities.values()])

1077462

In [262]:
total = []
for city, data in cities.items():
    total+=[len(data)]

# Mining

In [8]:
def initializeContainers(minLat, minLng, maxLat, maxLng):
    cityCoords = []
    length = vincenty((minLat, minLng), (maxLat, minLng)).kilometers
    width = vincenty((minLat, minLng), (minLat, maxLng)).kilometers
    cellSize = 2.0
    
    lngEps = (maxLat - minLat)/length*0.05
    latEps = (maxLng - minLng)/width*0.05

    latCells = int(math.ceil(length/cellSize) + 1)
    lngCells = int(math.ceil(width/cellSize) + 1)

    latStep = (maxLat - minLat)/latCells
    lngStep = (maxLng - minLng)/lngCells

    for lat in range(latCells):
        for lng in range(lngCells):
            cityCoords.append(((minLat + lat*latStep), (minLng + lng*lngStep)))
    cityCoords = np.array(cityCoords).reshape([latCells,lngCells, 2])
    hashTable = np.zeros([latCells, lngCells, len(amenitiesList)], dtype=object)
    return cityCoords, hashTable, lngEps, latEps

In [161]:
def asRadians(degrees):
    return degrees * math.pi / 180


def getXYpos(relativeNullPoint, p):
    """ Calculates X and Y distances in meters.
    """
    deltaLatitude = p.latitude - relativeNullPoint.latitude
    deltaLongitude = p.longitude - relativeNullPoint.longitude
    latitudeCircumference = 40075160 * math.cos(asRadians(relativeNullPoint.latitude))
    resultX = deltaLongitude * latitudeCircumference / 360
    resultY = deltaLatitude * 40008000 / 360
    return resultX, resultY

In [162]:
def hashData(hashTable, data, minLat, minLng, latCells, lngCells, latStep, lngStep, latEps, lngEps):
    start = time()
    relPoint = gp.Point(minLat, minLng)
    print(relPoint)
    for index, location in data.iterrows():
        lat, lng = location.latitude, location.longitude
        currentLatId, currentLngId = int((lat-minLat)/latStep), int((lng-minLng)/lngStep)

        upperLatId, bottomLatId = int((lat+latEps-minLat)/latStep), int((lat-latEps-minLat)/latStep)
        upperLngId, bottomLngId = int((lng+lngEps-minLng)/lngStep), int((lng-lngEps-minLng)/lngStep)
        otherLatId = upperLatId if upperLatId!=currentLatId else bottomLatId if bottomLatId!=currentLatId else currentLatId
        otherLatId = min(otherLatId, latCells-1)
        otherLngId = upperLngId if upperLngId!=currentLngId else bottomLngId if bottomLngId!=currentLngId else currentLngId
        otherLngId = min(otherLngId, lngCells-1)

        for latId in set([currentLatId, otherLatId]):
            for lngId in set([currentLngId, otherLngId]):
                try:
                    x, y = getXYpos(relPoint, gp.Point(lat, lng))
                    if hashTable[latId, lngId, amenitiesIndices[location.type]]:
                        hashTable[latId, lngId, amenitiesIndices[location.type]].append((lat, lng, x, y, location.id))
                        hashTable[latId, lngId, amenitiesIndices[location.type]].sort(key = lambda x : x[0])
                    else:
                        hashTable[latId, lngId, amenitiesIndices[location.type]] = [(lat, lng, x, y, location.id)]
                except Exception as e:
                    print(e)

    print('Super fast hasing took', time()-start, 'to execute')
    return hashTable, time()-start

In [189]:
def minePatterns(hashTable, cellsCoords, latCells, lngCells, latStep, lngStep, lngEps):
    neighborsByAmenity = {}
    start = time()
    print(lngEps)
    for lat in range(latCells):
        for lng in range(lngCells):
            currentCell = hashTable[lat, lng]
            for amenityType in amenitiesList:
                amenityNeighbors = []
                if amenityType not in neighborsByAmenity:
                    neighborsByAmenity[amenityType] = []
                #No locations of this type
                if not currentCell[amenitiesIndices[amenityType]]:
                    continue

                cellCoords = cellsCoords[lat, lng]
                #Iterate all the objects in this cell and type
                for location in filter(lambda x: 
                                           x[0] >= cellCoords[0] 
                                       and x[0] < cellCoords[0] + latStep
                                       and x[1] >= cellCoords[1]
                                       and x[1] < cellCoords[1] + lngStep,
                                       currentCell[amenitiesIndices[amenityType]]):
#                     print(currentCell[amenitiesIndices[amenityType]])
                    locationNeighbors = []
                    for neighborAmenityType in filter(lambda x: currentCell[amenitiesIndices[x]]
                                                      and x != amenityType,
                                                      amenitiesList[amenitiesIndices[amenityType]+1:]):
                        for neighborLocation in currentCell[amenitiesIndices[neighborAmenityType]]:
                            if neighborLocation[2] -  location[2] < -50:
                                continue
                            if location[2] - neighborLocation[2] > 50:
                                break
                                
                            if dist(location, neighborLocation) <= 50:
                                locationNeighbors.append((neighborLocation[4],
                                                         neighborAmenityType))
                    if len(locationNeighbors):
                        amenityNeighbors.append(((location[4],
                                                 amenityType), locationNeighbors))
                neighborsByAmenity[amenityType].extend(amenityNeighbors)
    return time()-start, neighborsByAmenity

In [215]:
def dist(location, neighborLocation):
    return math.sqrt((location[2]-neighborLocation[2])**2 + (location[3]-neighborLocation[3])**2)
def dist2(location, neighborLocation):
    return math.sqrt((location[0]-neighborLocation[0])**2 + (location[1]-neighborLocation[1])**2)

In [171]:
def mineCityStarPatterns(cityName):
    data = cities[cityName]
    data = data.drop_duplicates(['latitude','longitude','type'])
    print(cityName)
    print(len(data))
    
    start = time()
    
    minLat, minLng, maxLat, maxLng =  data.latitude.min(), data.longitude.min(), data.latitude.max(), data.longitude.max()
    lowerLeft = minLat, minLng
    upperRight = maxLat, maxLng

    cellsCoords, hashTable, lngEps, latEps = initializeContainers(minLat, minLng, maxLat, maxLng)

    latCells, lngCells, _ = cellsCoords.shape
    latStep = (maxLat - minLat)/latCells
    lngStep = (maxLng - minLng)/lngCells

    hashTable, hasingTime = hashData(hashTable, data, minLat, minLng, latCells, lngCells, latStep, lngStep, latEps, lngEps)
    materTime, neighborsByAmenity = minePatterns(hashTable, cellsCoords, latCells, lngCells, latStep, lngStep, lngEps)
    
    neighborsByAmenity = {amenity:[(set([location[1] for location in instance[1]]), instance)
                                    for instance in neighborsByAmenity[amenity]]
                                    for amenity in amenities}
    
    print('Mining algorithm took', time() - start, 'to execute')
    return cityName, len(data), neighborsByAmenity, time() - start

In [168]:
neighborsByAmenity['cafe']

[]

In [220]:
city, dataLen, neighborsByAmenity, materTime = mineCityStarPatterns('Providence')

amenitiesCounts = defaultdict(lambda: 0)
for amenity in amenities:
    amenitiesCounts[amenity] = len(data[data.type == amenity])

Providence
8104
41 46m 20.0172s N, 71 28m 23.8652s W
index 6 is out of bounds for axis 1 with size 6
index 6 is out of bounds for axis 0 with size 6
Super fast hasing took 1.3940939903259277 to execute
0.000450167648435
Mining algorithm took 2.704102039337158 to execute


In [21]:
th = 0.05

def generateCandidates(colocations, amenities, patternsPrevs, k0):
    candidates = []
    for colocation in tqdm(colocations):
        for amenity in amenities[amenitiesIndex[colocation.split('.')[-1]] + 1:]:
            notPrevalent = False
            for subPattern in combinations(colocation.split('.') + [amenity], k0):
                subP = '.'.join(subPattern)
                if not subP in colocations:
                    notPrevalent = True
                    break
                if k0 > 1:
                    if patternsPrevs[k0][subP] < th:
                        notPrevalent = True
                        break
            if notPrevalent:
                continue
            candidates.append(colocation + '.' +amenity)
    return candidates

def genSecondPatterns(colocationSplit, instances):
    cliqueInstanes = []
    neighborType = colocationSplit[1]
    for starInstance in instances:
        center = starInstance[0]
        for location in starInstance[1]:
            if location[1] == neighborType:
                cliqueInstanes.append((center[0], location[0]))
    return cliqueInstanes

def genMorePatterns(colocationSplit, instances, patternsInstances):
    cliqueInstances = []
    for instance in instances:
        center = tuple([instance[0][0]])
        neighbors = instance[1]
        potentialStarNeighbors = [location for location in neighbors
                                  if location[1] in colocationSplit[1:]]
        grouping = [[location[0] for location in group]
                    for key, group in groupby(potentialStarNeighbors, lambda x: x[1])]
        referenceStars = patternsInstances[k-1]['.'.join(colocationSplit[1:])]
        cliqueInstances.extend([(center) + star for star in product(*grouping) if star in referenceStars])
    return cliqueInstances

def checkStar(instanceTypes, colocation):
    for amenity in colocation[1:]:
        if not amenity in instanceTypes:
            return False
    return True

def calculcatePatternPrevalence(colocationSplit, instances, amenitiesCounts):
    patternInstanceCounts = {}
    for instance in instances:
        for instanceID, instanceType in zip(instance, colocationSplit):
            patternInstanceCounts[(instanceID, instanceType)] = 1
    counts = Counter([amenity for (id, amenity), one in patternInstanceCounts.items()])
    ratios = [float(count)/amenitiesCounts[type] for type, count in counts.items()]
    return min(ratios)

In [190]:
city, dataLen, neighborsByAmenity, materTime = mineCityStarPatterns('Providence')
amenitiesCounts = defaultdict(lambda: 0)
for amenity in amenities:
    amenitiesCounts[amenity] = len(data[data.type == amenity])

start = time()
patterns = {1: [amenity for amenity in amenities]}
patternsInstances = {}
patternsPrevs = {1: {amenity:1 for amenity in amenities}}

k = 2
while len(patterns[k-1]):

    print('Length:', k)
    print('Generating')
    candidates = generateCandidates(patterns[k-1], amenities, patternsPrevs, k-1)
    if not len(candidates):
        break
    print('Candiates: ', len(candidates))
    starInstances = {}
    print('Getting Stars')
    for colocation in tqdm(candidates):
        colocationSplit = colocation.split('.')
        starInstances[colocation] = (colocationSplit, [instance for types, instance in neighborsByAmenity[colocationSplit[0]]
                                               if checkStar(types, colocationSplit)])

    cliques = {}
    patternsInstances[k] = {}
    patternsPrevs[k] = {}
    print('Checking Stars')
    for colocation, (colocationSplit, instances) in tqdm(starInstances.items()):
        if len(instances):
            if k == 2:
                cliqueInstances = genSecondPatterns(colocationSplit, instances)
                patternPrev = calculcatePatternPrevalence(colocationSplit, cliqueInstances, amenitiesCounts)
                patternsPrevs[k][colocation] = patternPrev
                if patternPrev >= th:
                    patternsInstances[k][colocation] = cliqueInstances

            else:
                cliqueInstances = genMorePatterns(colocationSplit, instances, patternsInstances)
                if len(cliqueInstances):
                    patternPrev = calculcatePatternPrevalence(colocationSplit, cliqueInstances, amenitiesCounts)
                    patternsPrevs[k][colocation] = patternPrev
                    if patternPrev >= th:
                        patternsInstances[k][colocation] = cliqueInstances
    patterns[k] = {clique:len(instances) for clique, instances in patternsInstances[k].items()}
    print('Patterns: ', len(patterns[k]))
    print('')
    k += 1

miningTime = time()-start
print('Mining algorithm took',miningTime , 'to execute')

Providence
8104
41 46m 20.0172s N, 71 28m 23.8652s W
index 6 is out of bounds for axis 1 with size 6
index 6 is out of bounds for axis 0 with size 6
Super fast hasing took 1.4431068897247314 to execute
0.000450167648435
Mining algorithm took 2.7834110260009766 to execute
Length: 2
Generating


100%|██████████| 74/74 [00:00<00:00, 10095.25it/s]


Candiates:  2701
Getting Stars


100%|██████████| 2701/2701 [00:00<00:00, 23732.97it/s]


Checking Stars


100%|██████████| 2701/2701 [00:00<00:00, 43641.18it/s]


Patterns:  292

Length: 3
Generating


100%|██████████| 292/292 [00:00<00:00, 17009.29it/s]


Candiates:  704
Getting Stars


100%|██████████| 704/704 [00:00<00:00, 11426.54it/s]


Checking Stars


100%|██████████| 704/704 [00:00<00:00, 2647.77it/s]


Patterns:  56

Length: 4
Generating


100%|██████████| 56/56 [00:00<00:00, 19211.60it/s]


Candiates:  10
Getting Stars


100%|██████████| 10/10 [00:00<00:00, 8111.20it/s]


Checking Stars


100%|██████████| 10/10 [00:00<00:00, 264.51it/s]


Patterns:  5

Length: 5
Generating


100%|██████████| 5/5 [00:00<00:00, 17549.39it/s]


Mining algorithm took 0.6224308013916016 to execute


In [None]:
data.apply(lambda x: getXYpos(relPoint, gp.Point(x.latitude, x.longitude)[0], axis = 1))

In [None]:
res = mineCityStarPatternsNaive('Providence')

In [218]:
def mineCityStarPatternsNaive(cityName):
    data = cities[cityName]
    data = data.drop_duplicates(['latitude','longitude','type'])
    print(cityName)
    print(len(data))
    
    minLat, minLng, maxLat, maxLng =  data.latitude.min(), data.longitude.min(), data.latitude.max(), data.longitude.max()
    cellsCoords, hashTable, lngEps, latEps = initializeContainers(minLat, minLng, maxLat, maxLng)
    relPoint = gp.Point(minLat, minLng)
    data['x'] = data.apply(lambda x: getXYpos(relPoint, gp.Point(x.latitude, x.longitude))[0], axis = 1)
    data['y'] = data.apply(lambda x: getXYpos(relPoint, gp.Point(x.latitude, x.longitude))[1], axis = 1)
    start = time()
    sortedAmenities = {}
    for amenityType, locations in data.groupby('type'):
        sortedAmenities[amenityType] = list(zip(locations.x.tolist(),
                                                locations.y.tolist(),
                                               locations.type.tolist()))
        sortedAmenities[amenityType].sort(key = lambda x : x[0])

    amenityLocations = [(key, value) for (key, value) in sortedAmenities.items()]
    amenityLocations.sort(key = lambda x: x[0])
    print('Sorting took', time()-start, 'to execute')

    neighborsByAmenity = {}
    start = time()
    for amenityTypeId, (amenityType, locations) in enumerate(amenityLocations):
        print(amenityTypeId, amenityType)
        amenityNeighbors = []
        for location in locations:
            locationNeighbors = []
            for (nextAmenityType, nextLocations) in amenityLocations[amenityTypeId+1:]:
                for neighborLocation in nextLocations:
                    if neighborLocation[0] -  location[0] < -50:
                        continue
                    if location[0] - neighborLocation[0] > 50:
                        break
                    if dist2(location, neighborLocation) <= 50:
                        locationNeighbors.append(neighborLocation)
            amenityNeighbors.append((location, locationNeighbors))
        neighborsByAmenity[amenityType] = amenityNeighbors
    print(time()-start, 'to execute')
    
    print('Mining algorithm took', time() - start, 'to execute')
    return cityName, len(data), neighborsByAmenity, time() - start

In [96]:
parsedCities = [(cityResult[0], cityResult[2]) for cityResult in resultsHuge]

In [101]:
parsedCities

[('San_Jose', 97.30674004554749),
 ('Birmingham', 37.33728098869324),
 ('Pittsburgh', 61.98917102813721),
 ('Washington', 113.27261400222778),
 ('Milwaukee', 55.38756799697876),
 ('San_Francisco', 204.99248504638672),
 ('Orlando', 65.90241193771362),
 ('Boston', 145.76894807815552),
 ('Portland', 174.08488607406616),
 ('Richmond', 30.8624210357666),
 ('Nashville', 75.48789095878601),
 ('Miami', 83.93346881866455),
 ('Austin', 104.4635260105133),
 ('Cincinnati', 48.10244083404541),
 ('Columbus', 72.25601005554199),
 ('Cleveland', 50.01270008087158),
 ('Detroit', 62.98502993583679),
 ('Lousville', 54.89523410797119),
 ('Memphis', 49.70860695838928),
 ('Oklahoma', 52.0308141708374),
 ('Riverside', 26.514925956726074),
 ('Buffalo', 20.34398102760315),
 ('San_Diego', 269.6417911052704),
 ('Jacksonville', 43.054044008255005),
 ('Baltimore', 85.42698097229004),
 ('Providence', 31.82670021057129),
 ('Chicago', 340.353355884552),
 ('Las_Vegas', 111.03325605392456),
 ('Dallas', 170.3950479030609

In [221]:
allResultsFast = []
for city, mTime in parsedCities:
    res = mineCityStarPatterns(city)
    allResultsFast.append(res)

San_Jose
35011
37 7m 27.4613s N, 122 2m 46.3477s W
index 21 is out of bounds for axis 1 with size 21
index 21 is out of bounds for axis 0 with size 21
index 21 is out of bounds for axis 0 with size 21
Super fast hasing took 5.808730125427246 to execute
0.000450518612304
Mining algorithm took 10.000575065612793 to execute
Birmingham
18703
33 23m 0.73824s N, 87 6m 9.9216s W
index 26 is out of bounds for axis 1 with size 26
index 18 is out of bounds for axis 0 with size 18
Super fast hasing took 3.2466790676116943 to execute
0.000450800036018
Mining algorithm took 5.065000057220459 to execute
Pittsburgh
17892
40 21m 39.3793s N, 80 5m 45.9888s W
index 11 is out of bounds for axis 1 with size 11
index 9 is out of bounds for axis 0 with size 9
Super fast hasing took 3.151474952697754 to execute
0.00045027629692
Mining algorithm took 5.85562801361084 to execute
Washington
24480
38 47m 30.4004s N, 77 7m 12.8039s W
index 13 is out of bounds for axis 0 with size 13
Super fast hasing took 4.19629

In [219]:
allResultsNaive = []
for city, mTime in parsedCities:
    if mTime < 200:
        res = mineCityStarPatternsNaive(city)
        allResultsNaive.append(res)

San_Jose
35011




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Sorting took 0.06441402435302734 to execute
0 accounting
1 airport
2 amusement_park
3 aquarium
4 art_gallery
5 atm
6 bakery
7 bar
8 beauty_salon
9 bicycle_store
10 book_store
11 bowling_alley
12 bus_station
13 cafe
14 car_dealer
15 car_rental
16 car_repair
17 car_wash
18 casino
19 cemetery
20 city_hall
21 clothing_store
22 construction_contractor
23 convenience_store
24 courthouse
25 dentist
26 department_store
27 doctor
28 electronics_store
29 embassy
30 finance
31 fire_station
32 florist
33 funeral_home
34 furniture_store
35 gas_station
36 grocery_or_supermarket
37 gym
38 hardware_store
39 home_goods_store
40 hospital
41 hotel_and_lodging
42 insurance_agency
43 jewelry_store
44 laundry
45 lawyer
46 library
47 liquor_store
48 local_government_office
49 locksmith
50 movie_theater
51 moving_company
52 museum
53 night_club
54 park
55 parking
56 pet_store
57 pharmacy
58 physiotherapist
59 police
60 post_office
61 real_estate_agency
62 religious_centers
63 restaurant
64 school
65 shoe_stor

In [None]:
allResults = []
for city, data in cities.items():
    try:
        city, dataLen, neighborsByAmenity, materTime = mineCityStarPatterns(city)
        amenitiesCounts = defaultdict(lambda: 0)
        for amenity in amenities:
            amenitiesCounts[amenity] = len(data[data.type == amenity])
        
        start = time()
        patterns = {1: [amenity for amenity in amenities]}
        patternsInstances = {}
        patternsPrevs = {1: {amenity:1 for amenity in amenities}}
        
        k = 2
        while len(patterns[k-1]):

            print('Length:', k)
            print('Generating')
            candidates = generateCandidates(patterns[k-1], amenities, patternsPrevs, k-1)
            if not len(candidates):
                break
            print('Candiates: ', len(candidates))
            starInstances = {}
            print('Getting Stars')
            for colocation in tqdm(candidates):
                colocationSplit = colocation.split('.')
                starInstances[colocation] = (colocationSplit, [instance for types, instance in neighborsByAmenity[colocationSplit[0]]
                                                       if checkStar(types, colocationSplit)])

            cliques = {}
            patternsInstances[k] = {}
            patternsPrevs[k] = {}
            print('Checking Stars')
            for colocation, (colocationSplit, instances) in tqdm(starInstances.items()):
                if len(instances):
                    if k == 2:
                        cliqueInstances = genSecondPatterns(colocationSplit, instances)
                        patternPrev = calculcatePatternPrevalence(colocationSplit, cliqueInstances, amenitiesCounts)
                        patternsPrevs[k][colocation] = patternPrev
                        if patternPrev >= th:
                            patternsInstances[k][colocation] = cliqueInstances

                    else:
                        cliqueInstances = genMorePatterns(colocationSplit, instances, patternsInstances)
                        if len(cliqueInstances):
                            patternPrev = calculcatePatternPrevalence(colocationSplit, cliqueInstances, amenitiesCounts)
                            patternsPrevs[k][colocation] = patternPrev
                            if patternPrev >= th:
                                patternsInstances[k][colocation] = cliqueInstances
            patterns[k] = {clique:len(instances) for clique, instances in patternsInstances[k].items()}
            print('Patterns: ', len(patterns[k]))
            print('')
            k += 1
    
        miningTime = time()-start
        print('Mining algorithm took',miningTime , 'to execute')
        allResults.append([city, dataLen, materTime, miningTime, materTime+miningTime, patterns, patternsInstances, patternsPrevs])
        break
    except Exception as e:
        print(e)

In [234]:
timeResults = [(res[0], res[1], res[3]) for res in allResultsFast]

In [235]:
with open('Patterns/allResultsFastMater.json','w') as outfile:
    json.dump(timeResults, outfile)

In [25]:
#Analysis

In [32]:
len(resultsHuge)

37

In [35]:
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode()

In [41]:
scatter = go.Scatter(
    x = [res[1]/1000.0 for res in resultsHuge],
    y = [res[2] for res in resultsHuge],
    mode = 'markers+text',
    text = [res[0] for res in resultsHuge]
)
layout = go.Layout(
    title='Materialization time',
    hovermode='closest',
    xaxis=dict(
        title='Locations, Thousands',
    ),
    yaxis=dict(
        title='Time, Seconds',
    ),
)
fig = go.Figure(data = [scatter], layout = layout)
iplot(fig)

In [42]:
scatter = go.Scatter(
    x = [res[1]/1000.0 for res in resultsHuge],
    y = [res[3] for res in resultsHuge],
    mode = 'markers+text',
    text = [res[0] for res in resultsHuge]
)
layout = go.Layout(
    title='Mining time',
    hovermode='closest',
    xaxis=dict(
        title='Locations, Thousands',
    ),
    yaxis=dict(
        title='Time, Seconds',
    ),
)
fig = go.Figure(data = [scatter], layout = layout)
iplot(fig)

In [284]:
len(allResultsFast2)

33

In [283]:
allResultsFast2 = [res for res in allResultsFast if res[0] in [resst[0] for resst in allResultsNaive]]

In [282]:
allResultsNaive[10][0]

'Miami'

In [278]:
joinRes[10][0]

'Miami'

In [250]:
joinRes = [res for res in resultsHuge if res[0] in [resst[0] for resst in allResultsNaive]]

In [290]:
scatter = go.Scatter(
    x = [res1[1]/1000.0 for res1, res2 in zip(allResultsFast2, joinRes)],
    y = [float(res1[3])/(res1[3]+res2[3]) for res1, res2 in zip(allResultsFast2, joinRes)],
    mode = 'markers',
    text = [res2[0] for res1, res2 in zip(allResultsFast2, joinRes)]
)
layout = go.Layout(
    title='Share of Materialization Time',
    hovermode='closest',
    xaxis=dict(
        title='Locations, Thousands',
    ),
    yaxis=dict(
        title='Time, Seconds',
    ),
)
fig = go.Figure(data = [scatter], layout = layout)
iplot(fig)

In [70]:
patternInstancesPerCity = [cityResult[6] for cityResult in resultsHuge]

In [77]:
def flattenElements(array):
    return [item for sublist in array for item in sublist]

In [81]:
patternTypesPerCity = [set(flattenElements([cityResult[k].keys()for k in cityResult.keys()])) for cityResult in patternInstancesPerCity]

In [315]:
bigger

[['finance.insurance_agency.lawyer.real_estate_agency.travel_agency', 1],
 ['dentist.finance.insurance_agency.lawyer.real_estate_agency', 1],
 ['accounting.insurance_agency.lawyer.parking.real_estate_agency', 1],
 ['accounting.finance.lawyer.real_estate_agency.travel_agency', 1],
 ['cafe.finance.insurance_agency.lawyer.real_estate_agency', 1],
 ['accounting.dentist.doctor.finance.lawyer', 1],
 ['accounting.finance.insurance_agency.lawyer.parking', 1],
 ['accounting.finance.lawyer.parking.real_estate_agency', 1],
 ['doctor.finance.insurance_agency.lawyer.real_estate_agency', 2],
 ['accounting.finance.insurance_agency.lawyer.travel_agency', 1],
 ['accounting.construction_contractor.finance.lawyer.real_estate_agency', 1],
 ['city_hall.courthouse.fire_station.local_government_office.police', 1],
 ['dentist.doctor.finance.insurance_agency.lawyer', 1],
 ['accounting.finance.insurance_agency.lawyer.real_estate_agency', 12],
 ['dentist.doctor.finance.lawyer.real_estate_agency', 1],
 ['finance.

In [314]:
bigger = []
for pattern, count in Counter(flattenElements(patternTypesPerCity)).items():
    if len(pattern.split('.')) > 4:
        bigger.append([pattern, count])
#         print(pattern, count)
#         break

In [None]:
13521

In [85]:
len(set.intersection(*patternTypesPerCity))

49

In [None]:
'accounting.insurance_agency.lawyer'

In [317]:
for pattern in set.intersection(*patternTypesPerCity):
    print(' '.join(pattern.split('.')))

beauty_salon clothing_store
clothing_store home_goods_store
construction_contractor restaurant
beauty_salon real_estate_agency
finance pharmacy
insurance_agency lawyer
dentist lawyer
accounting dentist
atm convenience_store
car_repair construction_contractor
insurance_agency real_estate_agency
beauty_salon restaurant
clothing_store shoe_store
beauty_salon finance
accounting doctor
beauty_salon lawyer
dentist real_estate_agency
beauty_salon doctor
accounting real_estate_agency
doctor lawyer
doctor insurance_agency
clothing_store restaurant
clothing_store jewelry_store
doctor restaurant
construction_contractor lawyer
construction_contractor finance
accounting insurance_agency
accounting restaurant
atm beauty_salon
accounting insurance_agency lawyer
accounting finance
accounting beauty_salon
dentist doctor
finance insurance_agency
beauty_salon insurance_agency
finance lawyer
lawyer real_estate_agency
finance real_estate_agency
atm restaurant
finance restaurant
atm finance
accounting lawye

# Recomendation

In [335]:
city = 'Boston'
patterns = [cityResult[5] for cityResult in resultsHuge if cityResult[0]==city][0]
patternsInstances = [cityResult[6] for cityResult in resultsHuge if cityResult[0]==city][0]
patternsPrevs = [cityResult[7] for cityResult in resultsHuge if cityResult[0]==city][0]
cliques = pd.DataFrame()
instanceId = 0
ids = []

for size, patternss in patternsInstances.items():
    cliquesSize = pd.DataFrame()
    for pattern, instances in tqdm(patternss.items()):
        cliquesPattern = pd.DataFrame()
        for instance in instances:
            instanceId += 1
            clique = cities[city][cities[city].id.isin(instance)]
            ids.extend([instanceId]*len(clique))
            cliquesPattern = cliquesPattern.append(clique)
        cliquesPattern['pattern'] = pattern
        cliquesSize = cliquesSize.append(cliquesPattern)
    cliquesSize['size'] = size
    cliques = cliques.append(cliquesSize)

cliques['instanceId'] = ids

100%|██████████| 227/227 [03:54<00:00,  1.07s/it]
100%|██████████| 22/22 [01:25<00:00,  6.50s/it]
100%|██████████| 516/516 [02:39<00:00,  1.70it/s]


In [53]:
def getPotentialPatterns(amenity, patterns, patternsPrevs):
    possiblePatterns = {}
    for size, patternTypes in patterns.items():
        if int(size) > 2:
            possiblePatterns.update({'.'.join(pattern.split('.')[1:]): patternsPrevs[size][pattern]
                                     for pattern in patternTypes if pattern.startswith(amenity)})
    return possiblePatterns

def scorePotentialPattens(potentialPatterns, clusterCliques):
    potentialPatternsInCluster = pd.DataFrame()
    scores = []
    for pattern, prevalence in potentialPatterns.items():
        print(pattern)
        patternsInCluster = clusterCliques.loc[clusterCliques.pattern == pattern]
        patternsCount = len(patternsInCluster.groupby('instanceId'))
        if patternsCount:
            scores.extend([prevalence/patternsCount]*len(patternsInCluster))
            potentialPatternsInCluster = potentialPatternsInCluster.append(patternsInCluster)
    potentialPatternsInCluster['score'] = scores
    return potentialPatternsInCluster

In [350]:
amenity = 'cafe'
clusterId = 67
clusterCliques = cliques[cliques.clusterId == clusterId]
potentialPatterns = getPotentialPatterns(amenity, patterns, patternsPrevs)
potentialPatternsInCluster = scorePotentialPattens(potentialPatterns, clusterCliques).sort_values(by='score', ascending=False)

clothing_store.lawyer
finance.insurance_agency.lawyer
finance.lawyer.real_estate_agency
clothing_store.finance
insurance_agency.lawyer
dentist.lawyer
finance.lawyer
insurance_agency.real_estate_agency
finance.real_estate_agency
finance.restaurant
real_estate_agency.restaurant
dentist.insurance_agency
lawyer.real_estate_agency
dentist.finance
clothing_store.restaurant
doctor.restaurant
clothing_store.jewelry_store
finance.insurance_agency


In [382]:
potentialPatterns

{'clothing_store.finance': 0.0811930405965203,
 'clothing_store.jewelry_store': 0.05508474576271186,
 'clothing_store.lawyer': 0.0614406779661017,
 'clothing_store.restaurant': 0.05235350624399616,
 'dentist.finance': 0.06213753106876554,
 'dentist.insurance_agency': 0.057203389830508475,
 'dentist.lawyer': 0.0614406779661017,
 'doctor.restaurant': 0.056074766355140186,
 'finance.insurance_agency': 0.09533898305084745,
 'finance.insurance_agency.lawyer': 0.07203389830508475,
 'finance.lawyer': 0.1440677966101695,
 'finance.lawyer.real_estate_agency': 0.06519453207150368,
 'finance.real_estate_agency': 0.11251314405888538,
 'finance.restaurant': 0.0893371757925072,
 'insurance_agency.lawyer': 0.1059322033898305,
 'insurance_agency.real_estate_agency': 0.058885383806519455,
 'lawyer.real_estate_agency': 0.08096740273396424,
 'real_estate_agency.restaurant': 0.074447646493756}

In [352]:
potentialPatternsInCluster.to_csv('Patterns/BostonPotentialBar.csv',index=None)

In [380]:
patternsInCluster = cliques.loc[cliques.pattern == 'cafe.clothing_store.finance']
patternsCount = len(patternsInCluster.groupby('instanceId'))

In [384]:
len(cliques.groupby('instanceId'))

258138

In [385]:
cliques.to_csv('Patterns/BostonAllCliques.csv', index = None)

In [376]:
patternsPrevs['3']['cafe.clothing_store.finance']

0.0811930405965203

In [368]:
potentialPatternsInCluster[8:11]

Unnamed: 0,latitude,longitude,type,clusterId,id,pattern,size,instanceId,score
9941,42.364275,-71.102047,clothing_store,67,9941,clothing_store.finance,2,216068,0.027064
9940,42.364143,-71.101981,finance,67,9940,clothing_store.finance,2,216068,0.027064
10027,42.366111,-71.10461,real_estate_agency,67,10027,finance.real_estate_agency,2,209011,0.014064


In [332]:
[cityResult[0] for cityResult in resultsHuge if cityResult[0]==city][0]

'Boston'

In [333]:
len(resultsHuge)

37

In [334]:
cities['Boston']

Unnamed: 0,latitude,longitude,type,clusterId,id
0,42.385209,-70.973779,real_estate_agency,0,0
1,42.385158,-70.974066,doctor,0,1
2,42.385296,-70.974064,doctor,0,2
3,42.385098,-70.973105,school,0,3
4,42.385177,-70.973779,school,0,4
5,42.384497,-70.973842,real_estate_agency,0,5
6,42.384521,-70.973152,bus_station,0,6
7,42.386828,-70.975855,real_estate_agency,0,7
8,42.386828,-70.975853,real_estate_agency,0,8
9,42.387088,-70.976300,moving_company,0,9
