# Generate stores & customers for the demo

## Set up the environment

### Install packages

In [1]:
# Install the third-party packages that the imports cell of this notebook loads
# (faker, pymorton, v3io, v3io_frames, boto3). Versions are not pinned.
!pip install faker
!pip install pymorton
!pip install v3io
!pip install v3io_frames
!pip install boto3

Collecting faker
[?25l  Downloading https://files.pythonhosted.org/packages/52/1a/930431923062857520bae512101a648ef528cd327583fda38d9e76fab5ce/Faker-1.0.7-py2.py3-none-any.whl (874kB)
[K    100% |████████████████████████████████| 880kB 19.5MB/s 
Collecting text-unidecode==1.2 (from faker)
[?25l  Downloading https://files.pythonhosted.org/packages/79/42/d717cc2b4520fb09e45b344b1b0b4e81aa672001dd128c180fabc655c341/text_unidecode-1.2-py2.py3-none-any.whl (77kB)
[K    100% |████████████████████████████████| 81kB 22.5MB/s 
[?25hInstalling collected packages: text-unidecode, faker
Successfully installed faker text-unidecode
Collecting pymorton
  Downloading https://files.pythonhosted.org/packages/c6/8d/906ba6d4266d7696547b8b70e08423975243c7339fe1ccf4bdbc42478394/pymorton-1.0.5-py2.py3-none-any.whl
Installing collected packages: pymorton
Successfully installed pymorton-1.0.5
Collecting v3io
  Downloading https://files.pythonhosted.org/packages/bc/ef/9339c0420d559c32fe9016345c8993122690a9

### Imports

In [1]:
import logging
import os
import pickle
import random
import itertools
import boto3
import botocore

# Demo
import pandas as pd
import pymorton as pm
from faker import Faker
from faker.providers import BaseProvider

# DB
import v3io as v3c
import v3io_frames as v3f

### Define environment variables

In [2]:
# Stores
NUMBER_OF_STORES = 50      # number of stores to generate
STORES_TABLE = 'stores'    # table the stores are written to
ACCURACY = 20              # total number of digits in a generated store location key
BASE_ACCURACY = 14         # leading digits kept from the base location before random digits are added

# Customers
NUMBER_OF_CUSTOMERS = 5000       # number of customers to generate
CUSTOMERS_TABLE = 'customers'    # table the customers are written to

# The data paths are exported through os.environ so that the `!` shell cells can
# reference them as ${CUSTOMERS_DATA_FILE_DIR} / ${CUSTOMERS_DATA_FILE_PATH}.
# The Python-side name is deliberately different from the environment variable name.
customers_data_dir = os.path.join(os.getcwd(), 'data')
os.environ['CUSTOMERS_DATA_FILE_DIR'] = customers_data_dir
os.environ['CUSTOMERS_DATA_FILE_PATH'] = os.path.join(customers_data_dir, 'data_train_n.csv')

# Create the directory from Python rather than via an unquoted shell expansion,
# so paths containing spaces work; exist_ok keeps the cell re-runnable.
os.makedirs(customers_data_dir, exist_ok=True)

In [3]:
# Download the customers seed CSV (following redirects) into the file exported as
# CUSTOMERS_DATA_FILE_PATH. This re-downloads the whole file every time the cell runs.
!curl -L "iguazio-sample-data.s3.amazonaws.com/data_train_n.csv" > ${CUSTOMERS_DATA_FILE_PATH}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  399M  100  399M    0     0  16.4M      0  0:00:24  0:00:24 --:--:-- 15.5M


In [4]:
# List the data directory to check that the downloaded file is there.
!ls -lah ${CUSTOMERS_DATA_FILE_DIR}

total 0
drwxr-xr-x 2 50 nogroup    0 Jul 14 10:39 .ipynb_checkpoints
-rw-r--r-- 1 50 nogroup 400M Jul 14 10:53 data_train_n.csv


### Open DB client

In [6]:
# v3io_frames client used by every table delete/write/read/create call below.
# NOTE(review): 'framesd:8081' is presumably the in-cluster frames service address — confirm for your environment.
client = v3f.Client('framesd:8081')

## Generate Stores

### Location provider

In [7]:
class LocationProvider(BaseProvider):
    '''
    Faker provider that generates random location keys underneath a base location.

    Location keys are handled as QuadTree geohash strings, see
        @{http://tech.taskrabbit.com/blog/2015/06/09/elasticsearch-geohash-vs-geotree/}
        @{http://mapzen.github.io/leaflet-spatial-prefix-tree/}
        @{http://blog.notdot.net/2009/11/Damn-Cool-Algorithms-Spatial-indexing-with-Quadtrees-and-Hilbert-Curves}
    '''
    def location(self, location_base: str, base_acc=10, acc=20):
        '''
        Build a location key that shares its leading digits with `location_base`.

        :param location_base: key of the base location
        :param base_acc: number of leading characters of `location_base` to keep
        :param acc: target length of the returned key
        :return: the kept prefix followed by random digits (each 0-3) up to `acc`
                 characters; just the prefix when it is already `acc` long or longer
        '''
        prefix = location_base[:base_acc]
        digits_to_add = acc - len(prefix)
        # One random quadrant digit per missing position (none when digits_to_add <= 0)
        suffix = ''.join(str(random.randint(0, 3)) for _ in range(digits_to_add))
        return prefix + suffix

Add the location provider as a faker provider

In [8]:
# Faker instance shared by the generation cells. Registering LocationProvider
# is what makes `faker.location(...)` available next to the built-in providers.
faker = Faker()
faker.add_provider(LocationProvider)

### Define scenario environment (locations)

In [9]:
# Reference points in London, each a (latitude, longitude) pair
london_city = (51.514926, -0.089580)
london_city_south = (51.501593, -0.094942)
london_west = (51.512309, -0.128966)
london_south_west = (51.495022, -0.162268)

london_coordinates = [
    london_city,
    london_city_south,
    london_west,
    london_south_west,
]

# Interleaved (quadtree) key of every reference point, in the same order
london_coordinates_qt = [pm.interleave_latlng(*coordinate) for coordinate in london_coordinates]
london_coordinates_qt

['03311311313011311011000321002320',
 '03311311311233323013031101320003',
 '03311311313010023000032330133111',
 '03311311311222300331010333220231']

### Define Store class

In [10]:
class Store(object):
    '''
    A store positioned at an interleaved (quadtree) location key.

    Attributes:
        store_id: identifier given by the caller, or a random one when None was passed
        location: the location key exactly as passed in
        lat, long: coordinates decoded from `location` with pymorton
        store_name: display name of the store
        customers: customer counter, starts at 0
    '''
    def __init__(self, store_id: int, name: str, location: str):
        '''
        :param store_id: store identifier; pass None to get a random id between 1 and
                         the NUMBER_OF_STORES environment variable (500 when it is unset)
        :param name: store name
        :param location: interleaved location key, decoded into latitude/longitude
        '''
        if store_id is None:
            # os.getenv returns a string when the variable is set, hence the int() cast
            store_id = random.randint(1, int(os.getenv('NUMBER_OF_STORES', 500)))
        self.store_id = store_id
        self.location = location
        (self.lat, self.long) = pm.deinterleave_latlng(location)
        self.store_name = name
        self.customers = 0

    def json(self):
        '''Return the store as a plain dict, ready to be used as a table record.'''
        json_store = {
            'id': self.store_id,
            'name': self.store_name,
            'location': self.location,
            'latitude': self.lat,
            'longitude': self.long,
            'customers': self.customers
        }
        return json_store

### Create stores

In [11]:
def create_stores(faker, number_of_stores: int=500):
    '''
    Generate stores at unique random locations around the London reference points.

    Stores are generated in batches until `number_of_stores` distinct locations exist.
    Uniqueness is tracked across all batches, so a location repeated in a later batch
    does not count towards the total. The first store generated for a location is kept.

    :param faker: Faker instance with LocationProvider registered (provides
                  msisdn(), company() and location())
    :param number_of_stores: number of stores (distinct locations) to produce
    :return: DataFrame of store records indexed by 'location'. It can hold fewer rows
             than requested only when no new location could be found after many
             consecutive batches (a warning is logged); it is empty for a
             non-positive `number_of_stores`.
    '''
    # Safety valve: stop when the location space appears exhausted instead of looping forever
    max_stalled_batches = 100

    stores_by_location = {}
    stalled_batches = 0
    while len(stores_by_location) < number_of_stores and stalled_batches < max_stalled_batches:
        found_before = len(stores_by_location)

        # Generate exactly as many candidates as are still missing
        for _ in range(number_of_stores - found_before):
            store = Store(faker.msisdn(),
                          faker.company(),
                          faker.location(random.choice(london_coordinates_qt), BASE_ACCURACY, ACCURACY)).json()
            # setdefault keeps the first store seen for a location
            stores_by_location.setdefault(store['location'], store)

        if len(stores_by_location) == found_before:
            stalled_batches += 1
        else:
            stalled_batches = 0

    if len(stores_by_location) < number_of_stores:
        logging.warning('Only %d unique store locations could be generated out of %d requested',
                        len(stores_by_location), number_of_stores)

    if not stores_by_location:
        # Nothing generated: return an empty frame that still carries the 'location' index
        return pd.DataFrame(index=pd.Index([], name='location'))

    stores = pd.DataFrame.from_records(list(stores_by_location.values()))
    return stores.set_index(['location'])

In [12]:
# Remove the stores table so the upload below starts from a clean table.
# NOTE(review): if_missing=1 presumably makes a missing table a no-op rather than an error — confirm against the v3io_frames docs.
client.delete('kv', STORES_TABLE, if_missing=1)

In [18]:
# Generate the stores (locations are random, so every run gives different rows) and preview them.
stores = create_stores(faker, NUMBER_OF_STORES)
stores.head(5)

Unnamed: 0_level_0,customers,id,latitude,longitude,name
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3311311313011312333,0,3843250429733,51.520042,-0.090981,Jones Ltd
3311311311233131323,0,7560016407222,51.48983,-0.088921,"Kennedy, Burns and Wolfe"
3311311313011301210,0,4389150341295,51.516266,-0.095444,Lang LLC
3311311311222212003,0,718232756938,51.49601,-0.169945,"Williams, Hobbs and Gardner"
3311311311233203201,0,6940166429223,51.49704,-0.106773,Payne PLC


### Upload stores to DB

In [19]:
# Write the generated stores to the key-value table; the table itself is cleared in
# the delete cell that precedes store generation, so no delete is issued here.
client.write('kv', STORES_TABLE, stores)

In [20]:
# Read the table back to check what was stored.
client.read('kv', STORES_TABLE).head(5)

Unnamed: 0_level_0,name,customers,id,latitude,longitude
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3311311311222022002,Clark PLC,0.0,5222948143156,51.490517,-0.175781
3311311313010220203,Howard Group,0.0,4098356409669,51.522102,-0.131493
3311311313011202321,Marshall-White,0.0,6065424030842,51.519699,-0.108147
3311311313011130232,"Johnson, Gonzales and Lee",0.0,5913255917578,51.511803,-0.092697
3311311313011223103,Steele PLC,0.0,6309532093786,51.523476,-0.1054


## Generate customers

In [21]:
class Customer(object):
    '''
    Plain record for a generated customer.

    Attributes:
        id: customer identifier
        location: location key of the customer ('0' by default)
        products: the customer's product data (empty string by default)
    '''
    def __init__(self, id=0, location='0', products=""):
        self.id, self.location, self.products = id, location, products

    def json(self):
        '''Return the customer as a dict with the keys id, location and products.'''
        return dict(id=self.id, location=self.location, products=self.products)

In [22]:
def create_customers(customers_data: pd.DataFrame, number_of_customers: int):
    '''
    Create synthetic customers whose product data is copied from random seed users.

    For every new customer a `user_id` is drawn at random from the seed data; users
    with more rows are proportionally more likely to be drawn. All rows of that user,
    without the 'user_id' and 'Unnamed: 0' columns, are serialised to JSON and stored
    as the customer's `products`.

    :param customers_data: seed data with a 'user_id' column and an 'Unnamed: 0' column
    :param number_of_customers: number of customers to create; ids run from 0 upwards
    :return: DataFrame with one row per customer and the columns id, location and
             products. `id` stays a regular column and the frame keeps its default
             index. The frame is empty when `number_of_customers` is 0.
    :raises KeyError: if 'user_id' or 'Unnamed: 0' is missing from `customers_data`
    '''
    available_customer = list(customers_data.user_id)

    # Row positions of every user, computed in a single pass, so that each pick is a
    # dictionary lookup instead of a comparison against the whole user_id column.
    rows_by_user = customers_data.groupby('user_id').indices

    customers = []
    for i in range(number_of_customers):
        chosen_user = random.choice(available_customer)
        # .get: an id that groupby leaves out (NaN) selects no rows, exactly as an
        # equality filter on the column would
        user_rows = customers_data.iloc[rows_by_user.get(chosen_user, [])]
        products = user_rows.drop(['user_id', 'Unnamed: 0'], axis=1).to_json()
        customers.append(Customer(id=i, products=products).json())

    return pd.DataFrame.from_records(customers)

In [26]:
# Load customers seed data from the CSV stored at CUSTOMERS_DATA_FILE_PATH
customers_data = pd.read_csv(os.environ['CUSTOMERS_DATA_FILE_PATH'])

# Create actual customers (seed users are drawn at random, so rows differ between runs)
customers = create_customers(customers_data, NUMBER_OF_CUSTOMERS)
customers.head(5)

Unnamed: 0,id,location,products
0,0,0,"{""user_product_reordered_ratio"":{""324119"":1.0,..."
1,1,0,"{""user_product_reordered_ratio"":{""299728"":1.0,..."
2,2,0,"{""user_product_reordered_ratio"":{""296781"":1.0,..."
3,3,0,"{""user_product_reordered_ratio"":{""480528"":1.0}..."
4,4,0,"{""user_product_reordered_ratio"":{""9664"":1.0,""3..."


### Upload customers to DB

In [None]:
# Replace the customers table: delete whatever is there, then write the generated frame.
# NOTE(review): if_missing=1 presumably makes a missing table a no-op rather than an error — confirm against the v3io_frames docs.
client.delete('kv', CUSTOMERS_TABLE, if_missing=1)
client.write('kv', CUSTOMERS_TABLE, dfs=customers)

In [45]:
# Read the table back once and reuse the frame for both the size report and the
# preview, instead of fetching the whole table from the DB twice.
customers_table = client.read('kv', CUSTOMERS_TABLE)
print(f'Customers table sized: {customers_table.shape}')
customers_table.head(5)

Customers table sized: (2990, 3)


Unnamed: 0_level_0,products,id,location
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1926,"{""user_product_reordered_ratio"":{""1246"":1.0,""2...",1926.0,0
1517,"{""user_product_reordered_ratio"":{""178158"":1.0,...",1517.0,0
2183,"{""user_product_reordered_ratio"":{""9944"":1.0,""3...",2183.0,0
1650,"{""user_product_reordered_ratio"":{""67635"":1.0,""...",1650.0,0
1472,"{""user_product_reordered_ratio"":{""5811"":1.0,""2...",1472.0,0


## Generate predictions

In [15]:
# Recreate the 'predictions' TSDB table: delete any existing one, then create it.
# NOTE(review): 'rate': '1/s' is presumably the expected ingestion rate of the table, and
# if_missing / if_exists presumably suppress errors for absent / existing tables — confirm.
client.delete('tsdb', 'predictions', if_missing=1)
client.create('tsdb', 'predictions', attrs={'rate': '1/s'}, if_exists=1)