# 1. Event Generator
  --------------------------------------------------------------------

Generate an event stream to simulate incoming data. In real-life situations, you will provide input data instead of this simulated data.

The output is a stream of events called `generated-stream`.

![Model deployment with streaming Real-time operational Pipeline](../../assets/images/model-deployment-with-streaming.png)

In [1]:
# We use the faker module to generate data. Please run this cell and then restart the notebook's kernel.
import sys
!{sys.executable} -m pip install faker



In [2]:
# Execute config.py in this notebook's namespace.
# NOTE(review): presumably this is what defines STREAM_CONFIGS, CONTAINER and
# V3IO_ACCESS_KEY, which later cells use without defining them — confirm.
%run config.py

In [3]:
from random import randint, random
import math
import v3io.dataplane
from faker import Faker
import uuid
from datetime import datetime, timedelta
import json



# Path of the 'generated-stream' stream, looked up in STREAM_CONFIGS
# (presumably provided by config.py, which an earlier cell runs — confirm).
STREAM_PATH =  STREAM_CONFIGS['generated-stream']['path']
# Shard count configured for the same stream.
SHARDS_COUNT = STREAM_CONFIGS['generated-stream']['shard_count']


def gen_postcode(is_churn):
    """Return a synthetic postcode whose remainder modulo 3 encodes the churn flag.

    For a churned user the postcode modulo 3 is 0 or 1; for a non-churned
    user it is 0 or 2. This plants information in the postcode that the
    ML model can learn.
    """
    # A multiple of 3 (10002..99999), so the offset alone decides the remainder.
    multiple_of_three = randint(3334, 33333) * 3
    coin = randint(0, 1)
    offset = coin if is_churn else coin * 2
    return multiple_of_three + offset

# event functions
def new_registration(fake, id, event_time, is_churn):
    """Build a 'registration' event for user `id` at `event_time`.

    Profile fields are drawn from `fake` (a Faker-style provider); the
    postcode comes from gen_postcode and therefore depends on `is_churn`.
    """
    # Common event header first, then the profile fields in a fixed order.
    event = {'user_id': id, 'event_type': 'registration', 'event_time': event_time}
    event['name'] = fake.name()
    event['date_of_birth'] = fake.date()
    event['street_address'] = fake.street_address()
    event['city'] = fake.city()
    event['country'] = fake.country()
    event['postcode'] = gen_postcode(is_churn)
    event['affiliate_url'] = fake.image_url()
    event['campaign'] = fake.ean8()
    return event

def new_purchase(fake, id, event_time):
    """Build a 'purchase' event for user `id` at `event_time`.

    The amount is drawn with fake.randomize_nb_elements(number=50).
    """
    event = {'user_id': id, 'event_type': 'purchase', 'event_time': event_time}
    event['amount'] = fake.randomize_nb_elements(number=50)
    return event

def new_bet(fake, id, event_time):
    """Build a 'bet' event for user `id` at `event_time`.

    The bet amount is drawn with fake.randomize_nb_elements(number=10).
    """
    event = {'user_id': id, 'event_type': 'bet', 'event_time': event_time}
    event['bet_amount'] = fake.randomize_nb_elements(number=10)
    return event
    
def new_win(fake, id, event_time):
    """Build a 'win' event for user `id` at `event_time`.

    The win amount is drawn with fake.randomize_nb_elements(number=200).
    """
    event = {'user_id': id, 'event_type': 'win', 'event_time': event_time}
    event['win_amount'] = fake.randomize_nb_elements(number=200)
    return event

def gen_event_date(is_churn, prev_event_date=None):
    """Return the timestamp for a user's next event.

    With no previous event, the first event is placed 48 to 96 hours before
    now. Otherwise the next event normally follows the previous one by 5 to
    100 seconds. A non-churned user whose previous event is more than 30
    hours old jumps ahead by 15 to 24 hours with probability 0.005.
    """
    if prev_event_date is None:
        # First event for this user.
        return datetime.now() - timedelta(hours=randint(48, 96))

    # Conditions are evaluated left to right, so the random draw only happens
    # when a jump is possible and the user is not churned.
    jump_to_next_day = (
        prev_event_date + timedelta(hours=30) < datetime.now()
        and not is_churn
        and randint(1, 1000) <= 5
    )
    if jump_to_next_day:
        return prev_event_date + timedelta(hours=randint(15, 24))
    return prev_event_date + timedelta(seconds=randint(5, 100))
        
def generate_events(fake, user_ids, events_dist, num_events, is_churn):
    """Generate a registration event followed by activity events for each user.

    Args:
        fake: Faker-style provider, passed through to the event generators.
        user_ids: iterable of user identifiers.
        events_dist: sequence of dicts, each with a 'probability' and a
            'generator' callable taking (fake, user_id, event_time).
        num_events: number of random draws made per user after registration.
        is_churn: churn flag; stored as the registration event's 'label' and
            forwarded to the registration and timestamp generators.

    Returns:
        A list of event dicts, grouped per user in generation order. A draw
        that exceeds the total of all probabilities produces no event.
    """
    # The cumulative thresholds are the same for every draw, so build them once.
    thresholds = []
    cumulative = 0
    for event_dist in events_dist:
        cumulative += event_dist['probability']
        thresholds.append((cumulative, event_dist['generator']))

    events = []
    for user_id in user_ids:
        # Every user starts with a registration event carrying the label.
        event_time = gen_event_date(is_churn)
        reg_event = new_registration(fake, user_id, event_time, is_churn)
        reg_event['label'] = int(is_churn)
        events.append(reg_event)
        for _ in range(num_events):
            # Pick the first event type whose cumulative threshold covers the draw.
            rand = random()
            for threshold, generator in thresholds:
                if rand <= threshold:
                    event_time = gen_event_date(is_churn, event_time)
                    events.append(generator(fake, user_id, event_time))
                    break
    return events


# Two user groups: group 1 holds 70% of the users (the churn users), group 2 the rest.
NUM_USERS_GROUP1 = 1400
NUM_USERS_GROUP2 = 600
NUM_USERS = NUM_USERS_GROUP1 + NUM_USERS_GROUP2

# Number of event draws made for each user after registration.
EVENTS_PER_USER = 1000

# Event-type mix per group: each entry pairs a probability with the
# function that builds that event type.
GROUP1_EVENTS_DIST = [
    {'probability': 0.1, 'generator': new_purchase},
    {'probability': 0.89, 'generator': new_bet},
    {'probability': 0.01, 'generator': new_win},
]

GROUP2_EVENTS_DIST = [
    {'probability': 0.1, 'generator': new_purchase},
    {'probability': 0.85, 'generator': new_bet},
    {'probability': 0.05, 'generator': new_win},
]


## Create V3IO Client

In [4]:
# Data-plane client; a later cell uses it to put records into the stream.
v3io_client = v3io.dataplane.Client(
    endpoint='http://v3io-webapi:8081',
    access_key=V3IO_ACCESS_KEY,
)

## Generate Events

In [5]:
fake = Faker()

# One random UUID string per user, produced lazily for each group.
group1_user_ids = (str(uuid.uuid4()) for _ in range(NUM_USERS_GROUP1))
group2_user_ids = (str(uuid.uuid4()) for _ in range(NUM_USERS_GROUP2))

# Group 1 is generated as churned users, group 2 as non-churned users.
group1_events = generate_events(
    fake,
    group1_user_ids,
    events_dist=GROUP1_EVENTS_DIST,
    num_events=EVENTS_PER_USER,
    is_churn=True,
)
group2_events = generate_events(
    fake,
    group2_user_ids,
    events_dist=GROUP2_EVENTS_DIST,
    num_events=EVENTS_PER_USER,
    is_churn=False,
)

# Report the total and preview the events right after the first registration.
print(f'Events generated: {len(group1_events)+len(group2_events)}')
print(f'Events preview: {group1_events[1:5]}')

Events generated: 2002000
Events preview: [{'user_id': '5c67790a-29e2-455f-b67c-a8f67da54c2c', 'event_type': 'bet', 'event_time': '2020-08-06 21:42:29.770807', 'bet_amount': 11}, {'user_id': '5c67790a-29e2-455f-b67c-a8f67da54c2c', 'event_type': 'bet', 'event_time': '2020-08-06 21:44:08.770807', 'bet_amount': 11}, {'user_id': '5c67790a-29e2-455f-b67c-a8f67da54c2c', 'event_type': 'bet', 'event_time': '2020-08-06 21:44:42.770807', 'bet_amount': 12}, {'user_id': '5c67790a-29e2-455f-b67c-a8f67da54c2c', 'event_type': 'bet', 'event_time': '2020-08-06 21:45:49.770807', 'bet_amount': 10}]


## Write generated events to V3IO Stream

#### Transform the event to stream records

In [6]:
# Merge both groups into one list and order it by event time.
events = [*group1_events, *group2_events]
events.sort(key=lambda record: record.get('event_time'))

In [7]:
# Wrap each event as a stream record with its JSON text under 'data'.
# default=str stringifies any value json cannot encode natively.
records = [
    {'data': json.dumps(item, default=str)}
    for item in events
]

#### Ingest in small batches to V3IO Stream

In [8]:
# Send the records to the stream in fixed-size batches and tally the outcome.
batch_size = 1000
streamed_records = 0
failed_records = 0
for i in range(0, len(records), batch_size):
    resp = v3io_client.put_records(container=CONTAINER, path=STREAM_PATH, records=records[i:i+batch_size])
    # Parse the response body once per batch instead of once per counter.
    result = json.loads(resp.body)
    batch_failed = result['FailedRecordCount']
    # Per the PutRecords reference, 'Records' has one result entry per submitted
    # record, failed ones included, so subtract the failures to count successes.
    streamed_records += len(result['Records']) - batch_failed
    failed_records += batch_failed
print(f'Successfully streamed {streamed_records}, failed to stream {failed_records}')

2020-08-09 10:44:03,931 [info] Disconnected while attempting to send. Recreating connection: {'e': <class 'BrokenPipeError'>}
2020-08-09 10:44:03,950 [info] Disconnected while attempting to send. Recreating connection: {'e': <class 'BrokenPipeError'>}
2020-08-09 10:44:03,968 [info] Disconnected while attempting to send. Recreating connection: {'e': <class 'BrokenPipeError'>}
2020-08-09 10:44:03,984 [info] Disconnected while attempting to send. Recreating connection: {'e': <class 'BrokenPipeError'>}
2020-08-09 10:44:03,999 [info] Disconnected while attempting to send. Recreating connection: {'e': <class 'BrokenPipeError'>}
2020-08-09 10:44:04,014 [info] Disconnected while attempting to send. Recreating connection: {'e': <class 'BrokenPipeError'>}
2020-08-09 10:44:04,029 [info] Disconnected while attempting to send. Recreating connection: {'e': <class 'BrokenPipeError'>}
2020-08-09 10:44:04,045 [info] Disconnected while attempting to send. Recreating connection: {'e': <class 'BrokenPipeE