In [None]:
import boto
from boto.s3.key import Key
import requests
key = 'enter s3 key'
secret = 'enter s3 secret'
import s3fs
import numpy as np
import pandas as pd
import random
import string
import datetime
import random


def generate_boolean():
    """Return ``True`` or ``False``, decided by a single random bit."""
    coin = random.getrandbits(1)
    return coin == 1

def generate_number(minx=0, maxx=100, is_int=False):
    """
    Draw one random number between ``minx`` and ``maxx``.

    With ``is_int`` set the value comes from ``random.randint`` (an integer,
    both ends included); otherwise it comes from ``random.uniform`` (a float).
    """
    if is_int:
        return random.randint(minx, maxx)
    return random.uniform(minx, maxx)

def generate_string_data(length):
    """
    Return a random string of ASCII letters with exactly ``length`` characters.

    Characters are drawn with replacement, so any non-negative length is
    supported (sampling without replacement capped the length at the 52
    available letters and could never repeat a character).

    Raises:
        ValueError: if ``length`` is negative.
    """
    if length < 0:
        raise ValueError("length must be non-negative, got {}".format(length))
    return ''.join(random.choices(string.ascii_letters, k=length))

def generate_hash(length):
    """
    Return a random alphanumeric token with exactly ``length`` characters.

    Characters come from ASCII letters plus digits and are drawn with
    replacement, so any non-negative length is supported (sampling without
    replacement capped the length at the 62 available characters).

    Raises:
        ValueError: if ``length`` is negative.
    """
    if length < 0:
        raise ValueError("length must be non-negative, got {}".format(length))
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choices(alphabet, k=length))

def generate_list_hash(counts, length):
    """Build a list of ``counts`` tokens, each produced by ``generate_hash(length)``."""
    tokens = []
    for _ in range(counts):
        tokens.append(generate_hash(length))
    return tokens


def generate_array_data_type(item_type, n_items=1, minx=0, maxx=100, length=10, item_predefined_values=(),
                             k=9, theta=0.5, mu=0, sigma=0):
    """
    Generate a list of ``n_items`` random values of the requested kind.

    Args:
        item_type: one of "string", "integer", "float", "number", "boolean",
            "enum", "gamma" or "normal".
        n_items: number of values to generate.
        minx, maxx: bounds used for "integer", "float" and "number".
        length: string length used for "string".
        item_predefined_values: sequence to pick from for "enum".
        k, theta: shape and scale passed to ``np.random.gamma`` for "gamma".
        mu, sigma: mean and standard deviation passed to
            ``np.random.normal`` for "normal".

    Returns:
        A list with ``n_items`` elements. "gamma" and "normal" draws are
        rounded with ``round``.

    Raises:
        ValueError: if ``item_type`` is not one of the supported kinds.
    """
    if item_type == "string":
        return [generate_string_data(length) for _ in range(n_items)]
    if item_type == "integer":
        return [generate_number(minx, maxx, is_int=True) for _ in range(n_items)]
    if item_type in {"float", "number"}:
        return [generate_number(minx, maxx, is_int=False) for _ in range(n_items)]
    if item_type == "boolean":
        return [generate_boolean() for _ in range(n_items)]
    if item_type == "enum":
        return [random.choice(item_predefined_values) for _ in range(n_items)]
    if item_type == "gamma":
        return [round(np.random.gamma(k, theta)) for _ in range(n_items)]
    if item_type == "normal":
        return [round(np.random.normal(mu, sigma)) for _ in range(n_items)]
    # The previous code raised an undefined name (UserError), which surfaced
    # as a NameError instead of a meaningful message.
    raise ValueError("{} is not supported".format(item_type))
        
def generate_dates(start_date, end_date, size):
    """
    Draw ``size`` random calendar dates at day resolution.

    Both bounds are converted to whole days since the Unix epoch and the
    day numbers are sampled with ``np.random.randint``, so ``start_date`` can
    be returned but ``end_date`` itself cannot (the upper bound is exclusive).
    Adapted from: https://stackoverflow.com/a/50668285
    """
    # Timestamp.value is expressed in nanoseconds; this is one day's worth.
    nanos_per_day = 24 * 60 * 60 * 10 ** 9
    first_day, last_day = (
        pd.to_datetime(bound).value // nanos_per_day
        for bound in (start_date, end_date)
    )
    day_numbers = np.random.randint(first_day, last_day, size)
    return pd.to_datetime(day_numbers, unit="D")
        
def generate_predefined_list(size, item_predefined_values, p=None):
    """
    Draw ``size`` values from ``item_predefined_values`` as an ndarray.

    Args:
        size: number of values to draw.
        item_predefined_values: candidate values.
        p: optional probabilities, one per candidate. When omitted (``None``
            or empty), three-candidate lists keep the historical default of
            (0.33, 0.33, 0.34); any other number of candidates is sampled
            uniformly instead of failing on a length mismatch.

    Returns:
        The result of ``np.random.choice`` for the given arguments.
    """
    # `not p` is ambiguous for numpy arrays, so test for "missing" explicitly.
    if p is None or len(p) == 0:
        if len(item_predefined_values) == 3:
            # default probabilities (kept for backward compatibility)
            p = (0.33, 0.33, 0.34)
        else:
            p = None
    return np.random.choice(item_predefined_values, size=size, p=p)



In [None]:

class synthetic_data_simulation():
    """
    Simulate customer demographics, transaction and campaign data.

    Intended call order: ``generate_customer_data`` ->
    ``generate_transaction_data`` -> ``generate_campaign_data`` ->
    ``write_to_s3``. The methods use the module-level helpers
    ``generate_predefined_list``, ``generate_array_data_type`` and
    ``generate_dates``; ``write_to_s3`` additionally reads the module-level
    ``key`` and ``secret`` credentials.
    """

    def __init__(self,customer_id,size,transaction_file,gender = ['Male','Female','Unknown'],gender_prob = [0.40,0.45,0.15],
                 age = ['< 15', '16 - 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '60+','Unknown'],
                 age_prob = [0.01, 0.03, 0.28, 0.10, 0.30, 0.08, 0.05, 0.15], loyalty_member_flag = [0,1],
                 loyalty_member_flag_prob = [0.08,0.92], loyalty_member_program = ["Member","Insider",
                 "Influencer", "Ambassador"],loyalty_member_program_prob = [0.06, 0.09, 0.23, 0.62],
                 income_group = ["Less than $49,999", "$50,000 - $74,999", "$75,000 - $99,999",
                 "$100,000 - $149,999","$150,000 - $199,999", "$200,000+", "Unknown"],
                 income_group_prob = [0, 0.01, 0.05, 0.17, 0.22, 0.45, 0.10],store_visit_k = 14,
                 store_visit_theta = 0.5,review_k = 8, review_theta = 0.5, client_segment = "VIP",
                 attempted_segment_split =  "VIP",
                 signup_date_min = '2015-12-01',signup_date_max = '2017-12-31',

                 tx_k = 12, tx_theta = 0.5,order_value_m = 2750,order_value_sd =2750/5,
                 scale_2018_factor = 0.95,scale_2020_factor = 1.05,

                 n_campaign_cust = 15000,campaign ='Fall19 Fashion and NYC Campaign',
                 offer_id = [1,2,3,4,5],offer_name =  ['3 for 2 Accessories', 'Relaunch Offer – 25% Off',
                 '3 for $ 250 Chinos','Free gift on your Birthday','Earn more points'],
                 offer_prob = [0.25,0.2,0.15,0.10,0.30],offer_reward = [50,50,50,50,50],
                 top_2_offer = [1,5],uplift_feature = ['gender','age_group'],
                 uplift_feature_value = ['Female','40-39'],uplift_value = [.20,.15],base_uplift = 0.04,
                 campaign_launch_date = '2019-02-01',campaign_file = None,item_list = None,
                 city_file = 'C:/Users/mm13690/Documents/codes/p_ai/input/us_city.csv'):

        """
        Store the simulation settings.

        Customer demographics
        size : # customers
        customer_id : a list of unique customer ids belonging to these customers
        gender : list of genders
        gender_prob : list of probabilities of gender distribution in same order as gender
        age : list of age buckets
        age_prob : list of probabilities of age distribution in same order as age buckets
        loyalty_member_flag : flag 1 if customer is part of loyalty program
        loyalty_member_flag_prob : probability distribution of loyalty member flag
        loyalty_member_program : list of loyalty programs by business
        loyalty_member_program_prob : probability distribution of loyalty member programs in same order
        income_group : list of income groups
        income_group_prob : list of probabilities of income distribution in same order as income group
        store_visit_k : shape parameter K for store visits (gamma distribution)
        store_visit_theta : scale parameter theta for store visits (gamma distribution)
        review_k : shape parameter K for reviews (gamma distribution)
        review_theta : scale parameter theta for reviews (gamma distribution)
        client_segment : one of "Premium", "VIP", "Need Based", "Promising", "Gifters"
        attempted_segment_split : sub segment for e.g. "Luxury Event Seekers", "Designer Brand Seekers" etc.
        signup_date_min : min signup date
        signup_date_max : max signup date of customers
        city_file : CSV read by generate_customer_data; the code uses its
            'city', 'state' and 'pop' columns

        Transactions
        tx_k : shape parameter K for # transactions by customer (gamma distribution)
        tx_theta : scale parameter theta for # transactions by customer (gamma distribution)
        order_value_m : mean order value (normal distribution) (year 2019)
        order_value_sd : sd of order value (normal distribution) (year 2019)
        scale_2018_factor : scaling up or down AOV for year 2018
        scale_2020_factor : scaling up or down AOV for year 2020
        item_list : item ids that transactions and campaign rows pick from
        transaction_file : file name (without extension) used by write_to_s3

        Campaign
        n_campaign_cust : # customers part of any campaign
        campaign : name of campaign (only 1 is supported)
        offer_id : list of offers part of the campaign
        offer_name : name of offers
        offer_prob : prob dist of offers
        offer_reward : cost of each offer
        top_2_offer : list of top 2 offers for uplift purposes
        uplift_feature : customer columns used to decide uplift
            (only the first entry is read by generate_campaign_data)
        uplift_feature_value : values of those columns that earn the uplift
            (only the first entry is read by generate_campaign_data)
        uplift_value : uplift applied for top_2_offer[0] and top_2_offer[1]
        base_uplift : uplift applied to every other campaign customer.
            Previously this argument was ignored and 0.04 was hard-coded, so
            the default is 0.04 to keep the default behaviour unchanged.
        campaign_launch_date : launch of campaign (first date of any month)
        campaign_file : file name (without extension) used by write_to_s3
        """
        self.size = size
        self.customer_id = customer_id

        # List-valued settings are copied so that instances never share (and
        # cannot accidentally mutate) the default lists in the signature.
        self.gender = list(gender)
        self.gender_prob = list(gender_prob)
        self.age = list(age)
        self.age_prob = list(age_prob)
        self.loyalty_member_flag = list(loyalty_member_flag)
        self.loyalty_member_flag_prob = list(loyalty_member_flag_prob)
        self.loyalty_member_program = list(loyalty_member_program)
        self.loyalty_member_program_prob = list(loyalty_member_program_prob)
        self.income_group = list(income_group)
        self.income_group_prob = list(income_group_prob)
        self.store_visit_k = store_visit_k
        self.store_visit_theta = store_visit_theta
        self.review_k = review_k
        self.review_theta = review_theta
        self.client_segment = client_segment
        self.attempted_segment_split = attempted_segment_split
        self.signup_date_min = signup_date_min
        self.signup_date_max = signup_date_max
        self.city_file = city_file

        self.tx_k = tx_k
        self.tx_theta = tx_theta
        self.order_value_m = order_value_m
        self.order_value_sd = order_value_sd
        self.scale_2018_factor = scale_2018_factor
        self.scale_2020_factor = scale_2020_factor

        self.n_campaign_cust = n_campaign_cust
        self.campaign = campaign
        self.offer_id = list(offer_id)
        self.offer_name = list(offer_name)
        self.offer_prob = list(offer_prob)
        self.offer_reward = list(offer_reward)
        self.top_2_offer = list(top_2_offer)
        self.uplift_feature = list(uplift_feature)
        self.uplift_feature_value = list(uplift_feature_value)
        self.uplift_value = list(uplift_value)
        self.base_uplift = base_uplift
        self.campaign_launch_date = campaign_launch_date
        self.item_list = item_list
        self.transaction_file = transaction_file
        self.campaign_file = campaign_file

    def generate_customer_data(self):
        """
        Build the customer demographics table.

        Reads ``self.city_file`` and keeps the first five distinct cities,
        weighting them by their share of the summed 'pop' column.

        returns : DataFrame with one row per customer id
        """
        start_time = datetime.datetime.now()

        df = pd.DataFrame(columns=['customer_id', 'gender', 'age_group', 'city',
                                   'loyalty_member_flag', 'loyalty_group', 'income_group',
                                   'store_visits', 'reviews', 'client_segment', 'signup_date',
                                   'attempted_segment_split'])

        df['customer_id'] = self.customer_id
        df['gender'] = generate_predefined_list(self.size, self.gender, self.gender_prob)
        df['age_group'] = generate_predefined_list(self.size, self.age, self.age_prob)

        city = pd.read_csv(self.city_file)
        city = city.drop_duplicates('city').head()
        city['p'] = city['pop'] / city['pop'].sum()
        df['city'] = generate_predefined_list(self.size, city['city'].tolist(), city['p'].tolist())

        # join state
        df = df.merge(city[['city', 'state']], on='city', how='inner')

        df['loyalty_member_flag'] = generate_predefined_list(self.size, self.loyalty_member_flag,
                                                             self.loyalty_member_flag_prob)

        # Only loyalty members (flag == 1) are assigned a program.
        df['loyalty_group'] = df.loyalty_member_flag.apply(
            lambda x: np.random.choice(self.loyalty_member_program,
                                       size=1,
                                       p=self.loyalty_member_program_prob)[0]
            if x == 1 else None
        )

        df['income_group'] = generate_predefined_list(self.size, self.income_group, self.income_group_prob)

        df['store_visits'] = generate_array_data_type("gamma", n_items=self.size, k=self.store_visit_k,
                                                      theta=self.store_visit_theta)
        df['reviews'] = generate_array_data_type("gamma", n_items=self.size, k=self.review_k,
                                                 theta=self.review_theta)
        df['client_segment'] = self.client_segment
        df['attempted_segment_split'] = self.attempted_segment_split
        df['signup_date'] = generate_dates(self.signup_date_min, self.signup_date_max, size=self.size)

        end_time = datetime.datetime.now()
        print("Time taken to create demographic data having {} customers {}".format(df.shape[0],end_time - start_time))
        return df

    def generate_transaction_data(self,df):
        """
        Simulate 2018, 2019 and 2020 orders for every customer in ``df``.

        df : customer demographics data (must contain 'customer_id')

        returns : one row per order (customer_id, order_value, order_date,
        item_id) left-joined with the customer columns, plus 'month' and
        'year' derived from order_date
        """
        start_time = datetime.datetime.now()
        temp = pd.DataFrame(columns=['customer_id','order_value','order_date'])
        order_value = []
        order_date = []
        customer_id = []
        item_id = []

        for customer in df.customer_id.tolist():
            # number of orders this customer places in 2019 (and again in 2018)
            n_tx = round(np.random.gamma(self.tx_k, self.tx_theta))

            # simulate data - 2019
            _order_value = generate_array_data_type("normal", n_items=n_tx, mu=self.order_value_m,
                                                    sigma=self.order_value_sd)
            _order_date = generate_dates('2019-01-01', '2019-12-31', size=n_tx)
            _item_id = generate_array_data_type("enum", n_items=n_tx, item_predefined_values=self.item_list)
            order_value.extend(_order_value)
            order_date.extend(_order_date)
            item_id.extend(_item_id)
            _customer_id = generate_array_data_type("enum", n_items=n_tx, item_predefined_values=[customer])
            customer_id.extend(_customer_id)

            # simulate data - 2018: each 2019 value times a noisy scale factor
            _order_value = [x * np.random.normal(self.scale_2018_factor, self.scale_2018_factor / 8)
                            for x in _order_value]
            _order_date = generate_dates('2018-01-01', '2018-12-31', size=n_tx)
            _item_id = generate_array_data_type("enum", n_items=n_tx, item_predefined_values=self.item_list)
            order_value.extend(_order_value)
            order_date.extend(_order_date)
            item_id.extend(_item_id)
            _customer_id = generate_array_data_type("enum", n_items=n_tx, item_predefined_values=[customer])
            customer_id.extend(_customer_id)

            # simulate data - 2020: half as many orders as the other years.
            # NOTE(review): _order_value holds the 2018-scaled values here, so
            # 2020 compounds both scale factors rather than scaling the 2019
            # values directly - confirm this is intended.
            n_tx = round(n_tx / 2)
            _order_value = [x * np.random.normal(self.scale_2020_factor, self.scale_2020_factor / 8)
                            for x in _order_value][:n_tx]
            _order_date = generate_dates('2020-01-01', '2020-12-31', size=n_tx)
            _item_id = generate_array_data_type("enum", n_items=n_tx, item_predefined_values=self.item_list)
            order_value.extend(_order_value)
            order_date.extend(_order_date)
            item_id.extend(_item_id)
            _customer_id = generate_array_data_type("enum", n_items=n_tx, item_predefined_values=[customer])
            customer_id.extend(_customer_id)

        temp['customer_id'] = customer_id
        temp['order_value'] = order_value
        temp['order_date'] = order_date
        temp['item_id'] = item_id

        customer_transaction = temp.merge(df, on='customer_id', how='left')

        order_dates = pd.to_datetime(customer_transaction['order_date'])
        customer_transaction['month'] = order_dates.dt.month
        customer_transaction['year'] = order_dates.dt.year
        end_time = datetime.datetime.now()
        print("Time taken tp create trasaction data having {} customer {}".format(df.shape[0],end_time - start_time))
        return customer_transaction

    def generate_campaign_data(self,df,customer_transaction):
        """
        Attach a campaign to a sample of customers and apply uplift to orders.

        df : customer demographics data
        customer_transaction : transaction data from generate_transaction_data

        A customer counts as converted when they have an order in the launch
        month/year. Orders in that month from customers who were sent an
        offer are multiplied by (1 + uplift). Customers with launch-month
        orders who were not sampled are appended to the campaign table as
        controls.

        returns : (customer_transaction, campaign_customer); the returned
        transaction table carries the campaign columns but not 'uplift'
        """
        start_time = datetime.datetime.now()

        # DataFrame.sample raises when n exceeds the population, so cap it.
        n_campaign = min(self.n_campaign_cust, len(df))
        campaign_customer = df.sample(n=n_campaign).copy()
        campaign_customer['campaign'] = self.campaign
        offer_data = pd.DataFrame({'offer_id': self.offer_id,
                                   'offer_name': self.offer_name,
                                   'reward': self.offer_reward})

        campaign_customer['offer_id'] = generate_predefined_list(n_campaign, self.offer_id, self.offer_prob)
        campaign_customer = campaign_customer.merge(offer_data[['offer_id','offer_name']], on='offer_id', how='left')

        # simulate uplift by static features
        # NOTE(review): only uplift_feature[0] / uplift_feature_value[0] are
        # used for both top offers; the second entries are never read.
        campaign_customer['uplift'] = self.base_uplift
        feature_match = campaign_customer[self.uplift_feature[0]] == self.uplift_feature_value[0]
        for rank in (0, 1):
            boosted = feature_match & (campaign_customer['offer_id'] == self.top_2_offer[rank])
            # .loc avoids chained assignment, which can silently write to a copy
            campaign_customer.loc[boosted, 'uplift'] = self.uplift_value[rank]

        campaign_customer['offer_launch'] = self.campaign_launch_date
        launch_dates = pd.to_datetime(campaign_customer['offer_launch'])
        campaign_customer['month'] = launch_dates.dt.month
        campaign_customer['year'] = launch_dates.dt.year

        # converted = campaign customers with at least one order in the launch month
        campaign_customer_converted = campaign_customer.merge(
            customer_transaction[['customer_id','item_id','month','year']].drop_duplicates(),
            on=['customer_id','month','year'], how='inner')
        campaign_customer_converted = campaign_customer_converted.drop_duplicates('customer_id').copy()
        campaign_customer_converted['offer_converted'] = 1

        campaign_customer = campaign_customer.merge(
            campaign_customer_converted[['customer_id','item_id','offer_converted']],
            on='customer_id', how='left')
        campaign_customer.reset_index(drop=True, inplace=True)

        # non-converted customers still get a random item attached
        missing_item = campaign_customer['item_id'].isna()
        campaign_customer.loc[missing_item, 'item_id'] = generate_array_data_type(
            "enum", n_items=int(missing_item.sum()), item_predefined_values=self.item_list)
        campaign_customer.fillna(0, inplace=True)
        campaign_customer['offer_sent'] = 1

        customer_transaction = customer_transaction.merge(
            campaign_customer[['customer_id','uplift','month','year','offer_sent',
                               'offer_id','offer_name','campaign','offer_converted']],
            on=['customer_id','month','year'], how='left')
        customer_transaction.fillna(0, inplace=True)

        # update order value with uplift values and check incremental revenue
        launch = pd.to_datetime(self.campaign_launch_date)
        month, year = launch.month, launch.year
        in_launch_month = (customer_transaction['month'] == month) & (customer_transaction['year'] == year)
        uplifted = in_launch_month & (customer_transaction['offer_sent'] == 1)
        # vectorised equivalent of the former row-by-row apply
        customer_transaction['order_value'] = np.where(
            uplifted,
            customer_transaction['order_value'] * (1 + customer_transaction['uplift']),
            customer_transaction['order_value'])

        customer_features = list(df.columns)
        customer_transaction_agg = (customer_transaction[in_launch_month]
                                    .groupby(customer_features)
                                    .agg({'order_value': 'sum'})
                                    .reset_index())

        campaign_customer = campaign_customer.merge(customer_transaction_agg[['customer_id','order_value']],
                                                    on='customer_id', how='left')
        cust_ids_no_campaign = np.setdiff1d(customer_transaction_agg.customer_id,
                                            campaign_customer.customer_id).tolist()
        control_customer = customer_transaction_agg[customer_transaction_agg.customer_id.isin(cust_ids_no_campaign)]

        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        campaign_customer = pd.concat([campaign_customer, control_customer], ignore_index=True)

        missing_item = campaign_customer['item_id'].isna()
        campaign_customer.loc[missing_item, 'item_id'] = generate_array_data_type(
            "enum", n_items=int(missing_item.sum()), item_predefined_values=self.item_list)
        campaign_customer.fillna(0, inplace=True)
        customer_transaction.drop(columns=['uplift'], inplace=True)

        end_time = datetime.datetime.now()
        print("Time taken to create campaign data for {} customer {}".format(df.shape[0],end_time - start_time))

        return customer_transaction,campaign_customer

    def write_to_s3(self,customer_transaction,campaign_customer = None,
                    s3_prefix = 's3://zs-atp-pzai-general/data/data_for_integrated_story/premium/'):
        """
        Upload the generated tables to S3 as CSV files.

        customer_transaction : written to <s3_prefix><transaction_file>.csv
        campaign_customer : optional; written to <s3_prefix><campaign_file>.csv
        s3_prefix : destination prefix (defaults to the original location)

        Uses the module-level ``key`` and ``secret`` as credentials.
        """
        start_time = datetime.datetime.now()

        # write data to s3
        fs = s3fs.S3FileSystem(key=key, secret=secret)
        outputs = [(self.transaction_file, customer_transaction)]
        if campaign_customer is not None:
            outputs.append((self.campaign_file, campaign_customer))

        for file_name, frame in outputs:
            bytes_to_write = frame.to_csv(None).encode()
            with fs.open(s3_prefix + str(file_name) + '.csv', 'wb') as f:
                f.write(bytes_to_write)

        end_time = datetime.datetime.now()
        print("Time taken for writing data to s3 with {} records {}".format(customer_transaction.shape[0],end_time - start_time))


In [1]:
# Seed both generators: the helpers draw from `random` as well as `np.random`,
# so seeding only `random` does not make a run repeatable.
random.seed(42)
np.random.seed(42)

# read a list of customer ids
df = pd.read_csv('C:/Users/mm13690/Documents/codes/p_ai/input/premium_customer_ids.csv')
premium_customer_ids = df.customer_ids.tolist()

# generate a list of item ids - the items that transactions select from
item_list = list(range(1, 53433))

##for designer brand seekers
# item_list = [i for i in range(35184,39499)]
# random_item_list = [i for i in range(1,10000)]
# random_item_list = np.random.choice(random_item_list,size = 1500)
# item_list.extend(random_item_list)


In [None]:

# Reset the stdlib random generator for this cell.
random.seed(42)

# Split the customer ids into 50 chunks; np.array_split tolerates a length
# that is not an exact multiple of 50.
premium_customer_ids_chunks = np.array_split(premium_customer_ids, indices_or_sections=50)


In [None]:


# create data for designer brand seekers (first chunk of the premium ids)
designer_brand_seekers = premium_customer_ids_chunks[0]
# loyalists_customer_id = _list
transaction_file = 'dbs_transaction'
campaign_file = 'dbs_campaign'

# The campaign sample is drawn without replacement, so it cannot be larger
# than the number of customers in this chunk.
n_campaign_cust = min(15000, len(designer_brand_seekers))

obj = synthetic_data_simulation(
    customer_id=designer_brand_seekers,
    size=len(designer_brand_seekers),
    transaction_file=transaction_file,
    campaign_file=campaign_file,
    item_list=item_list,

    # demographics
    gender=['Male', 'Female', 'Unknown'],
    gender_prob=[0.18, 0.71, 0.11],
    age=['< 15', '16 - 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '60+', 'Unknown'],
    age_prob=[0.01, 0.03, 0.24, 0.31, 0.14, 0.08, 0.02, 0.17],
    loyalty_member_flag=[0, 1],
    loyalty_member_flag_prob=[0.12, 0.88],
    loyalty_member_program=["Member", "Insider", "Influencer", "Ambassador"],
    loyalty_member_program_prob=[0.16, 0.24, 0.26, 0.34],
    income_group=["Less than $49,999", "$50,000 - $74,999",
                  "$75,000 - $99,999", "$100,000 - $149,999", "$150,000 - $199,999", "$200,000+", "Unknown"],
    income_group_prob=[0, 0.0, 0.05, 0.10, 0.15, 0.60, 0.10],
    store_visit_k=20, store_visit_theta=0.5,
    review_k=6, review_theta=0.5,
    client_segment="Premium",
    attempted_segment_split="Designer Brand Seekers",
    signup_date_min='2015-12-01', signup_date_max='2017-12-31',

    # transactions
    tx_k=10, tx_theta=0.5, order_value_m=1186, order_value_sd=1186/5,
    scale_2018_factor=1.04, scale_2020_factor=0.95,

    # campaign
    n_campaign_cust=n_campaign_cust,
    campaign='Valentines Day Gemstone Jewelry Collection',
    offer_id=[1, 2, 3, 4, 5, 6, 7, 8],
    offer_name=['3 for 2 Accessories', 'Relaunch Offer – 25% Off',
                '3 for $ 250 Chinos', 'Free gift on your Birthday', 'Earn more points', 'Designer Suits $199.99',
                'Dress shirt and ties extra 20% off', 'First to shop select brands'],
    offer_prob=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125],
    offer_reward=[50, 50, 50, 50, 50, 50, 50, 50],
    top_2_offer=[3, 6],
    uplift_feature=['loyalty_group', 'age_group'],
    uplift_feature_value=["Ambassador", '60+'],
    uplift_value=[.20, .15], base_uplift=0.04,
    campaign_launch_date='2019-02-01')

# demographics -> transactions -> campaign -> upload
df = obj.generate_customer_data()
customer_transaction = obj.generate_transaction_data(df)
customer_transaction, campaign_customer = obj.generate_campaign_data(df, customer_transaction)
obj.write_to_s3(customer_transaction, campaign_customer=campaign_customer)
    

