In [None]:
# Import libraries
import datetime
import calendar
import random
import os
import numpy as np
import pandas as pd

In [None]:
# Save files path: every CSV produced below is written relative to the
# current working directory, so switch into the output folder first.
path = "./Sales Data Examples"
# `path` is relative, so a second run of this cell (after the working directory
# has already moved) would look for a nested folder that does not exist.
# Only create/enter the folder when we are not already inside it.
if os.path.basename(os.getcwd()) != os.path.basename(path):
  os.makedirs(path, exist_ok=True)  # first run on a fresh checkout: folder may be missing
  os.chdir(path)

In [None]:
# Data header: column order of every generated record / CSV file
columns = ['Order ID', 'Product', 'Quantity', 'Price', 'Order Date', 'Purchase Address']

# Location
# Street names are interpolated verbatim into the address string, so they must
# not carry surrounding whitespace (a stray leading space yields "No.5  2nd Street").
street_names = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th', '11th', '12th', '13th', '14th', '15th', '16th', '17th', '18th', '19th', '20th', '21st', '22nd', '23rd', '24th', '25th']
# District codes and their relative sampling weights (paired by position).
# NOTE(review): district '09' is absent from this list - confirm that is intentional.
districts = ['01', '02', '03', '04', '05', '06', '07', '08', '10', '11']
districts_weights = [9, 3, 4, 3, 5, 2, 6, 0.5, 3, 1]

# Product (Price and Frequency)
# Maps product name -> [unit price, relative sampling weight].
products = {
  'iPhone': [700, 10],
  'Google Phone': [600, 3],
  'Samsung Phone': [500, 8],
  '21in Monitor': [110, 6],
  '27in Monitor': [150, 11],
  '34in 2k Monitor': [380, 9],
  '27in 4K Gaming Monitor': [400, 9],
  'Dell Laptop': [800, 7],
  'ThinkPad Laptop': [1000, 6],
  'Macbook Pro Laptop': [1700, 7],
  'Power Bank': [60, 30],
  'USB-C Charging Cable': [12, 30],
  'Lightning Charging Cable': [15, 30],
  'Normal Headphones': [12, 30],
  'Sony Headphones': [100, 19],
  'Airpods Headphones': [150, 15]
  }

In [None]:
# Generate date and time functions
def random_date(month):
  """Return a uniformly random day number for `month` of 2019.

  The result is an int between 1 and the last calendar day of that
  month, both inclusive.
  """
  # monthrange yields (weekday of the 1st, number of days); only the length matters.
  _, days_in_month = calendar.monthrange(2019, month)
  return random.randint(1, days_in_month)

def random_time(month):
  """Return a random order timestamp inside `month` of 2019.

  A random day of the month is chosen, then one of two daily peaks
  (12:00 or 20:00, equally likely), and finally a normally distributed
  offset (mean 0, standard deviation 180 minutes) is added to the peak.

  Args:
    month: Month number, 1-12.

  Returns:
    The timestamp formatted as "%d/%m/%y %H:%M" (e.g. "17/03/19 21:42").
  """
  day = random_date(month)
  peak_hour = 12 if random.random() < 0.5 else 20
  peak = datetime.datetime(2019, month, day, peak_hour, 0)
  while True:
    time_offset = np.random.normal(loc=0.0, scale=180)
    final_date = peak + datetime.timedelta(minutes=time_offset)
    # A large offset near the first/last day can cross into a neighbouring
    # month (or into 2018/2020). Redraw the offset so every timestamp stays in
    # the month that was asked for.
    if final_date.year == 2019 and final_date.month == month:
      return final_date.strftime("%d/%m/%y %H:%M")

# Generate Location function
def random_address():
  """Build a random purchase address string.

  The street is drawn uniformly from `street_names`, the district is drawn
  from `districts` according to `districts_weights`, and the house number
  is a uniform integer from 1 to 100.
  """
  street = random.choice(street_names)
  # Weighted draw straight from the district list (weights pair up by position).
  district = random.choices(districts, weights=districts_weights)[0]
  house_number = random.randint(1, 100)
  return f"No.{house_number} {street} Street, District {district}, HCMC"

# Generate record function
def generate_record(order_number, product, order_date, address):
  """Assemble one sales row in the same order as `columns`.

  The unit price is looked up in `products`; the quantity is a single draw
  from a geometric distribution with success probability 1 - 1/price.

  Returns:
    [order_number, product, quantity, price, order_date, address]
  """
  unit_price = products[product][0]
  success_probability = 1.0 - (1.0 / unit_price)
  draws = np.random.geometric(p=success_probability, size=1)
  return [order_number, product, draws[0], unit_price, order_date, address]

In [None]:
# Start with PO 140620 (June 14th, 2020)
order_number = 140620

# Product names and their sampling weights do not change between months,
# so build them once instead of on every loop iteration.
product_list = list(products)
product_weights = [products[product][1] for product in product_list]

# Accessories that may be added to the same order as a phone:
# trigger product -> [(accessory, probability), ...], tried in this order.
co_purchases = {
  'iPhone': [
    ("Lightning Charging Cable", 0.15),
    ("Power Bank", 0.07),
    ("Airpods Headphones", 0.05),
  ],
  "Google Phone": [
    ("USB-C Charging Cable", 0.18),
    ("Power Bank", 0.07),
    ("Sony Headphones", 0.04),
  ],
  "Samsung Phone": [
    ("USB-C Charging Cable", 0.18),
    ("Power Bank", 0.07),
    ("Sony Headphones", 0.04),
  ],
}

# Should be a big spike in last two months
for month in range(1, 13):
  # Number of orders for the month, drawn from a normal distribution.
  if month <= 10:
    row_numbers = int(np.random.normal(loc=12000, scale=4000))
  elif month == 11:
    row_numbers = int(np.random.normal(loc=20000, scale=3000))
  else: # December
    row_numbers = int(np.random.normal(loc=26000, scale=3000))

  # Collect rows in a plain list and build the DataFrame once per month;
  # enlarging a DataFrame one `df.loc[i] = ...` at a time gets slower as the
  # frame grows.
  rows = []

  while row_numbers > 0:

    # All lines of one order share the same address and timestamp.
    address = random_address()
    order_date = random_time(month)

    product_choice = random.choices(product_list, product_weights)[0]
    rows.append(generate_record(order_number, product_choice, order_date, address))

    # Add some co-bought items
    for accessory, probability in co_purchases.get(product_choice, ()):
      if random.random() < probability:
        rows.append(generate_record(order_number, accessory, order_date, address))

    # Occasionally add one more random product to the same order.
    if random.random() <= 0.02:
      product_choice = random.choices(product_list, product_weights)[0]
      rows.append(generate_record(order_number, product_choice, order_date, address))

    # Add some outliers (deliberately dirty rows)
    if random.random() <= 0.002:
      rows.append(["Null", "Null", "Null", "Null", "Null", "Null"])

    if random.random() <= 0.003:
      rows.append(["", "", "", "", "", ""])

    order_number += 1
    row_numbers -= 1

  df = pd.DataFrame(rows, columns=columns)

  month_name = calendar.month_name[month]
  df.to_csv(f"Sales_2019_{month_name}.csv", index=False, encoding='utf-8')
  print(f"{month_name} Done!")