# Electronic Store Data Generation

## 1- Importing libraries

In [4]:
import datetime
import calendar
import random
import numpy
import pandas as pd
import uuid

## 2- Defining functions and data lists

In [5]:
# Product catalogue: product name -> [unit price, relative sampling weight].
# The second entry is not a physical weight; a larger value makes the product
# more likely to be drawn when a product is picked at random.
products = {
  'IPhone': [700, 10],
  'Samsung Phone': [600, 12],
  'Budget Phone': [400, 3],
  '20in Monitor': [109.99, 6],
  '34in Ultrawide Monitor': [379.99, 9],
  '27in 4K Gaming Monitor': [389.99, 9],
  '27in FHD Monitor': [149.99, 11],
  'Flatscreen TV': [300, 7],
  'Macbook Pro Laptop': [1700, 7],
  'ThinkPad Laptop': [999.99, 6],
  'AA Batteries (4-pack)': [3.84, 30],
  'AAA Batteries (4-pack)': [2.99, 30],
  'USB-C Charging Cable': [11.95, 30],
  'Lightning Charging Cable': [14.95, 30],
  'Wired Headphones': [11.99, 26],
  'Bose SoundSport Headphones': [99.99, 19],
  'Apple Airpods Headphones': [150, 22],
  'LG Washing Machine': [600.00, 1],
  'LG Dryer': [600.00, 1],
}

# Column headers of the generated order table, in output order.
columns = [
  'Order ID',
  'Product',
  'Quantity Ordered',
  'Price Each',
  'Order Date',
  'Purchase Address',
]


# defining date/time helper functions
def generate_random_time(month):
  """Return a random order timestamp in 2020 as a ``"%y/%m/%d %H:%M"`` string.

  A random day of *month* is picked, then the time is centred on either
  12:00 or 20:00 and shifted by a normally distributed number of minutes
  (mean 0, standard deviation 180).

  The shift is not clamped, so the returned timestamp can land on a
  neighbouring day (and therefore, at the month's edges, in a neighbouring
  month).

  Args:
    month: Month number, 1-12.
  """
  day = generate_random_day(month)

  # Evening shopping hours get more weight: 40% noon, 60% 8 PM.
  base_hour = 12 if random.random() < 0.4 else 20
  anchor = datetime.datetime(2020, month, day, base_hour, 0)

  jitter_minutes = numpy.random.normal(loc=0.0, scale=180)
  shifted = anchor + datetime.timedelta(minutes=jitter_minutes)
  return shifted.strftime("%y/%m/%d %H:%M")

def generate_random_day(month):
  """Return a uniformly random day number that is valid for *month* of 2020.

  Args:
    month: Month number, 1-12.

  Returns:
    An int between 1 and the number of days in that month, inclusive.
  """
  # monthrange() returns (weekday of the 1st, number of days in the month).
  _, days_in_month = calendar.monthrange(2020, month)
  return random.randint(1, days_in_month)


# defining random address generator
def generate_random_address():
  """Return a random address string: ``"<number> <street> St, <city>, <state> <zip>"``.

  The street name is drawn uniformly, the city is drawn according to the
  per-city weights below, and the house number is uniform in 1-999.
  """
  street_names = ['Main', '2nd', '1st', '4th', '5th', 'Park', '6th', '7th', 'Maple', 'Pine', 'Washington', '8th', 'Cedar', 'Elm', 'Walnut', '9th', '10th', 'Lake', 'Sunset', 'Lincoln', 'Jackson', 'Church', 'River', '11th', 'Willow', 'Jefferson', 'Center', '12th', 'North', 'Lakeview', 'Ridge', 'Hickory', 'Adams', 'Cherry', 'Highland', 'Johnson', 'South', 'Dogwood', 'West', 'Chestnut', '13th', 'Spruce', '14th', 'Wilson', 'Meadow', 'Forest', 'Hill', 'Madison']

  # (city, state, zip) -- for simplicity a single zip code is used per city.
  # 'Portland' appears twice on purpose: once for Oregon, once for Maine.
  locations = [
    ('San Francisco', 'CA', '94016'),
    ('Boston', 'MA', '02215'),
    ('New York City', 'NY', '10001'),
    ('Austin', 'TX', '73301'),
    ('Dallas', 'TX', '75001'),
    ('Atlanta', 'GA', '30301'),
    ('Portland', 'OR', '97035'),
    ('Portland', 'ME', '04101'),
    ('Los Angeles', 'CA', '90001'),
    ('Seattle', 'WA', '98101'),
  ]

  # Relative likelihood of each location, in the same order as `locations`.
  weights = [9, 4, 5, 2, 3, 3, 2, 0.5, 6, 3]

  street = random.choice(street_names)
  city, state, zip_code = random.choices(locations, weights=weights)[0]
  house_number = random.randint(1, 999)

  return f"{house_number} {street} St, {city}, {state} {zip_code}"

## 3- data generation and exporting

In [None]:
def create_data_csv():
  """Placeholder that currently does nothing and returns ``None``."""
  return None

def write_row(order_number, product, order_date, address):
  """Build one order line as a list matching the output column order.

  Args:
    order_number: Order ID to put in the row.
    product: Product name; must be a key of the module-level ``products``.
    order_date: Order timestamp, stored in the row as given.
    address: Purchase address, stored in the row as given.

  Returns:
    ``[order_number, product, quantity, unit price, order_date, address]``.
  """
  unit_price = products[product][0]

  # Inverse relationship between price and quantity: the success probability
  # 1 - 1/price approaches 1 for expensive items, so they are almost always
  # ordered one at a time, while cheap items more often get larger quantities.
  success_probability = 1.0 - (1.0 / unit_price)
  quantity = numpy.random.geometric(p=success_probability, size=1)[0]

  return [order_number, product, quantity, unit_price, order_date, address]

    #adding different months' weights to simulate changes in online sales during 2020
def orders_distribution_for_month(month):
  """Return ``(mean, standard deviation)`` of the order count for *month*.

  The values simulate how online sales changed over 2020: a quiet first
  quarter, a higher plateau from April to September, then a ramp-up through
  October and November to a December peak.

  September is grouped with April-August. Previously it matched none of the
  explicit branches and was silently given the December figures.

  Args:
    month: Month number, 1-12.
  """
  if month <= 3:
    return 10000, 1000
  if month <= 9:
    return 16000, 1500
  if month == 10:
    return 19000, 1500
  if month == 11:
    return 21000, 2000
  return 25000, 2000  # month == 12


if __name__ == '__main__':
  import os  # local import: only needed when the script actually writes files

  order_number = 281234

  # The catalogue does not change between months, so build these once.
  product_list = list(products)
  weights = [products[product][1] for product in product_list]

  # to_csv() does not create missing directories.
  os.makedirs("./data", exist_ok=True)

  for month in range(1, 13):
    mean_orders, orders_spread = orders_distribution_for_month(month)
    orders_amount = int(numpy.random.normal(loc=mean_orders, scale=orders_spread))

    # Rows are collected in a plain list and turned into a DataFrame once per
    # month; growing a DataFrame one row at a time copies it on every insert.
    rows = []

    #data generating
    while orders_amount > 0:

      address = generate_random_address()
      order_date = generate_random_time(month)

      product_choice = random.choices(product_list, weights)[0]
      rows.append(write_row(order_number, product_choice, order_date, address))

      #Adding some products to the same order ID with random chance
      if product_choice == 'IPhone':
        if random.random() < 0.15:
          rows.append(write_row(order_number, "Lightning Charging Cable", order_date, address))
        if random.random() < 0.05:
          rows.append(write_row(order_number, "Apple Airpods Headphones", order_date, address))
        if random.random() < 0.07:
          rows.append(write_row(order_number, "Wired Headphones", order_date, address))

      elif product_choice in ("Samsung Phone", "Budget Phone"):
        if random.random() < 0.18:
          rows.append(write_row(order_number, "USB-C Charging Cable", order_date, address))
        if random.random() < 0.04:
          rows.append(write_row(order_number, "Bose SoundSport Headphones", order_date, address))
        if random.random() < 0.07:
          rows.append(write_row(order_number, "Wired Headphones", order_date, address))

      # Occasionally add one more random product to the same order.
      if random.random() <= 0.02:
        extra_product = random.choices(product_list, weights)[0]
        rows.append(write_row(order_number, extra_product, order_date, address))

      #adding random row with column names as data errors
      if random.random() <= 0.002:
        rows.append(list(columns))

      #adding random row with null values as cancelled orders or data errors
      if random.random() <= 0.003:
        rows.append(["", "", "", "", "", ""])

      order_number += 1
      orders_amount -= 1

    # dtype=object keeps every cell exactly as generated, so the deliberately
    # injected text rows do not affect how the numeric cells are written.
    df = pd.DataFrame(rows, columns=columns, dtype=object)

    month_name = calendar.month_name[month]
    df.to_csv(f"./data/Sales_{month_name}_2020.csv", index=False)
    print(f"{month_name} Complete")

January Complete
February Complete
March Complete
April Complete
May Complete
June Complete
July Complete
August Complete
September Complete
October Complete
November Complete
