In [98]:
import gzip
import os, argparse, logging
import random as rnd
import shutil

import pandas as pd
from faker import Faker

# Root logger configuration: INFO level, with a timestamp and a padded level name
# in front of every message.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d-%H-%M-%S",
    format='%(asctime)s  %(levelname)-10s %(message)s',
)

def parse_args():
  """Parse the generator's command-line options from ``sys.argv``.

  Options (all optional):
    -o / --outpath    output location for the csv files (default ".")
    -n / --n-files    number of files to generate, int (default 5)
    -s / --file-size  file size, int (default 10 * 1024 * 1024)

  Returns:
    argparse.Namespace with the attributes ``outpath``, ``n_files`` and
    ``file_size``.
  """
  option_specs = (
    (("-o", "--outpath"), {"default": ".", "help": "Output for csv files"}),
    (("-n", "--n-files"), {"default": 5, "type": int, "help": "number of files to generate"}),
    (("-s", "--file-size"), {"default": 10 * 1024 * 1024, "type": int, "help": "file size"}),
  )

  parser = argparse.ArgumentParser()
  for flags, settings in option_specs:
    parser.add_argument(*flags, **settings)

  parsed = parser.parse_args()
  return parsed

# Ordered csv header (48 names). generate_fake_data() pairs these positionally with
# the values returned by generate_record(), so the order here must match that list.
columns = [
  "Text01", "Text02", "Text03", "Text04", "Text05", "Text06",
  "IntWritable01",
  "Text07", "Text08", "Text09", "Text10", "Text11", "Text12", "Text13",
  "LongWritable01", "LongWritable02", "LongWritable03",
  "LongWritable04", "LongWritable05", "LongWritable06",
  "Text14", "Text15",
  "IntWritable02",
  "Text16", "Text17", "Text18", "Text19", "Text20",
  "IntWritable03", "IntWritable04",
  "Text21",
  "IntWritable05",
  "Text22", "Text23", "Text24", "Text25", "Text26",
  "IntWritable06", "IntWritable07",
  "Text27",
  "IntWritable08",
  "Text28",
  "IntWritable09", "IntWritable10", "IntWritable11",
  "Text29", "Text30", "Text31",
]

# Shared Faker instance used by the generator helpers in this notebook; it is not
# seeded here, so generated values differ from run to run.
fake = Faker()

In [99]:
def get_date():
  """Return a fake timestamp string shaped like ``MM:DD:YY HH:MM:SS:fff``.

  Faker formats the value with the pattern ``%m:%d:%y %H:%M:%S:%f``; the last
  three characters of that text are then dropped, shortening the trailing
  ``%f`` field.
  """
  stamp = fake.date(pattern='%m:%d:%y %H:%M:%S:%f')
  trimmed = stamp[:-3]
  return trimmed

def get_dec(a, b):
  """Return a string of digits taken from a fake Decimal with no fractional digits.

  Args:
    a: lower bound (inclusive) for the number of integer digits requested.
    b: upper bound (inclusive) for the number of integer digits requested.

  Returns:
    The Decimal's text with its last two characters removed and any ``-``
    stripped.
    NOTE(review): the two-character cut assumes Faker renders the value with a
    trailing ".0" -- confirm against the installed Faker version.
  """
  digit_count = rnd.randint(a, b)
  number = fake.pydecimal(left_digits=digit_count, right_digits=0)
  text = str(number)
  without_tail = text[:-2]
  return without_tail.replace('-', '')

def str_exact_chars(n):
  """Return a Faker ``pystr`` value requested with both length bounds set to ``n``."""
  length = n
  random_text = fake.pystr(min_chars=length, max_chars=length)
  return random_text

def generate_record():
  """Build one fake record as a flat list of 48 values.

  The values are produced in the order of the module-level ``columns`` list;
  the trailing comment on each entry names the column it fills. Entries are
  evaluated top to bottom, so random values are drawn in column order.

  Returns:
    list of 48 items: strings (some of them empty) plus the ints returned by
    ``fake.pyint()``.
  """
  record = [
    get_date(),                                                                    # Text01
    get_date(),                                                                    # Text02
    get_dec(1, 15),                                                                # Text03
    str_exact_chars(3),                                                            # Text04
    get_dec(1, 15),                                                                # Text05
    fake.pystr_format(string_format='###-###-###-###-##{{random_int}}')[:-4],      # Text06
    get_dec(1, 1),                                                                 # IntWritable01
    fake.pystr_format(string_format='###-###-###-###-####{{random_int}}')[:-2],    # Text07
    fake.ipv6(),                                                                   # Text08
    fake.ipv6(),                                                                   # Text09
    '',                                                                            # Text10
    get_dec(5, 5),                                                                 # Text11
    get_dec(5, 5),                                                                 # Text12
    get_dec(2, 2),                                                                 # Text13
    get_dec(8, 12),                                                                # LongWritable01
    get_dec(8, 12),                                                                # LongWritable02
    get_dec(8, 12),                                                                # LongWritable03
    get_dec(8, 12),                                                                # LongWritable04
    get_dec(8, 12),                                                                # LongWritable05
    get_dec(8, 12),                                                                # LongWritable06
    str_exact_chars(10),                                                           # Text14
    str_exact_chars(10),                                                           # Text15
    fake.pyint(),                                                                  # IntWritable02
    str_exact_chars(12),                                                           # Text16
    fake.pystr_format(string_format='##{{random_int}}'),                           # Text17
    fake.ipv4(),                                                                   # Text18
    str_exact_chars(3),                                                            # Text19
    fake.pystr_format(string_format="???_?_??_???_????_?"),                        # Text20
    fake.pyint(),                                                                  # IntWritable03
    fake.pyint(),                                                                  # IntWritable04
    fake.pystr_format(string_format="#???###????###??"),                           # Text21
    fake.pyint(),                                                                  # IntWritable05
    get_dec(1, 2),                                                                 # Text22
    "",                                                                            # Text23
    "",                                                                            # Text24
    fake.pystr_format(string_format="##|###|#####|###|##|####"),                   # Text25
    fake.pystr_format(string_format="##:#######:###:####"),                        # Text26
    fake.pyint(),                                                                  # IntWritable06
    fake.pyint(),                                                                  # IntWritable07
    fake.ipv6(),                                                                   # Text27
    fake.pyint(),                                                                  # IntWritable08
    fake.ipv6(),                                                                   # Text28
    fake.pyint(),                                                                  # IntWritable09
    fake.pyint(),                                                                  # IntWritable10
    fake.pyint(),                                                                  # IntWritable11
    "",                                                                            # Text29
    str_exact_chars(24),                                                           # Text30
    "",                                                                            # Text31
  ]

  return record

def generate_fake_data(out_path, file_size=1024 * 1024 * 10):
  """Generate fake records until about ``file_size`` characters exist, then write a csv.

  Args:
    out_path: destination path handed to ``DataFrame.to_csv``.
    file_size: target size. Generation stops once the running total of the
      stringified field lengths reaches this value. Separators, the header row
      and the index column written by ``to_csv`` are not counted, so this is an
      approximation of the size on disk, not an exact figure.

  Side effects:
    Writes (or overwrites) ``out_path`` and logs an INFO message.
  """
  size = 0
  records = []

  while size < file_size:
    record = generate_record()
    # Accumulate across records. Assigning the length of only the latest record
    # kept ``size`` at a few hundred characters, so the loop never finished for
    # any file_size larger than a single record.
    size += sum(len(str(value)) for value in record)
    records.append(record)

  # Rows are matched to the header by position; naming the columns explicitly
  # also means an empty result (file_size <= 0) still gets a header row.
  df = pd.DataFrame(records, columns=columns)
  df.to_csv(out_path)
  logging.info(f"Generated: {out_path}")

def generate_files(out_path, n_files = 5, file_size=1024 * 1024 * 10):
  """Write ``n_files`` csv files of fake data into ``out_path``.

  Args:
    out_path: directory the files are created in (joined with each file name).
    n_files: number of files to generate.
    file_size: target size passed through to ``generate_fake_data`` for each file.
  """
  for _ in range(n_files):
    # Draw a fresh random name on every iteration. Building the path once before
    # the loop made every iteration overwrite the same file, leaving a single
    # csv regardless of n_files.
    file_name = fake.pystr_format(string_format="??????????.csv")
    file_path = os.path.join(out_path, file_name)
    generate_fake_data(file_path, file_size)


In [100]:
def gzip_file(file_path):
  """Compress ``file_path`` into a ``.gz`` file next to it and delete the original.

  The destination is the source path with its last extension replaced by
  ``.gz`` (``data.csv`` -> ``data.gz``). The source is removed only after the
  copy completed without raising.

  Args:
    file_path: path of the file to compress.

  Raises:
    ValueError: if the destination path would be the source path itself
      (the input already ends in ``.gz``); compressing in place would truncate
      the file before it is read.
  """
  base = os.path.splitext(file_path)[0]
  dst_path = f"{base}.gz"

  if dst_path == file_path:
    raise ValueError(f"Refusing to compress {file_path!r} onto itself")

  # Read raw bytes: the gzip stream is opened in binary mode, and copying str
  # chunks from a text-mode handle into it raises TypeError.
  with open(file_path, "rb") as fd, gzip.open(dst_path, "wb") as gzfd:
    shutil.copyfileobj(fd, gzfd)

  os.remove(file_path)
  logging.info(f"Compressed: {dst_path}")

In [94]:
# Preview one generated record. It is built here so the cell also runs on a fresh
# kernel; the values are random, so they differ from run to run.
sample_record = generate_record()
sample_record

['04:04:06 06:17:23:000',
 '09:09:07 04:47:56:000',
 '237503428635467',
 'VZS',
 '93163749932',
 '069-678-346-387-77',
 '2',
 '388-630-450-251-613450',
 '68af:b5ce:3e7f:1565:3c6:dc2a:5417:2447',
 'ca82:f7ed:b38d:d4f2:cf1f:f172:a761:e223',
 '',
 '63446',
 '44989',
 '23',
 '97680902',
 '170586192',
 '78100466769',
 '4131020837',
 '622047749685',
 '229398760573',
 'hXPMiqVClQ',
 'vdFhRsuEnf',
 5676,
 'dmmGfDPnQTbW',
 '986369',
 '198.4.249.177',
 'cts',
 'Bpb_A_HH_mGq_YaYU_u',
 9605,
 3593,
 '7gvq572Bwqv923pS',
 7436,
 '11',
 '',
 '',
 '29|862|65115|481|49|5723',
 '80:9163731:799:7541',
 7914,
 4187,
 '175c:6e04:cb52:875e:8ace:d0f4:69de:105a',
 5298,
 'ac79:96f:415d:a84:7b01:eb18:8541:fe01',
 4469,
 7440,
 3138,
 '',
 'URWJjesgxDqJTeQweycMgcyq',
 '']