# Install all libraries



In [1]:
!pip install faker
!pip install numpy
!pip install pandas
!pip install tqdm

Collecting faker
  Downloading Faker-24.0.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-24.0.0


# Import all libraries

In [2]:
from google.colab import drive
from faker import Faker
import pandas as pd
import numpy as np
import json
import random
import string
from tqdm import tqdm

# Mount local drive

In [3]:
drive.mount("/content/drive")

Mounted at /content/drive


# Parameters

In [5]:
with open('drive/MyDrive/config/parameters.json', "r") as data_file:
  parameters = json.load(data_file)

PATH_OUTPUT_FILE = parameters["path_output_file"]

# dimensione massima datasets
MAX_DIM_DATASETS = parameters["max_dim_datasets"]

# numero di iban
NUM_IBAN = MAX_DIM_DATASETS // 10

# range numero entry per iban
MIN_RANGE_ENTRY = parameters["min_range_entry"]
MAX_RANGE_ENTRY = parameters["max_range_entry"]

# range titolari conto condiviso
MIN_RANGE_HOLDERS = parameters["min_range_holders"]
MAX_RANGE_HOLDERS = parameters["max_range_holders"]

# Temperature factor ---> controlla il fattore di distorsione di una stringa
T = parameters["T"]
# Changeable factor ---> controlla il fattore di aggiunta di spazi bianchi e romozione di parole
C = parameters["C"]

BIC_COUNTRY_CODES = parameters["bic_country_codes"]
FAKER_COUNTRY_CODES = parameters["faker_country_codes"]

FileNotFoundError: [Errno 2] No such file or directory: 'drive/MyDrive/config/parameters.json'

#Utility functions

In [7]:
# def load_json_file(filePath):
#   """ Load a Json file """
#   data_file = open(filePath)
#   return json.load(data_file)


def create_dataset():
  """ create a new dataset as a pandas dataframe """
  return pd.DataFrame(columns=["BIC", "AccountNumber", "CTRYbnk", "Name", "Address", "IsShared", "Holder"])


def save_dataset(dataset, filePath):
  """ Save the dataset generated """
  dataset.to_excel(filePath)


def compute_entry_range():
  """ Compute the range(min, max) of the entity """
  new_range = []
  for i in range(MIN_RANGE_ENTRY,MAX_RANGE_ENTRY):
    if i < 5:
      new_range += [i for _ in range((MAX_RANGE_ENTRY) // 2)]
    elif i > 5 and i < 15:
      new_range += [i for _ in range(MAX_RANGE_ENTRY)]
    else:
        if(i % 10 == 0): new_range.append(i)

  return new_range

## BIC and IBAN generator

In [None]:
def bic_manual_generator():
  country_code = random.choice(BIC_COUNTRY_CODES)
  bank_code = "".join([random.choice(string.ascii_uppercase) for _ in range(4)])
  location_code = random.choice(string.ascii_uppercase) + random.choice(string.ascii_uppercase+'012')
  return bank_code + country_code + location_code, country_code


def bic_generator():
  country_code = random.choice(FAKER_COUNTRY_CODES)
  fake = Faker(country_code)
  return fake.swift()


def iban_generator():
  country_code = random.choice(FAKER_COUNTRY_CODES)
  fake = Faker(country_code)
  return fake.iban()

## Company and andress generator

In [None]:
def company_generator(country_code):
  fake = Faker(country_code)
  return fake.company()


def address_generator(country_code):
  fake = Faker(country_code)
  return fake.address()


def companies_info_generator(country_code, num_companies):
  companies = dict()
  num_companies_generated = 0

  while num_companies_generated != num_companies:
    description = company_generator(country_code)
    if description not in companies:
      if np.random.randint(0,2):
        address = address_generator(country_code)
      else:
        address = ""
      companies[description] = {"num_entry": 1, "address": address}
      num_companies_generated += 1

  return companies

## Permutation generator

In [None]:
def generate_permutations(name, rowNumber):
  """ Generate aliases by introducing transcription errors.
      The number of the aliases generated depends by the
      rowNumber parametes """

  words = name.split()
  aliases = []
  newT = T

  # The name is made by more than 1 word
  if len(words) > 1:
    for i in range(rowNumber):
      check = True
      for j in range(len(words)):

        # Add additional spaces
        if check and random.random() < C:
          check = False
          alias_with_spaces = list(words)
          alias_with_spaces.insert(j, '')
          aliases.append(' '.join(alias_with_spaces))
          break

        # Remove a word
        if check and random.random() < C:
          check = False
          alias_without_word = list(words)
          del alias_without_word[j]
          aliases.append(' '.join(alias_without_word))
          break

      if check: aliases.append(name)

  # The name is a single word
  else:
    aliases = [name for i in range(rowNumber)]
    newT = T + (0.3 * T)


  # Introduce transcription errors based on T (Temperature) value
  for j,alias in enumerate(aliases):
    word = list(alias)
    random_positions = random.sample(range(len(word)), random.randint(0, len(word) // 2))
    for i in random_positions:
        if random.random() < newT: word[i] = random.choice([' ', '.', ',', '&', '-', '+'])
    aliases[j] = ''.join(word)

  aliases[0] = name
  return aliases


def generate_permutations_by_name_length(name):
  """ Generate aliases by introducing transcription errors.
      The number of the aliases generated depends by the
      length of name parametes """

  words = name.split()
  aliases = []

  for i in range(len(words)):
    for j in range(i+1, len(words)):

      # Add additional spaces
      if random.random() < C:
        alias_with_spaces = list(words)
        alias_with_spaces.insert(j, '')
        aliases.append(' '.join(alias_with_spaces))
      else: aliases.append(name)

      # Remove a word
      if random.random() < C:
        alias_without_word = list(words)
        del alias_without_word[j]
        aliases.append(' '.join(alias_without_word))
      else: aliases.append(name)


  # Introduce transcription errors based on temperature
  for j,alias in enumerate(aliases):
      word = list(alias)
      random_positions = random.sample(range(len(word)), random.randint(0, len(word)// 2))
      for i in random_positions:
        if random.random() < T: word[i] = random.choice([' ', '.', ',', '&', '-', '+'])
      aliases[j] = ''.join(word)

  aliases[0] = name
  return aliases

In [None]:
# original_name = "MICHELE COLOMBINO AND CO LTD"
# aliases = generate_permutations(original_name, 5)
# for alias in aliases: print(alias)

# Dataset generator

In [None]:
def dataset_generator():
  dataset = create_dataset()
  new_range = compute_entry_range()

  for i in tqdm(range(NUM_IBAN)):
    # generazione BIC
    bic, bic_country_code = bic_manual_generator()

    # generazione IBAN
    iban = iban_generator()

    # generazione numero di entry per questo IBAN
    # todo: controllare probabilità: più probabile medio-piccoli
    num_iban_entry = np.random.choice(new_range)

    # scelta se IBAN è condiviso e, in caso, da quanti titolari
    is_shared = np.random.randint(0,2) if num_iban_entry != 1 else 0
    if is_shared:
      num_holders = np.random.randint(MIN_RANGE_HOLDERS, num_iban_entry+1 if num_iban_entry < MAX_RANGE_HOLDERS else MAX_RANGE_HOLDERS+1)
    else:
      num_holders = 1

    # generazione nome società e eventuali indirizzi
    country_code = np.random.choice(country_codes)
    companies_info = companies_info_generator(country_code, num_companies=num_holders)

    # scelta quante entry per ogni società
    if is_shared and num_holders != num_iban_entry:
      entry_to_generate = num_iban_entry - num_holders
      while entry_to_generate != 0:
        random_holder = np.random.choice(list(companies_info.keys()))
        new_num_entry = np.random.randint(1,entry_to_generate+1) if entry_to_generate != 1 else 1
        companies_info[random_holder]["num_entry"] += new_num_entry
        entry_to_generate -= new_num_entry
    elif not is_shared:
      companies_info[list(companies_info.keys())[0]]["num_entry"] = num_iban_entry

    for name,info in companies_info.items():
      ## todo: Distocere gli indirizzi
      if info["num_entry"] != 1:
        aliases = generate_permutations(name, info["num_entry"])
        for alias in aliases:
          dataset.loc[len(dataset.index)] = [bic, iban, bic_country_code, alias, companies_info[name]["address"], is_shared, name]
      else:
        dataset.loc[len(dataset.index)] = [bic, iban, bic_country_code, name, companies_info[name]["address"], is_shared, name]

    # print("bic\t" + bic)
    # print("iban\t" + iban)
    # print("num_iban_entry\t" + str(num_iban_entry))
    # print("is_shared\t" + str(is_shared))
    # print("num_holders\t" + str(num_holders))
    # for c in companies_info.items():
    #   print(c)

  return dataset

In [None]:
dataset = dataset_generator()

In [None]:
path = "/content/drive/MyDrive/"
datasetName = "datasetProva.xlsx"
save_dataset(dataset, path + datasetName)