In [22]:
import pandas as pd
import numpy as np
import random, string
from collections import Counter
from unidecode import unidecode

# customer table

In [2]:
random.seed(123)
n = 1329 #number of customers

Data taken from GUS - Główny Urząd Statystyczny https://www.gov.pl/
* last_names_F.csv - popular female surnames in Poland in 2022
* last_names_M.csv - popular male surnames in Poland in 2022
* first_names_F.csv - popular female names in Poland in 2022
* first_names_M.csv - popular male names in Poland in 2022

In [3]:
ln_F = pd.read_csv('last_names_F.csv', sep = ",")
ln_F = ln_F.rename(columns={'Liczba': 'Number'}) # Actions taken to unify the structure of csv files
ln_F = ln_F.rename(columns={'Nazwisko aktualne': 'Name'})

ln_M = pd.read_csv('last_names_M.csv', sep = ",")
ln_M = ln_M.rename(columns={'Liczba': 'Number'})
ln_M = ln_M.rename(columns={'Nazwisko aktualne': 'Name'})

fn_F = pd.read_csv('first_names_F.csv', sep = ",")
fn_F = fn_F.rename(columns={'LICZBA WYSTĄPIENIEŃ': 'Number'})
fn_F = fn_F.rename(columns={'IMIĘ PIERWSZE': 'Name'})

fn_M = pd.read_csv('first_names_M.csv', sep = ",")
fn_M = fn_M.rename(columns={'LICZBA WYSTĄPIEŃ': 'Number'})
fn_M = fn_M.rename(columns={'IMIĘ PIERWSZE': 'Name'})

In [4]:
def normalise(n_x):
    """
    A function that creates a probability vector for the occurrence of data.
    
    Args:
        n_x: a data table containing a vector of their occurrences
        
    Returns:
        P_n_x: probability vector for the occurrence of data
    """
    
    P_n_x = [n_x[i]/np.sum(n_x) for i in range(len(n_x))]
    P_n_x = P_n_x/ min(P_n_x)
    P_n_x = P_n_x/ np.sum(P_n_x)
    return P_n_x

In [5]:
def first_and_last_name(n, n_F, n_M, probab):
    """
    A function that creates a vector of names, containing male and
    female names in the proportion given by the probability vector.
    
    Args:
        n: vector length
        n_F: a table of female names containing a vector of their occurrences
        n_M: a table of male names containing a vector of their occurrences
        probab: a vector containing the probability of occurrence of female and male names 
        
    Returns:
        name: vector of names
    """
    
    N_F = [str(n_F['Name'][i]).lower().capitalize() for i in range(n_F.shape[0])]
    N_M = [str(n_M['Name'][i]).lower().capitalize() for i in range(n_M.shape[0])]

    name = np.zeros(n).astype(str)

    x = np.random.choice(["F", "M"], size=n, replace=True, p=probab) 
    name_F = np.random.choice(N_F, size=n, replace=True, p=normalise(n_F['Number']))
    name_M = np.random.choice(N_M, size=n, replace=True, p=normalise(n_M['Number']))

    for i in range(n):
        if x[i] == "F":
            name[i] = name_F[i]
        else:
            name[i] = name_M[i]
            
    return name

* total number of inhabitants of Wrocław: 674 079
* number of female inhabitants of Wrocław: 359537 (53%)
* number of male inhabitants of Wrocław: 314542 (47%)

In [6]:
first_name = first_and_last_name(n, fn_F, fn_M, probab=[0.53, 0.47])
last_name = first_and_last_name(n, ln_F, ln_M, probab=[0.53, 0.47])
# proba=[0.53, 0.47] for the appropriate denominator of women to men
# values selected on the basis of the number of female and male inhabitants of Wrocław

In [7]:
customer_id = range(1,n+1)
dict_customer = {'customer_id':customer_id, "first_name": first_name, "last_name": last_name}
customer = pd.DataFrame(dict_customer)
customer

Unnamed: 0,customer_id,first_name,last_name
0,1,Emilia,Kurzidem
1,2,Oliwier,Wołek
2,3,Pola,Krawczyk
3,4,Lena,Towarek
4,5,Adrianna,Bławicki
...,...,...,...
1324,1325,Tymon,Czechowska
1325,1326,Julia,Barteczka
1326,1327,Jakub,Brzeska
1327,1328,Gabriela,Kaczanowski


Ranking of the most frequently used e-mail services: https://interaktywnie.com/biznes/artykuly/biznes/przeglad-ktora-poczta-e-mail-jest-najlepsza-16950

In [8]:
# the number of users of the given e-mail addresses
domains = {'@gmail.com': 8968134, '@wp.pl': 6895602, '@onet.pl': 4302509,
          '@interia.pl': 2677345, '@o2.pl': 2855010, '@student.pwr.edu.pl': 21902}

In [9]:
def customer_email(n, customer, domains):
    """
    A function that creates random email addresses based on a person's first and last name.
    
    Args:
        n: vector length
        customer: a table containing the person's first and last name in the columns "first_name" and "last_name"
        domains: a dictionary containing domain names and the number of users using them
        
    Returns:
        email: vector of emails
    """
    
    email = np.zeros(n).astype(str)

    for i in range(n):

        x = random.choice(range(1,3))

        if x == 1:
            nazwa = customer["first_name"][i]
            y = random.choice(range(1,5))
            if y == 1:
                nazwa += "." + customer["last_name"][i]
            elif y == 2:
                nazwa += customer["last_name"][i]
            elif y == 3:
                nazwa += "." + customer["last_name"][i][0:random.choice(range(3,5))]
            else:
                nazwa += str(random.choice(range(100,10000)))

        else:
            nazwa = customer["last_name"][i] 
            y = random.choice(range(1,5))
            if y == 1:
                nazwa += "." + customer["first_name"][i]
            elif y == 2:
                nazwa += customer["first_name"][i]
            elif y == 3:
                nazwa += "." + customer["first_name"][i][0:random.choice(range(2,5))]
            else:
                nazwa += str(random.choice(range(100,10000)))

        email[i] = nazwa
    
    domains = np.random.choice(list(domains.keys()), size=len(email), replace=True, p=normalise(list(domains.values()))) 

    for i in range(n):
        email[i] += domains[i]
    email = [unidecode(i) for i in email] 
    
    return email

In [10]:
email = customer_email(n, customer, domains)

Information about the 2 digits that start phone numbers in Poland: https://pl.wikipedia.org/wiki/Numery_telefoniczne_w_Polsce

In [11]:
def phone_number(n):
    """
    A function that creates a vector of random numbers that may occur in Poland.
    
    Args:
        n: vector length
        
    Returns:
        phone: vector of phone numbers
    """
    
    phone = np.zeros(n).astype(int)
    for i in range(n):
        number =  str(np.random.choice([45, 50, 51, 53, 57, 60, 66, 69, 72, 73, 78, 79, 88])) +\
                  str(random.choice(range(1000000, 9000000)))
        phone[i] = int(number)
    return phone

In [12]:
phone = phone_number(n)

customer["email"] = email
customer["phone"] = phone

In [13]:
# To make sure there are no same phone numbers
counter = Counter(phone)
result = [i for i, j in counter.items() if j > 1]
if result != []:
    print("Same phone numbers!!")
else:
    print("phone: Everything ok.")

phone: Everything ok.


In [14]:
customer

Unnamed: 0,customer_id,first_name,last_name,email,phone
0,1,Emilia,Kurzidem,Emilia.Kur@interia.pl,511491634
1,2,Oliwier,Wołek,Wolek.Ol@wp.pl,602370041
2,3,Pola,Krawczyk,Pola8885@o2.pl,783017148
3,4,Lena,Towarek,Towarek.Le@gmail.com,514919744
4,5,Adrianna,Bławicki,AdriannaBlawicki@gmail.com,517698695
...,...,...,...,...,...
1324,1325,Tymon,Czechowska,Czechowska7432@gmail.com,453904401
1325,1326,Julia,Barteczka,Julia.Bar@interia.pl,698455331
1326,1327,Jakub,Brzeska,Jakub.Brzeska@o2.pl,571806955
1327,1328,Gabriela,Kaczanowski,Gabriela4145@gmail.com,515851916


In [15]:
customer.to_csv("customer.csv", sep = ",", encoding='utf-8-sig', index=False)

# staff table

In [16]:
random.seed(123)
n = 17 # number of employees

In [17]:
first_name = first_and_last_name(n, fn_F, fn_M, probab=[0.35, 0.65])
last_name = first_and_last_name(n, ln_F, ln_M, probab=[0.35, 0.65])

In [18]:
staff_id = range(1,n+1)
dict_staff = {'staff_id': staff_id, "first_name": first_name, "last_name": last_name}
staff = pd.DataFrame(dict_staff)
staff

Unnamed: 0,staff_id,first_name,last_name
0,1,Maksym,Pawlik
1,2,Franciszek,Lewandowska
2,3,Hanna,Kopeć
3,4,Antoni,Muszyński
4,5,Maksymilian,Piasecka
5,6,Antoni,Preś
6,7,Leon,Bartniczuk
7,8,Izabela,Sosnowska
8,9,Franciszek,Dumała
9,10,Antoni,Indyk


In [19]:
# employees have their own @dragons.com domain and email name created in one specific way
email = np.zeros(n).astype(str)
for i in range(n):
    email[i] = staff["first_name"][i] + "." + staff["last_name"][i] + "@dragons.com"
email = [unidecode(i) for i in email] 

In [20]:
phone = phone_number(n)

staff["phone"] = phone
staff["email"] = email

amount = [4310, 5140, 6530, 7280, 8320] # gross prices
salary = np.random.choice(amount, size=n, replace=True, p=[0.35, 0.30, 0.15, 0.10, 0.1]). astype(float)
staff["salary"] = salary 
staff

Unnamed: 0,staff_id,first_name,last_name,phone,email,salary
0,1,Maksym,Pawlik,791439257,Maksym.Pawlik@dragons.com,4310.0
1,2,Franciszek,Lewandowska,723245438,Franciszek.Lewandowska@dragons.c,5140.0
2,3,Hanna,Kopeć,781731374,Hanna.Kopec@dragons.com,5140.0
3,4,Antoni,Muszyński,887450473,Antoni.Muszynski@dragons.com,8320.0
4,5,Maksymilian,Piasecka,794416191,Maksymilian.Piasecka@dragons.com,4310.0
5,6,Antoni,Preś,793236015,Antoni.Pres@dragons.com,6530.0
6,7,Leon,Bartniczuk,511903455,Leon.Bartniczuk@dragons.com,4310.0
7,8,Izabela,Sosnowska,888033877,Izabela.Sosnowska@dragons.com,8320.0
8,9,Franciszek,Dumała,458559804,Franciszek.Dumala@dragons.com,7280.0
9,10,Antoni,Indyk,888338570,Antoni.Indyk@dragons.com,5140.0


In [21]:
staff.to_csv("staff.csv", sep = ",", encoding='utf-8-sig', index=False)