In [1]:
import pandas as pd
import numpy as np
import random
import datetime
from unidecode import unidecode

In [2]:
# Earliest date the generators may produce; rand_date uses it as its default lower bound.
COMPANY_ESTABLISHMENT_DATE = datetime.datetime(year=2022, month=5, day=29)

In [3]:
# Frequency tables used as weighted populations by rand_females / rand_males:
# first names are read from "IMIĘ PIERWSZE" weighted by "LICZBA WYSTĄPIEŃ",
# surnames from "Nazwisko aktualne" weighted by "Liczba".
female_names = pd.read_csv("data/female_names.csv")
female_surnames = pd.read_csv("data/female_surnames.csv")
male_names = pd.read_csv("data/male_names.csv")
male_surnames = pd.read_csv("data/male_surnames.csv")

# Pool of street addresses; rand_address draws rows from it and removes each drawn row.
addresses = pd.read_excel("data/addresses.xlsx")

In [4]:
# Preview of the raw address pool. rand_address rebinds `addresses` to a smaller
# frame on every draw, so re-running this cell after generation shows fewer rows.
addresses

Unnamed: 0,LOKALNYID,ULICA_LOKALNYID,ULICA_NAZWA,NUMER_ADR,KOD_POCZTOWY,ETYKIETA_NAZWA_SKROCONA,GUS_TERC_DELEGATURA,RMWROC_OSIEDLE
0,pa_32869,ul_1742,ul. 3 Maja,11,52-119,3 Maja 11,Wrocław-Krzyki,Brochów
1,pa_47201,ul_1742,ul. 3 Maja,12,52-119,3 Maja 12,Wrocław-Krzyki,Brochów
2,pa_34880,ul_1742,ul. 3 Maja,13,52-119,3 Maja 13,Wrocław-Krzyki,Brochów
3,pa_51240,ul_1742,ul. 3 Maja,13d,52-119,3 Maja 13d,Wrocław-Krzyki,Brochów
4,pa_39299,ul_1742,ul. 3 Maja,1b,52-119,3 Maja 1b,Wrocław-Krzyki,Brochów
...,...,...,...,...,...,...,...,...
71801,pa_44827,ul_1454,ul. Żywopłotowa,5,51-007,Żywopłotowa 5,Wrocław-Psie Pole,Osobowice - Rędzin
71802,pa_40891,ul_1454,ul. Żywopłotowa,6,51-007,Żywopłotowa 6,Wrocław-Psie Pole,Osobowice - Rędzin
71803,pa_55989,ul_1454,ul. Żywopłotowa,7,51-007,Żywopłotowa 7,Wrocław-Psie Pole,Osobowice - Rędzin
71804,pa_36422,ul_1454,ul. Żywopłotowa,8,51-007,Żywopłotowa 8,Wrocław-Psie Pole,Osobowice - Rędzin


In [5]:
def rand_email(name, surname):
    """Build a random lower-case e-mail address for a person.

    The local part is either the full first name or its first letter, a dot,
    and the surname (both passed through ``unidecode``), optionally followed by
    a number drawn from a geometric distribution with p=0.5. The domain is one
    of ``@wp.pl``, ``@gmail.com`` or ``@onet.pl``.

    Parameters
    ----------
    name : str
        First name; must be non-empty because its first character may be used.
    surname : str
        Family name.

    Returns
    -------
    str
        The generated address, lower-cased.
    """
    # The draws below are made in the same order as in the original one-liner.
    given_part = unidecode(random.choice([name, name[0]]))
    family_part = unidecode(surname)
    # The geometric sample is always drawn, even when the empty suffix wins.
    number_suffix = random.choice(["", str(np.random.geometric(0.5))])
    domain = random.choice(["@wp.pl", "@gmail.com", "@onet.pl"])
    address = given_part + "." + family_part + number_suffix + domain
    return address.lower()

In [6]:
def rand_females(n):
    """Generate ``n`` random female identities.

    First names and surnames are sampled with replacement from the
    ``female_names`` / ``female_surnames`` tables, weighted by their occurrence
    counts, and an e-mail is derived for each pair with ``rand_email``.

    Returns
    -------
    list of str
        ``"<first name> <surname> <email>"`` strings, separated by single spaces.
    """
    first_names = random.choices(
        female_names["IMIĘ PIERWSZE"],
        weights = female_names["LICZBA WYSTĄPIEŃ"],
        k = n,
    )
    last_names = random.choices(
        female_surnames["Nazwisko aktualne"],
        weights = female_surnames["Liczba"],
        k = n,
    )
    # All e-mails are generated before any record is assembled, as in the original.
    mailboxes = [rand_email(first, last) for first, last in zip(first_names, last_names)]

    records = []
    for first, last, mailbox in zip(first_names, last_names, mailboxes):
        records.append(" ".join([first, last, mailbox]))
    return records

In [7]:
def rand_males(n):
    """Generate ``n`` random male identities.

    First names and surnames are sampled with replacement from the
    ``male_names`` / ``male_surnames`` tables, weighted by their occurrence
    counts, and an e-mail is derived for each pair with ``rand_email``.

    Returns
    -------
    list of str
        ``"<first name> <surname> <email>"`` strings, separated by single spaces.
    """
    first_names = random.choices(
        male_names["IMIĘ PIERWSZE"],
        weights = male_names["LICZBA WYSTĄPIEŃ"],
        k = n,
    )
    last_names = random.choices(
        male_surnames["Nazwisko aktualne"],
        weights = male_surnames["Liczba"],
        k = n,
    )
    # All e-mails are generated before any record is assembled, as in the original.
    mailboxes = [rand_email(first, last) for first, last in zip(first_names, last_names)]

    records = []
    for first, last, mailbox in zip(first_names, last_names, mailboxes):
        records.append(" ".join([first, last, mailbox]))
    return records

In [8]:
def rand_people(n):
    """Generate ``n`` random identities with a roughly even female/male split.

    The number of women is a normal sample with mean ``0.5 * n`` and standard
    deviation ``0.02 * n``, truncated to an int; the rest are men. The combined
    list is shuffled before being returned.

    Returns
    -------
    list of str
        Records as produced by ``rand_females`` / ``rand_males``.
    """
    women_count = int(np.random.normal(loc = 0.5 * n, scale = 0.02 * n))
    men_count = n - women_count
    crowd = rand_females(women_count)
    crowd = crowd + rand_males(men_count)
    random.shuffle(crowd)
    return crowd

In [9]:
def rand_phone_numbers(n):
    """Generate ``n`` random phone numbers in the form ``+48`` followed by nine digits.

    The first of the nine digits is one of 5, 6, 7 or 8; the remaining eight
    come from a random integer in ``[10**7, 10**8 - 1]``.

    Returns
    -------
    list of str
    """
    numbers = []
    for _ in range(n):
        leading_digit = random.choice(["5", "6", "7", "8"])
        remaining_digits = str(random.randint(10**7, 10**8-1))
        numbers.append("+48" + leading_digit + remaining_digits)
    return numbers

In [10]:
def rand_date(start = None, stop = None):
    """Draw a random date between ``start`` and ``stop`` in whole-day steps.

    Parameters
    ----------
    start : datetime.datetime, optional
        Earliest possible result. Defaults to ``COMPANY_ESTABLISHMENT_DATE``.
    stop : datetime.datetime, optional
        Latest possible moment. Defaults to the current time, read on every
        call rather than once when the function is defined.

    Returns
    -------
    datetime.datetime
        ``start`` shifted forward by a random whole number of days, never
        later than ``stop``.

    Raises
    ------
    ValueError
        If ``stop`` is earlier than ``start``.
    """
    # Defaults are resolved at call time: a `datetime.now()` default in the
    # signature would be frozen at the moment the defining cell was executed.
    if start is None:
        start = COMPANY_ESTABLISHMENT_DATE
    if stop is None:
        stop = datetime.datetime.now()

    deltadays = (stop - start).days
    if deltadays < 0:
        raise ValueError(f"stop ({stop}) must not be earlier than start ({start})")
    return start + datetime.timedelta(days = random.randint(0, deltadays))

In [11]:
def rand_dates_and_salaries(n):
    """Generate employment date, dismissal date and salary for ``n`` employees.

    For each employee an employment date is drawn with ``rand_date()``; with
    equal probability the employee is either still employed (dismissal date
    ``None``) or was dismissed on a random date not earlier than the
    employment date.

    Salary rules:
    * dismissed employees get ``None``;
    * employees hired more than 26 weeks ago get 5500;
    * all other current employees get 4500.

    Returns
    -------
    list of tuple
        ``(employment_date, dismissal_date_or_None, salary_or_None)`` per employee.
    """
    # TODO: check that at least one employee is employed at every moment
    now = datetime.datetime.now()  # read once so all rows use the same reference time
    higher_salary_after = datetime.timedelta(weeks = 26)

    result = []
    for _ in range(n):
        empl_date = rand_date()
        # The candidate dismissal date is always drawn, then kept or discarded.
        dism_date = random.choice([None, rand_date(start = empl_date)])
        if dism_date is not None:
            salary = None
        elif now - empl_date > higher_salary_after:
            salary = 5500
        else:
            salary = 4500
        result.append((empl_date, dism_date, salary))
    return result

In [12]:
def rand_address():
    """Draw one address from the global ``addresses`` pool and remove it from the pool.

    Side effect: the module-level ``addresses`` frame is rebound to a copy
    without the drawn row (index reset), so no address is handed out twice.

    Returns
    -------
    tuple
        ``(street, street_number, "Wrocław", postal_code)``. ``street`` is the
        ``ULICA_NAZWA`` value with its first four characters removed.

    Raises
    ------
    ValueError
        If the pool is empty.
    """
    global addresses
    if len(addresses) == 0:
        raise ValueError("no unused addresses left in the pool")

    # randrange excludes the upper bound; randint(0, len(addresses)) could
    # return len(addresses), one past the last valid row.
    position = random.randrange(len(addresses))
    row = addresses.iloc[position]
    # Read fields by label: integer keys on a label-indexed Series rely on
    # deprecated positional fallback.
    street_name = row["ULICA_NAZWA"]
    street_number = row["NUMER_ADR"]
    postal_code = row["KOD_POCZTOWY"]

    addresses = addresses.drop(addresses.index[position]).reset_index(drop = True)
    # NOTE(review): [4:] strips a 4-character prefix such as "ul. "; a street
    # name without such a prefix would lose real characters - confirm against the data.
    return street_name[4:], street_number, "Wrocław", postal_code

In [30]:
def generate_customers(n, addresses_number):
    """Build the customer table with ``n`` rows.

    Parameters
    ----------
    n : int
        Number of customers to generate.
    addresses_number : int
        Number of rows in the address table; each customer gets a random
        ``Address_ID`` in ``1..addresses_number``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Customer_ID`` (1..n) with columns ``First_name``,
        ``Last_name``, ``Email``, ``Phone_number`` and ``Address_ID``.
    """
    df = pd.DataFrame()
    df["Customer_ID"] = range(1, n+1)
    # rand_people returns "first last email" strings; split them into three columns.
    # NOTE(review): a name or surname containing a space would yield more than
    # three parts and make this assignment fail.
    df[["First_name", "Last_name", "Email"]] = pd.Series(rand_people(n)).str.split(" ", expand = True)
    df["Phone_number"] = rand_phone_numbers(n)
    # generate_addresses numbers its rows 1..addresses_number, so draw from that
    # range; 0..addresses_number-1 would reference a non-existent ID 0.
    df["Address_ID"] = [random.randint(1, addresses_number) for _ in range(n)]
    df = df.set_index("Customer_ID")
    return df

In [31]:
def generate_employees(n, addresses_number):
    """Build the employee table with ``n`` rows.

    Parameters
    ----------
    n : int
        Number of employees to generate.
    addresses_number : int
        Number of rows in the address table; each employee gets a random
        ``Address_ID`` in ``1..addresses_number``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Employee_ID`` (1..n) with columns ``First_name``,
        ``Last_name``, ``Email``, ``Phone_number``, ``Address_ID``,
        ``Employment_date``, ``Dismissal_date`` and ``Salary``.
    """
    df = pd.DataFrame()
    df["Employee_ID"] = range(1, n+1)
    # rand_people returns "first last email" strings; split them into three columns.
    # NOTE(review): a name or surname containing a space would yield more than
    # three parts and make this assignment fail.
    df[["First_name", "Last_name", "Email"]] = pd.Series(rand_people(n)).str.split(" ", expand = True)
    df["Phone_number"] = rand_phone_numbers(n)
    # generate_addresses numbers its rows 1..addresses_number, so draw from that
    # range; 0..addresses_number-1 would reference a non-existent ID 0.
    df["Address_ID"] = [random.randint(1, addresses_number) for _ in range(n)]
    df[["Employment_date", "Dismissal_date", "Salary"]] = rand_dates_and_salaries(n)
    df = df.set_index("Employee_ID")
    return df

In [32]:
def generate_addresses(n):
    """Build the address table with ``n`` rows drawn via ``rand_address``.

    Each call to ``rand_address`` removes the drawn row from the global
    address pool, so running this function shrinks that pool by ``n`` rows.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Address_ID`` (1..n) with columns ``Street``,
        ``Street_number``, ``City`` and ``Postal_code``.
    """
    field_names = ["Street", "Street_number", "City", "Postal_code"]
    table = pd.DataFrame()
    table["Address_ID"] = range(1, n+1)

    drawn_rows = []
    for _ in range(n):
        drawn_rows.append(rand_address())
    table[field_names] = drawn_rows

    return table.set_index("Address_ID")

In [33]:
def main(addresses_number, customers_number, employees_number):
    """Generate the three tables in order: addresses, customers, employees.

    Parameters
    ----------
    addresses_number : int
        Number of address rows; also passed to the customer and employee
        generators as the size of the address table.
    customers_number : int
        Number of customer rows.
    employees_number : int
        Number of employee rows.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(addresses, customers, employees)``.
    """
    address_table = generate_addresses(addresses_number)
    customer_table = generate_customers(customers_number, addresses_number)
    employee_table = generate_employees(employees_number, addresses_number)
    return address_table, customer_table, employee_table

In [34]:
# Generate the data set: 103 addresses, 100 customers, 3 employees.
address_df, customer_df, employee_df = main(
    addresses_number = 103,
    customers_number = 100,
    employees_number = 3,
)

In [35]:
# Generated address table, as returned by main().
address_df

Unnamed: 0_level_0,Street,Street_number,City,Postal_code
Address_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Kwiska,102,Wrocław,54-210
2,Karmelkowa,10,Wrocław,52-436
3,Wilkaska,16,Wrocław,54-104
4,Lwowska,13,Wrocław,53-515
5,Średzka,8c,Wrocław,54-017
...,...,...,...,...
99,Gliniana,89c,Wrocław,50-526
100,Ruciana,20a,Wrocław,51-253
101,Lucjana Siemieńskiego,4a,Wrocław,50-228
102,Pejzażowa,56,Wrocław,54-007


In [36]:
# Generated customer table, as returned by main().
customer_df

Unnamed: 0_level_0,First_name,Last_name,Email,Phone_number,Address_ID
Customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,STEFANIA,ŚCIUBA,stefania.sciuba9@gmail.com,+48631944447,13
2,EDYTA,KUBIK,edyta.kubik1@onet.pl,+48621892088,26
3,OLIWIA,FRYDRYCKA,oliwia.frydrycka@wp.pl,+48719425776,55
4,AGNIESZKA,DUŁAK,a.dulak2@onet.pl,+48527292126,100
5,EUGENIA,ŻYWOT,eugenia.zywot1@gmail.com,+48845554233,63
...,...,...,...,...,...
96,BERKE,CZEMKO,berke.czemko@gmail.com,+48731536975,47
97,MIROSŁAWA,MALINOWSKA,m.malinowska1@gmail.com,+48884688730,11
98,GRAŻYNA,SAJDA,grazyna.sajda@onet.pl,+48674408559,79
99,BEATA,KIERSZK,b.kierszk1@onet.pl,+48819324736,55


In [37]:
# Generated employee table, as returned by main().
employee_df

Unnamed: 0_level_0,First_name,Last_name,Email,Phone_number,Address_ID,Employment_date,Dismissal_date,Salary
Employee_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,FILIP,WALCZYK,filip.walczyk@gmail.com,48871929592,99,2022-11-11,2023-03-24,
2,STEFAN,KAWECKI,stefan.kawecki@onet.pl,48654251949,58,2023-03-10,NaT,4500.0
3,JANINA,LIGĘZA,janina.ligeza@gmail.com,48878757911,12,2023-02-16,NaT,4500.0
