In [330]:
import datetime
import random
from itertools import chain

import numpy as np
import pandas as pd
from faker import Faker

In [331]:
new_games_df = pd.read_csv("data/games.csv", encoding= 'unicode_escape', sep = ";")

In [332]:
games_df = new_games_df[[ "details.name","details.yearpublished", 'details.playingtime', 'details.minage',  "details.minplayers", "details.maxplayers" , "details.description", "game.type"]]
games_df.rename(columns = {"details.name":"Name", "details.yearpublished": "Year Published", 'details.playingtime':"Playing Time", 'details.minage': "Min Age", "details.minplayers": "Min Players", "details.maxplayers": "Max Players", "details.description":"Description", "game.type":"Type"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [333]:
games_df.head()

Unnamed: 0,Name,Year Published,Playing Time,Min Age,Min Players,Max Players,Description,Type
0,Die Macher,1986.0,240.0,14.0,3.0,5.0,Die Macher is a game about seven sequential po...,boardgame
1,Dragonmaster,1981.0,30.0,12.0,3.0,4.0,Dragonmaster is a trick-taking card game based...,boardgame
2,Samurai,1998.0,60.0,10.0,2.0,4.0,"Part of the Knizia tile-laying trilogy, Samura...",boardgame
3,Tal der Könige,1992.0,60.0,12.0,2.0,4.0,When you see the triangular box and the luxuri...,boardgame
4,Acquire,1964.0,90.0,12.0,3.0,6.0,"In Acquire, each player strategically invests ...",boardgame


### CONSTANTS

In [334]:
GAMES_AMOUNT = 100
CUSTOMERS_AMOUNT = 1000
EMPLOYEES_AMOUNT = 4
YEAR = 2022
PRICE_FOR_DAY = 10

### GAMES

In [335]:
games_tbl = games_df.sample(n = GAMES_AMOUNT)
games_tbl.insert(0, "game_id", np.arange(1, GAMES_AMOUNT+1))
games_tbl["Price"] = np.round(np.random.uniform(100, 200, 100)) + 0.99
games_tbl.head()

Unnamed: 0,game_id,Name,Year Published,Playing Time,Min Age,Min Players,Max Players,Description,Type,Price
5901,1,Cheat,0.0,10.0,8.0,3.0,10.0,Cheat is a standard deck playing card game; it...,boardgame,190.99
12115,2,Raid the Pantry,2012.0,30.0,8.0,2.0,4.0,In the cooking-themed card game Raid the Pantr...,boardgame,184.99
14642,3,The Hobbit: Enchanted Gold,2014.0,30.0,8.0,2.0,2.0,"Bilbo and the dwarves set out to slay Smaug, b...",boardgame,150.99
7950,4,First Blood: Second Marne,2008.0,0.0,0.0,2.0,2.0,"First Blood: Second Marne, 15 July 1918 (FB), ...",boardgame,187.99
9040,5,Nur die Ziege zählt,2009.0,30.0,8.0,3.0,6.0,"Each round, every player gets a hand of 8 card...",boardgame,120.99


### CUSTOMERS

In [336]:
eng_first_names_df = pd.read_csv("data/english_first_names.csv").sort_values(by = ["Rank"])
eng_last_names_df = pd.read_csv("data/english_last_names.csv", sep=";")
pl_first_names_w_df = pd.read_csv("data/polish_female_names.csv")
pl_first_names_m_df = pd.read_csv("data/polish_male_names.csv")
pl_last_names_w_df = pd.read_csv("data/polish_female_last_names.csv")
pl_last_names_m_df = pd.read_csv("data/polish_male_last_names.csv")

In [337]:
# most popular names and surnames
eng_first_names = eng_first_names_df["Child's First Name"][0:2000]
eng_last_names = eng_last_names_df["SURNAME"] # only 1000 most popular
pl_first_names_w = pl_first_names_w_df["IMIĘ PIERWSZE"][0:100]
pl_first_names_m = pl_first_names_m_df["IMIĘ PIERWSZE"][0:100]
pl_last_names_w= pl_last_names_w_df["Nazwisko aktualne"][0:500]
pl_last_names_m = pl_last_names_m_df["Nazwisko aktualne"][0:500]
customers_tbl = pd.DataFrame()

In [338]:
proportions = np.array([0.4, 0.25, 0.35]) # ALL_ENG, W_PL, M_PL
numbers = proportions * CUSTOMERS_AMOUNT
phone_numbers = random.sample(range(100000000, 999999999), CUSTOMERS_AMOUNT + EMPLOYEES_AMOUNT)

In [339]:
customers_tbl["customer_id"] = np.arange( 1, CUSTOMERS_AMOUNT+1)
customers_tbl["first_name"]= np.concatenate([np.random.choice(eng_first_names,int( numbers[0])), np.random.choice(pl_first_names_w,int( numbers[1])), np.random.choice(pl_first_names_m,int( numbers[2]))])
customers_tbl["last_name"]= np.concatenate([np.random.choice(eng_last_names,int( numbers[0])), np.random.choice(pl_last_names_w,int( numbers[1])), np.random.choice(pl_last_names_m,int( numbers[2]))])
customers_tbl["first_name"] = customers_tbl["first_name"].apply(str.capitalize)
customers_tbl["last_name"] = customers_tbl["last_name"].apply(str.capitalize)

In [340]:
fake = Faker()
birth_dates = [fake.date_between(start_date = "-50y", end_date = "-20y").strftime("%d.%m.%Y") for _ in range(CUSTOMERS_AMOUNT)]
customers_tbl["birth_date"] = birth_dates

In [341]:
def generate_email(row):
    return row["first_name"].lower()+"."+ row["last_name"].lower() + "@mail.com"

In [342]:
customers_tbl["email"] = customers_tbl.apply(lambda row: generate_email(row), axis=1)

In [343]:
customers_tbl["phone_number"] = phone_numbers[0:CUSTOMERS_AMOUNT]

In [344]:
customers_tbl.head()

Unnamed: 0,customer_id,first_name,last_name,birth_date,email,phone_number
0,1,Angela,Crane,09.04.1975,angela.crane@mail.com,268637880
1,2,Aaliyah,Montgomery,05.06.1975,aaliyah.montgomery@mail.com,864325441
2,3,Benjamin,Fitzpatrick,15.08.1979,benjamin.fitzpatrick@mail.com,274816100
3,4,Josiah,Newton,05.01.2001,josiah.newton@mail.com,535630577
4,5,Ava,Callahan,17.06.1974,ava.callahan@mail.com,480360683


## Employees

In [345]:
employees_tbl = pd.DataFrame()
women_amount = random.randint(1, EMPLOYEES_AMOUNT)
men_amount = EMPLOYEES_AMOUNT - women_amount
employees_tbl["employee_id"] = np.arange( 1, EMPLOYEES_AMOUNT+1)
employees_tbl["first_name"] = np.concatenate( [np.random.choice(pl_first_names_w,int(women_amount)), np.random.choice(pl_first_names_m,int( men_amount ))])
employees_tbl["last_name"] = np.concatenate( [np.random.choice(pl_last_names_w,int(women_amount)), np.random.choice(pl_last_names_m,int( men_amount))])
employees_tbl["email"] = employees_tbl.apply(lambda row: generate_email(row), axis=1)
employees_tbl["phone_number"] = phone_numbers[-EMPLOYEES_AMOUNT-1: -1]
employees_tbl["birth_date"] = [fake.date_between(start_date = "-50y", end_date = "-20y").strftime("%d.%m.%Y") for _ in range(EMPLOYEES_AMOUNT)]
employees_tbl["start_work_date"] = [fake.date_between(start_date = "-80y", end_date = "-2y").strftime("%d.%m.%Y") for _ in range(EMPLOYEES_AMOUNT)]

In [346]:
employees_tbl

Unnamed: 0,employee_id,first_name,last_name,email,phone_number,birth_date,start_work_date
0,1,ŁUCJA,MICHALSKA,łucja.michalska@mail.com,616309335,12.11.1995,28.07.1953
1,2,BOGUMIŁA,MIKULSKA,bogumiła.mikulska@mail.com,413169893,28.01.1975,25.09.2009
2,3,KRYSTIAN,ROMANOWSKI,krystian.romanowski@mail.com,101808523,07.01.1976,22.12.1946
3,4,RYSZARD,STEC,ryszard.stec@mail.com,928068028,28.12.1975,06.02.2020


### Payoffs

Komunistycznie na razie wszyscy dostają tyle samo XD

In [347]:
payoff_tbl = pd.DataFrame()

## Turnieje

### Terminarz/ termines

Przy założeniu że jeden raz w miesiącu (np w pierwszy czwartek miesiąca)

In [348]:
termines_tbl = pd.DataFrame()
termines_tbl["tournament_id"] = range(1, 13)
first_thursdays = [(datetime.date(YEAR, month, 1) + datetime.timedelta(days=((4 - datetime.date(YEAR, month, 1).weekday()) % 7))).strftime("%d.%m.%y") for month in range(1, 13)]
termines_tbl["date"] = first_thursdays

In [349]:
termines_tbl.head()

Unnamed: 0,tournament_id,date
0,1,07.01.22
1,2,04.02.22
2,3,04.03.22
3,4,01.04.22
4,5,06.05.22


### Turnieje

Z gier które zostały wyznaczone dla sklepu losuję 5 gier, które będą grami turniejowymi. (typ = board game, max graczy >= 4)

In [358]:
#games_tbl[( games_tbl["Type"] == "boardgame") & ( games_tbl["Max Players"] >= 4) ]["game_id"].to_list()
tournament_games  = random.sample( games_tbl[( games_tbl["Type"] == "boardgame") & ( games_tbl["Max Players"] >= 4) ]["game_id"].to_list(), 5)

In [359]:
tournaments_tbl = pd.DataFrame()
tournaments_tbl["tournament_id"] = range(1, 13)
tournaments_tbl["game_id"] = random.choices(tournament_games, k = 12)
tournaments_tbl["max_players"] = 16 * 4
tournaments_tbl["entry_fee"] = 20
tournaments_tbl["prize"] = 150

In [360]:
tournaments_tbl.head()

Unnamed: 0,tournament_id,game_id,max_players,entry_fee,prize
0,1,74,64,20,150
1,2,60,64,20,150
2,3,98,64,20,150
3,4,65,64,20,150
4,5,60,64,20,150


### Wyniki

In [362]:
results_tbl = pd.DataFrame()
tournament_ids = [64 * [i] for i in tournaments_tbl["tournament_id"]]
results_tbl["tournament_id"] = [i for i in chain.from_iterable(tournament_ids)]
results_tbl["position"] = [place for place  in range(1,65)] * 12
results = [random.sample(customers_tbl["customer_id"].to_list(), k = 64) for _ in range(1, 13)]
results_tbl["customer_id"] = [i for i in chain.from_iterable(results)]
results_tbl.insert(0,"result_id", range(1,len(results_tbl)+1))
results_tbl.head()

Unnamed: 0,result_id,tournament_id,position,customer_id
0,1,1,1,248
1,2,1,2,335
2,3,1,3,343
3,4,1,4,840
4,5,1,5,150


### Rentals

In [363]:
def generate_list_with_occurrences(numbers, occurrences):
    result = [str(number )for number, occurrence in zip(numbers, occurrences) for _ in range(occurrence)]
    return result

def generate_date_from_day_number(row):
    return datetime.datetime.strptime(str(YEAR) + "-" + row["day_num"], "%Y-%j").strftime("%d.%m.%Y")

def generate_return_date(row):
    rental_date = datetime.datetime.strptime(row["rental_date"], "%d.%m.%Y")
    return_date = rental_date  + datetime.timedelta(days = np.random.poisson(2) + 1 )

    return  return_date.strftime("%d.%m.%Y") if return_date.year == rental_date.year else None

def rental_duration(row):
    try:
        rental_date = datetime.datetime.strptime(row["rental_date"], "%d.%m.%Y")
        return_date = datetime.datetime.strptime(row["return_date"], "%d.%m.%Y")
    except TypeError:
        return None
    return ( return_date - rental_date).days 

In [386]:
rentals_tbl = pd.DataFrame()
day_of_year = np.arange(1, pd.Timestamp(YEAR, 12, 31).dayofyear + 1)
rent_daily = [np.random.poisson(3) for _ in range(pd.Timestamp(YEAR, 12, 31).dayofyear)]
rentals_tbl["customer_id"] =  random.choices(customers_tbl["customer_id"].to_list(), k = np.sum(rent_daily) )
rentals_tbl["game_id"] = random.choices(range(1,GAMES_AMOUNT+1), k = np.sum(rent_daily) )
rentals_tbl["day_num"]= generate_list_with_occurrences(day_of_year, rent_daily)
rentals_tbl["rental_date"] = rentals_tbl.apply(lambda row: generate_date_from_day_number(row), axis=1)
rentals_tbl["return_date"] = rentals_tbl.apply(lambda row: generate_return_date(row), axis=1)
rentals_tbl = rentals_tbl.drop("day_num", axis = 1)
rentals_tbl.insert(0, "rental_id", np.arange(1, rentals_tbl.shape[0]+1))

In [387]:
rentals_tbl

Unnamed: 0,rental_id,customer_id,game_id,rental_date,return_date
0,1,330,19,01.01.2022,04.01.2022
1,2,77,94,01.01.2022,05.01.2022
2,3,787,10,01.01.2022,04.01.2022
3,4,679,8,01.01.2022,05.01.2022
4,5,50,53,01.01.2022,05.01.2022
...,...,...,...,...,...
1134,1135,253,88,30.12.2022,
1135,1136,703,99,30.12.2022,
1136,1137,441,58,30.12.2022,
1137,1138,785,49,31.12.2022,


In [372]:
rentals_tbl["return_date"] = rentals_tbl.apply(lambda row: generate_return_date(row), axis=1)

In [373]:
rentals_tbl["duration"] = rentals_tbl.apply(lambda row: rental_duration(row), axis=1)


In [374]:
def check_back_to_back(list):
    return any(list[i] == list[i+1]  == True for i in range(len(list) - 1))

def how_many_Trues_in_row(list):
    max_true_count = 0
    current_true_count = 0
    for value in list:
        if value:
            current_true_count += 1
        else:
            max_true_count = max(max_true_count, current_true_count)
            current_true_count = 0
    return max(max_true_count, current_true_count)

In [388]:
inventory_amount = []
for game_id in range(1, GAMES_AMOUNT+1):
    temp_df = rentals_tbl[rentals_tbl["game_id"] == game_id]
    rental_dates = pd.to_datetime(temp_df['rental_date'])[1:].to_list()
    return_dates = pd.to_datetime(temp_df['return_date'])[:-1].to_list()
    dates_overlap = [rental_dates[i] < return_dates[i] for i in range(len(rental_dates))]
    inv  = how_many_Trues_in_row(dates_overlap)
    inventory_amount.append(inv + 2)

game_amount_dict = dict(zip(range(1, GAMES_AMOUNT+1), inventory_amount))

In [390]:
inventory_rent_tbl = pd.DataFrame()
inventory_rent_tbl["game_id"] = generate_list_with_occurrences(range(1, GAMES_AMOUNT+1), inventory_amount)
inventory_rent_tbl["inventory_id"] = np.arange(1, inventory_rent_tbl.shape[0] + 1)
inventory_rent_tbl["type"] = np.repeat("R", inventory_rent_tbl.shape[0])

In [392]:
inventory_rent_tbl.head()

Unnamed: 0,game_id,inventory_id,type
0,1,1,R
1,1,2,R
2,1,3,R
3,1,4,R
4,1,5,R


### Sales

In [403]:
sales_tbl = pd.DataFrame()
day_of_year = np.arange(1, pd.Timestamp(YEAR, 12, 31).dayofyear + 1)
sold_daily = [np.random.poisson(4) for _ in range(pd.Timestamp(YEAR, 12, 31).dayofyear)]
sales_tbl["customer_id"] =  random.choices(customers_tbl["customer_id"].to_list(), k = np.sum(sold_daily) )
sales_tbl["game_id"] = random.choices(range(1,GAMES_AMOUNT+1), k = np.sum(sold_daily) )
sales_tbl["day_num"]= generate_list_with_occurrences(day_of_year, sold_daily)
sales_tbl["date"] = sales_tbl.apply(lambda row: generate_date_from_day_number(row), axis=1)
sales_tbl = sales_tbl.drop("day_num", axis = 1)

In [404]:
sales_tbl

Unnamed: 0,customer_id,game_id,day_num,date
0,970,28,1,01.01.2022
1,184,23,1,01.01.2022
2,949,74,1,01.01.2022
3,226,28,2,02.01.2022
4,351,81,2,02.01.2022
...,...,...,...,...
1455,375,73,364,30.12.2022
1456,568,50,364,30.12.2022
1457,906,71,364,30.12.2022
1458,331,80,365,31.12.2022
