In [114]:
import numpy as np
import pandas as pd

In [115]:
# Rooms.csv 
# PRIMARY key: nominal data
sixth_floor = np.arange(101, 120)
fifth_floor = np.arange(201, 220)
forth_floor = np.arange(301, 320)
third_floor = np.arange(401, 420)
second_floor = np.arange(501, 520)
first_floor = np.arange(601, 620)
ground_floor = np.arange(701, 720)
lower_floor = np.arange(801, 820)

# room_numbers = np.array([lower_floor, ground_floor, first_floor, second_floor, third_floor, forth_floor, fifth_floor, sixth_floor])
room_numbers = np.concatenate([lower_floor, ground_floor, first_floor, second_floor, third_floor, forth_floor, fifth_floor, sixth_floor])

# Nominal data
room_type = ['Single', 'Double', 'Suite']
room_type = np.random.choice(room_type, room_numbers.size)

# Ordinal data

room_qualities= ['Standard', 'Superior Room', 'Deluxe Room', 'Executive Room', 'Junior Suite', 'Suite', 'Penthouse Suite']
room_type_level = []

for room in room_numbers:
    if 501 <= room <= 820:
        # Randomly select from 'Standard' to 'Deluxe Room' for 5th to lower floors
        quality = np.random.choice(room_qualities[0:3])
    elif 301 <= room <= 420:
        # Randomly select from 'Executive Room' to 'Junior Suite' for 3rd and 4th floors
        quality = np.random.choice(room_qualities[3:5])
    else:
        # Randomly select from 'Suite' to 'Penthouse Suite' for 1st, 2nd, ground, and lower ground floors
        quality = np.random.choice(room_qualities[5:7])
    room_type_level.append(quality)

# Ratio data
price_per_night = []
for room in room_type_level:
    if room == 'Standard':
        price = 400
    elif room == 'Superior Room':
        price = 600
    elif room == 'Deluxe Room':
        price = 800
    elif room == 'Executive Room':
        price = 1000
    elif room == 'Junior Suite':
        price = 1200
    elif room == 'Suite':
        price = 1500
    else:
        price = 1900
    price_per_night.append(price)

rooms = pd.DataFrame({
    'RoomNumber': room_numbers,
    'RoomType': room_type,
    'RoomTypeLevel': room_type_level,
    'PricePerNight': price_per_night
})
rooms.to_csv('Rooms.csv', index=False)

rooms.head(20)

Unnamed: 0,RoomNumber,RoomType,RoomTypeLevel,PricePerNight
0,801,Suite,Superior Room,600
1,802,Suite,Superior Room,600
2,803,Double,Deluxe Room,800
3,804,Double,Deluxe Room,800
4,805,Single,Deluxe Room,800
5,806,Single,Deluxe Room,800
6,807,Single,Standard,400
7,808,Suite,Deluxe Room,800
8,809,Suite,Superior Room,600
9,810,Suite,Standard,400


In [116]:
# Guest.csv
n_guests = 1000
# PRIMARY KEY: Nominal data
guest_id = [0,]
while len(guest_id) < n_guests:
    guest_id = np.unique(np.random.randint(202400000, 202600000, n_guests))

# Nominal data
name = np.loadtxt('random-names/random_names.csv', delimiter=',', unpack=True, dtype=str)
name = name[1:]

# Nominal data
last_random_10_numbers = np.random.randint(7300000000, 7400000000, n_guests, dtype=np.int64)
contact_number = [str(last_random_10_numbers[i]).zfill(11) for i in range(n_guests)]

guests = pd.DataFrame({
    'GuestID': guest_id,
    'Name': name,
    'ContactNumber': contact_number
})

guests.head(20)

Unnamed: 0,GuestID,Name,ContactNumber
0,202400190,Ellia Baker,7307968020
1,202400449,Edwin Hawkins,7314397841
2,202400457,Madaline Cunningham,7360327548
3,202400570,Samantha Hill,7347193401
4,202400636,Julia Bennett,7305166465
5,202400647,Eddy Stevens,7389252051
6,202400704,Hailey Stewart,7363036298
7,202400949,Michelle Richards,7378283923
8,202401134,Rubie Wright,7323680848
9,202401284,James Stevens,7332454606


In [117]:
# Booking.csv
n_booking = 1000
# PRIMARY KEY: nominal
booking_id = [0,]
while len(booking_id) < n_booking:
    booking_id = np.unique(np.random.randint(69000000, 73000000, n_booking))

# FOREIGN KEY: Nominal 
#guest_id
# room_numbers
room_numbers_booking = np.random.choice(rooms['RoomNumber'], n_booking)

# Interval data

# checkin
checkin_year = np.random.randint(2024, 2026, n_booking)
checkin_month = np.random.randint(1,13,n_booking)
checkin_day = np.random.randint(1,29,n_booking)

checkin_date = [f'{year[i]}-{str(checkin_month[i]).zfill(2)}-{str(checkin_day[i]).zfill(2)}' for i in range(n_booking)]


#checkout
checkout_date = []
for i in range(n_booking):
    checkin_y = checkin_year[i]
    checkin_m = checkin_month[i]
    checkin_d = checkin_day[i]

    # Adding 1 to 14 days to the check-in date for checkout
    additional_days = np.random.randint(1, 15)

    # Calculate checkout day, adjusting for month and year boundaries
    checkout_d = checkin_d + additional_days
    checkout_m = checkin_m
    checkout_y = checkin_y

    # Handling overflow of days beyond the current month
    if checkout_d > 28:
        checkout_d -= 28
        checkout_m += 1

    # Handling overflow of months beyond the year
    if checkout_m > 12:
        checkout_m -= 12
        checkout_y += 1

    # Format the checkout date
    checkout_date_str = f'{checkout_y}-{str(checkout_m).zfill(2)}-{str(checkout_d).zfill(2)}'
    checkout_date.append(checkout_date_str)
bookings = pd.DataFrame({
    'BookingID': booking_id,
    'GuestID': guest_id,
    'RoomNumber': room_numbers_booking,
    'CheckInDate': checkin_date,
    'CheckOutDate': checkout_date
})

In [118]:
bookings.head()

Unnamed: 0,BookingID,GuestID,RoomNumber,CheckInDate,CheckOutDate
0,69001014,202400190,308,2024-07-17,2025-07-21
1,69004150,202400449,815,2025-02-04,2024-02-14
2,69012430,202400457,803,2024-12-07,2024-12-18
3,69028199,202400570,518,2024-02-11,2024-02-23
4,69029791,202400636,817,2024-01-19,2024-02-04


In [119]:
# Insert missing values
n = 20

random_indices = np.random.choice(bookings.index, n, replace=False)

bookings.loc[random_indices, 'CheckOutDate'] = np.nan

In [120]:
bookings.isnull().sum()

BookingID        0
GuestID          0
RoomNumber       0
CheckInDate      0
CheckOutDate    20
dtype: int64

In [165]:
bookings.to_csv('Bookings.csv', index=False)

In [157]:
# Insert duplicate values
n = 5

selected_indices = guests.sample(n=n).index

guest_duplicates = pd.concat([guests, guests.loc[selected_indices]], ignore_index=True)

# Shuffle the rows
guest_duplicates = guest_duplicates.sample(frac=1, random_state=42).reset_index(drop=True)

# Print information
print(f"Random Indices: {selected_indices}")
print(f"Shape with duplicates: {guest_duplicates.shape}")

guest_duplicates.tail()

Random Indices: Index([156, 19, 873, 273, 331], dtype='int64')
Shape with duplicates: (1005, 3)


Unnamed: 0,GuestID,Name,ContactNumber
1000,202419972,Rebecca Kelley,7382515642
1001,202455834,Adelaide Kelley,7312046883
1002,202572120,Maddie Dixon,7329228453
1003,202488832,Valeria Jones,7310219980
1004,202418320,Walter Owens,7304700196


In [163]:
guest_duplicates.to_csv('Guests.csv', index=False)