<a href="https://colab.research.google.com/github/lukaszplust/Projects/blob/main/moje_sbd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import os
import itertools
import math
from random import randint

In [127]:
# BUFFER_SIZE - określa liczbę rekordów, które będą przechowywane w buforze w jednym momencie.
# Buforowanie poprawia wydajność operacji wejścia-wyjścia, minimalizując liczbę operacji odczytu i zapisu na dysk poprzez grupowanie ich w większe bloki.

# zakładam, ze blok to 512 bajtów, wiec wychodzi po 32 rekordy
BUFFER_SIZE = 32
#dlaczego rekord to 16 bitów?
# to chyba dlatego co jest nizej

# TO NADANE PRZEZ POLECENIE
# SET_BYTES_SIZE - określa rozmiar w bajtach pojedynczego rekordu bez dodatkowych znaków.
SET_BYTES_SIZE = 15

# Ta stała określa rozmiar w bajtach pojedynczego rekordu, włączając dodatkowy znak (nowa linia lub znak końca rekordu).
# Zazwyczaj jest to znak '\n' (nowa linia), który oddziela rekordy w pliku tekstowym
RECORD_BYTES_SIZE = SET_BYTES_SIZE + 1

# Ta stała określa całkowity rozmiar bufora w bajtach.
# Jest to iloczyn liczby rekordów w buforze (BUFFER_SIZE) i rozmiaru jednego rekordu w bajtach (RECORD_BYTES_SIZE).
# Bufor o takim rozmiarze będzie używany do operacji odczytu i zapisu blokowego, co zwiększa efektywność przez minimalizację liczby operacji I/O
BYTES_BUFFER_SIZE = BUFFER_SIZE * RECORD_BYTES_SIZE

In [186]:
class Record:

  def __init__(self, elements):
    self.elements = elements

  def __repr__(self):
    return f"Set {sorted(self.elements, reverse=True)}"

  @staticmethod
  def load_from_ints(ints):
    return Record(ints[1: ints[0] + 1])


  def save_to_ints(self):
    result = [len(self.elements)] + self.elements
    return result + [0] * (RECORD_BYTES_SIZE - len(result))

  def __lt__(self, other):
    if other is None:
        return True

    self_elements_copy = self.elements[:]
    other_elements_copy = other.elements[:]

    for elem in self.elements_copy:
      if elem in other_elements_copy:
        self_elements_copy.remove(elem)
        other_elements_copy.remove(elem)


        # Jeśli po usunięciu wspólnych elementów lista elementów drugiego obiektu (other_items_copy) jest pusta, oznacza to,
        # że wszystkie jego elementy były wspólne z obecnym obiektem, więc obecny obiekt (self) nie jest mniejszy od drugiego obiektu (False).
    if len(other_elements_copy) == 0:
      return False

        # Jeśli lista elementów obecnego obiektu (self_items_copy) jest pusta, oznacza to,
        # że wszystkie jego elementy były wspólne z drugim obiektem, więc obecny obiekt (self) jest mniejszy od drugiego obiektu (True)
    elif len(self_elements_copy) == 0:
      return True

        # Jeśli obie listy zawierają jeszcze jakieś elementy, metoda porównuje największe pozostałe elementy z obu list
        # (s_max dla obecnego obiektu i o_max dla drugiego obiektu).
    s_max = max(self_elements_copy)
    o_max = max(other_elements_copy)

    return o_max > s_max

In [187]:
class ReadBuffer:

  def __init__(self, path):

    self.path = path
    self.read_possition = 0
    self.file_possition = 0
    self.size = BUFFER_SIZE
    self.file_size = os.path.getsize(path)
    # liczba rekordów obecnie załadowanych do bufora
    self.loaded_size = 0
    self.buffer = []
    self.disk_reads_count = 0
    self.load_next()
  def __iter__(self):
    return self

  def __next__(self):
    if (next_record := self.read_next()) is None:
        raise StopIteration
    return next_record

  def check_more(self):
    # sprawdzam czy pozycja odczytu w buforze jest mniejsza niz liczba rekordów obecnie załadowanych do bufora
    # lub
    # pozycja odczytu w buforze jest mniejsza niż całkowity rozmiar pliku
    return (self.read_possition < self.loaded_size or self.file_possition < self.file_size)

  def see_next(self):
    # pozycja odczytu w pliku = liczba rekordów obecnie załadowanych do bufora
    if self.read_possition == self.loaded_size:
        return None
    return self.buffer[self.read_possition]

  # ODCZYT REKORDÓW: Metoda read_next zwraca następny rekord
  def read_next(self):
    # sprawdzam, czy są jeszcze rekordy do odczytu
    if not self.check_more():
        return None

    # jeśli są jakieś rekordy do odczytu to zwracam następny rekord z bufora
    record = self.buffer[self.read_possition]
    self.read_possition += 1

    # jeśli pozycja odczytu osiągnie rozmiar bufora
    if self.read_possition == self.size:
        # ładuje kolejną porcję danych do bufora (self.load_next())
        self.load_next()
        # resetuje self.read_possition
        self.read_possition = 0

    return record

  def load_next(self):
    self.buffer = []
    with open(self.path, "rb", buffering =0) as f:
      # seek() function is used to change the position of
      # the File Handle to a given specific position
      # przesuwam wskaźnik odczytu pliku do self.file_pos
      f.seek(self.file_possition)

      bytes_to_read = min(BYTES_BUFFER_SIZE, self.file_size - self.file_possition)

      temporrary_buffer = f.read(bytes_to_read)

      if len(temporrary_buffer) % RECORD_BYTES_SIZE != 0:
        raise Exception("Read bytes are not multiply of record size")

      self.file_possition += bytes_to_read
      self.loaded_size = bytes_to_read / RECORD_BYTES_SIZE
      temporrary_ints = list(temporrary_buffer)

      # dodaje liste rekordów do bufora (self.buffer)
      for i in range(len(temporrary_buffer) // RECORD_BYTES_SIZE):
          record_ints = temporrary_ints[
                        RECORD_BYTES_SIZE * i:RECORD_BYTES_SIZE * (i + 1)
                        ]
          self.buffer.append(Record.load_from_ints(record_ints))

      # zamykam plik i zwiększam licznik operacji odczytu z dysku (self.disk_reads_count)
      f.close()
      self.disk_reads_count += 1


In [188]:
class WriteBuffer:

  def __init__(self, path, append_mode=False):

    self.possition = 0
    self.buffer = [None] * BUFFER_SIZE

    self.path = path
    self.size = BUFFER_SIZE

    self.series_written = 0
    self.last_written = None
    self.disk_writes_count = 0


  def save_next(self):
    ints_to_write = []

    for record in self.buffer[0:self.possition]:
      ints_to_write += record.save_to_ints()

    with open(self.path,"ab", buffering = 0) as f:
      f.write(bytearray(ints_to_write))
      f.close()
      self.disk_writes_count += 1


  def flush(self):
    if self.possition > 0:
      self.save_next()
      self.possition = 0

  def write_next(self, record):

    if record < self.last_written:
      self.series_written += 1

    if self.possition == self.size:
      self.flush()

    self.buffer[self.possition] = record
    self.possition += 1
    self.last_written = record

In [189]:
class Iterator:

  def __init__(self, read_buffer):
    self.read_buffer = read_buffer
    self.current_record = None
    self.end_of_series = False

  def __iter__(self):
    return self

  def __next__(self):
    record = self.read_next()
    if record is None:
      raise StopIteration
    return record

  def read_next(self):
    if self.end_of_series:
      return None

    self.current_record = self.read_buffer.read_next()
    if self.current_record is None:
      return None

    next_record = self.read_buffer.see_next()

    if next_record is not None and next_record < self.current_record:
      self.end_of_series = True

    return self.current_record

In [190]:
def print_tape(name):
  print(f'Tape: {name}\n')

  buffer = ReadBuffer(name)
  series_counter = 0
  records_counter = 0

  while buffer.check_more():
    iterator = Iterator(buffer)
    for r in iterator:
      print(r)
      records_counter +=1
    series_counter +=1
    print("---- series end ----")

  print(f'Series count: {series_counter}')
  print(f'Records count: {records_counter}')

def series_print(name, x):
  print(f'First {x} series from {name}')

  buffer = ReadBuffer(name)
  for i in range(x):
    print(f'Series {i}:')

    iterator = Iterator(buffer)

    for r in iterator:
      print(r)

def series_count(name):
  series_counter = 0
  buffer = ReadBuffer(name)
  while buffer.check_more():
    iterator = Iterator(buffer)
    for _ in Iterator(buffer):
      pass
    series_counter +=1
  return series_counter

In [191]:
def split(first_tape, second_tape,tape):

  t1_buffer = ReadBuffer(tape)
  t2_buffer = WriteBuffer(first_tape)
  t3_buffer = WriteBuffer(second_tape)

  last_record = t1_buffer.read_next()
  t2_buffer.write_next(last_record)

  destination_buffer = t2_buffer

  for r in t1_buffer:
    if r < last_record:
      destination_buffer = t3_buffer if destination_buffer == t2_buffer else t2_buffer

    (t2_buffer if destination_buffer == t2_buffer else t3_buffer).write_next(r)
    last_record = r

  t2_buffer.flush()
  t3_buffer.flush()

  #dodane
  print("Tasma 1: ")
  print(t2_buffer)
  print_tape(first_tape)
  print("Tasma 2 :")
  print_tape(second_tape)

  return MetaInfo(t1_buffer.disk_reads_count,
  # t2_buffer.disk_writes_count + t3_buffer.disk_writes_count: Łączna liczba operacji zapisu na dysku dla obu buforów
  t2_buffer.disk_writes_count + t3_buffer.disk_writes_count,
  # t2_buffer.series_written + t3_buffer.series_written: Łączna liczba sekwencji posortowanych (runs) zapisanych do obu buforów
  t2_buffer.series_written + t3_buffer.series_written)

In [192]:
def series_merge(series_1, series_2, write_buffer: WriteBuffer):
  current_ser1 = series_1.read_next()
  current_ser2 = series_2.read_next()

  while current_ser1 is not None and current_ser2 is not None:
    if current_ser1 < current_ser2:

      write_buffer.write_next(current_ser1)
      current_ser1 =  series_1.read_next()
    else:

      write_buffer.write_next(current_ser2)
      current_ser2 = series_2.read_next()

  for series in (series_1, series_2):
    current = current_ser1 if series == series_1 else current_ser2
    if current is not None:
      write_buffer.write_next(current)
      for remaining_records in series:
        write_buffer.write_next(remaining_records)

In [193]:
def merge(first_tape, second_tape, tape):
  t1_buffer = WriteBuffer(tape)

  t2_buffer = ReadBuffer(first_tape)# tasma 1
  t3_buffer = ReadBuffer(second_tape)# tasma 2

  while t2_buffer.check_more() and t3_buffer.check_more():
    series_merge(Iterator(t2_buffer), Iterator(t3_buffer), t1_buffer)

  for buffer in (t2_buffer, t3_buffer):
    for r in buffer:
        t1_buffer.write_next(r)

  t1_buffer.flush()

  return MetaInfo(t2_buffer.disk_reads_count + t3_buffer.disk_reads_count,
                  t1_buffer.disk_writes_count,
                  t1_buffer.series_written)

In [194]:
def prepare_tapes():

  tape1 = WriteBuffer("tapes/t1")

  for r in ReadBuffer("tapes/start_tape"):
    tape1.write_next(r)

  tape1.flush()

  for tape in ("t2", "t3"):
    path = f"tapes/{tape}"
    if os.path.isfile(path):
        os.remove(path)

In [195]:
def tape_sort(tape, print_after_phase = False):

  phases_counter = 0
  series_written = 0

  reads_count = 0
  writes_count = 0

  while series_written != 1:

    split_information = split("tapes/t2", "tapes/t3", tape)
    merge_information = merge("tapes/t2", "tapes/t3", tape)

    series_written = merge_information.series_count

    reads_count += split_information.reads_count
    reads_count +=merge_information.reads_count

    writes_count += split_information.writes_count
    writes_count += merge_information.writes_count

    if print_after_phase:
      print(f'Phase: {phases_counter + 1}')

      print_tape(tape)

  #print(f'{series_written} series remaining')
  phases_counter +=1


In [196]:
class MetaInfo:
    def __init__(self, reads_count, writes_count, seires_count):

        # reads_count: Liczba operacji odczytu z dysku
        self.reads_count = reads_count

        # writes_count: Liczba operacji zapisu na dysk
        self.writes_count = writes_count

        # runs_count: Liczba przebiegów (runs) wykonanych podczas sortowania
        self.series_count = series_count

In [197]:
tape = "tapes/t1"
test_file_path = "tapes/test"
options = "v"

In [198]:
write_buffer = WriteBuffer(tape, append_mode=True)
count = 0

with open(test_file_path) as test_file:
  for line in test_file:
    set_numbers = [int(s) for s in line.rstrip().split()]
    new_record = Record(set_numbers)
    write_buffer.write_next(new_record)
    count += 1
write_buffer.flush()
print(f'Added {count} records to tape')

AttributeError: 'Record' object has no attribute 'elements_copy'

In [185]:
#print(f"Displaying tape {tape}")
#print_tape(tape)

In [181]:
print(f"[ .. ] Sorting tape {tape}")
print(f"[ -v ] Displaying tape before sorting:")
print_tape(tape)

sort_info = tape_sort(tape)
print(f"Displaying tape after sorting:")
print_tape(tape)
print(f"Tape {tape} sorted!")
print(f"Sorting metadata:")
print(f"Phase count: {sort_info.phases_count}")

[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
Set [42]
Set [44]
Set [44]
Set [44]
Set [55]
Set [55]
Set [55]
Set [67]
Set [67]
Set [67]
Set [94]
Set [94]
Set [94]
---- series end ----
Set [6]
Set [6]
Set [6]
Set [12]
Set [12]
Set [12]
Set [18]
Set [18]
Set [42]
Set [42]
Set [42]
Set [44]
Set [44]
Set [44]
Set [55]
Set [55]
Set [55]
Set [67]
Set [67]
Set [67]
Set [94]
Set [94]
Set [94]
---- series end ----
Set [6]
Set [6]
Set [6]
Set [12]
Set [12]
Set [18]
Set [18]
Set [42]
Set [42]
Set [44]
Set [44]
Set [44]
Set [44]
Set [55]
Set [55]
Set [55]
Set [55]
Set [67]
Set [67]
Set [67]
Set [94]
Set [94]
---- series end ----
Set [6]
Set [6]
Set [6]
Set [6]
Set [12]
Set [12]
Set [12]
Set [12]
Set [18]
Set [18]
Set [42]
Set [42]
Set [42]
Set [42]
Set [44]
Set [44]
Set [44]
Set [55]
Set [55]
Set [55]
Set [67]
Set [67]
Set [67]
Set [67]
Set [94]
Set [94]
Set [94]
Set [94]
---- series end ----
Set [6]
Set [6]
Set [6]
Set [12]
Set [12]
Set [18]
Set [18]
Set [18]
Set 

KeyboardInterrupt: 