In [1]:
import polars as pl
import numpy as np
import unicodedata
import string
import sqlalchemy as _sql
import sqlalchemy.ext.declarative as _declarative
import sqlalchemy.orm as _orm

In [2]:
# TODO: change to real database
# DATABASE_URL = "postgresql+psycopg2://db_user:password@db:5432/inzynierka_db"
# engine = _sql.create_engine(DATABASE_URL)
# SessionLocal = _orm.sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Base = _declarative.declarative_base()

# Data reading

In [3]:
class ReadData:
    
    def __init__(self, path):
        """
        Initialize with the path to the CSV file.
        """
        self.path = path    # Can be replaced by database connection later
        self.data = None
        
    def read_data(self):
        """
        Read data from the CSV file using polars.
        Returns a polars DataFrame.
        """
        # Define columns (as there are no headers in the CSV)
        columns = ["user_id", "game_name", "purchase_play", "hours_played", "0"]  # For now, assume no headers
        self.data = pl.read_csv(self.path, has_header=False, new_columns=columns)
        self.data = pl.DataFrame(self.data, strict=False)

        
        return self.data

In [4]:
path = "data/steam-200k.csv"
rd = ReadData(path)
data = rd.read_data()
data

user_id,game_name,purchase_play,hours_played,0
i64,str,str,f64,i64
151603712,"""The Elder Scrolls V Skyrim""","""purchase""",1.0,0
151603712,"""The Elder Scrolls V Skyrim""","""play""",273.0,0
151603712,"""Fallout 4""","""purchase""",1.0,0
151603712,"""Fallout 4""","""play""",87.0,0
151603712,"""Spore""","""purchase""",1.0,0
…,…,…,…,…
128470551,"""Titan Souls""","""play""",1.5,0
128470551,"""Grand Theft Auto Vice City""","""purchase""",1.0,0
128470551,"""Grand Theft Auto Vice City""","""play""",1.5,0
128470551,"""RUSH""","""purchase""",1.0,0


# Data Pre-processing

- Normalization (lowercasing, removing diacritics, punctuations)
- Tokenization (TBD?)
- Drop duplicates

In [5]:
import polars as pl
import unicodedata
import string
import re

class DataPreprocessing:
    
    def __init__(self, data):
        """
        Initialize the DataPreprocessor with the data.
        :param data: polars DataFrame containing the data to be processed.
        """
        self.data = data
        self.columns = None
        self.processed_data = data.clone()  # A copy of the data to avoid modifying the original
        
    def select_columns(self, columns):
        """
        Select the columns to apply preprocessing on.
        If 'all' is passed, all columns will be selected.
        :param columns: List of columns to be normalized, or 'all' to select all columns.
        """
        if columns[0] == 'all':
            # Select all columns in the DataFrame
            self.columns = self.data.columns
        else:
            # Otherwise, use the provided list of columns
            self.columns = columns
    
    def _ensure_non_numeric_string_columns(self):
        """
        Internal method to ensure that only non-numeric string columns are selected for string operations.
        """
        # Filter out non-string columns (Int64, Float64, etc.)
        self.columns = [col for col in self.columns if self.data.schema[col] == pl.Utf8]

    def lowercase(self):
        """
        Convert text to lowercase in the selected columns.
        """
        if self.columns is None:
            raise ValueError("No columns selected for preprocessing. Use select_columns method first.")
        
        try:
            self.processed_data = self.processed_data.with_columns([
                pl.col(col).str.to_lowercase() for col in self.columns
            ])
        except Exception as e:
            print(f"Error applying lowercase operation: {e}")
    
    def remove_diacritics(self):
        """
        Remove diacritics from text in the selected columns.
        """
        if self.columns is None:
            raise ValueError("No columns selected for preprocessing. Use select_columns method first.")
        
        def _remove_diacritics(text):
            return ''.join(
                c for c in unicodedata.normalize('NFKD', text) 
                if unicodedata.category(c) != 'Mn'
            )
        
        try:
            # Apply the diacritics removal function using map_elements
            self.processed_data = self.processed_data.with_columns([
                pl.col(col).map_elements(_remove_diacritics, return_dtype=pl.Utf8).alias(col) for col in self.columns
            ])
        except Exception as e:
            print(f"Error removing diacritics: {e}")

    def remove_punctuation(self):
        """
        Remove punctuation from text in the selected columns.
        """
        if self.columns is None:
            raise ValueError("No columns selected for preprocessing. Use select_columns method first.")

        # Ensure only string columns are processed
        self._ensure_non_numeric_string_columns()

        # Escape the square brackets in the regex to avoid parsing errors
        punctuation_pattern = re.escape(string.punctuation)

        try:
            # Apply punctuation removal only to string columns
            self.processed_data = self.processed_data.with_columns([
                pl.col(col).str.replace_all(f"[{punctuation_pattern}]", '').alias(col) for col in self.columns
            ])
        except Exception as e:
            print(f"Error removing punctuation: {e}")
    
    def drop_duplicates(self):
        """
        Drop exact duplicates across all columns in the DataFrame. This is a mandatory step.
        """
        try:
            self.processed_data = self.processed_data.unique()
        except Exception as e:
            print(f"Error dropping duplicates: {e}")

    def apply_preprocessing(self, lowercase=False, diacritics_removal=False, punctuation_removal=False):
        """
        Apply preprocessing steps based on user selection.
        The order is: lowercase -> diacritics removal -> punctuation removal -> drop exact duplicates.
        :param lowercase: If True, apply lowercasing to the selected columns.
        :param diacritics_removal: If True, remove diacritics from the selected columns.
        :param punctuation_removal: If True, remove punctuation from the selected columns.
        :return: Preprocessed polars DataFrame.
        """
        try:
            # Apply preprocessing steps based on user input
            if punctuation_removal:
                self.remove_punctuation()
            
            # Ensure non-numeric string columns are processed
            self._ensure_non_numeric_string_columns()
            
            if lowercase:
                self.lowercase()
            
            if diacritics_removal:
                self.remove_diacritics()
            
            # Drop exact duplicates as the mandatory last step
            self.drop_duplicates()
        
        except Exception as e:
            print(f"Error during preprocessing: {e}")
        
        return self.processed_data
    
    def get_processed_data(self):
        """
        Return the preprocessed data.
        :return: Preprocessed polars DataFrame.
        """
        return self.processed_data


In [7]:
preprocessor = DataPreprocessing(data)

preprocessor.select_columns(columns=['all'])
preprocessor.apply_preprocessing(lowercase=True, diacritics_removal=True, punctuation_removal=True)
data = preprocessor.get_processed_data()
data

user_id,game_name,purchase_play,hours_played,0
i64,str,str,f64,i64
104727643,"""counterstrike global offensive""","""purchase""",1.0,0
34901647,"""brtal legend""","""play""",14.7,0
62990992,"""counterstrike""","""purchase""",1.0,0
190061555,"""dota 2""","""play""",0.8,0
53875128,"""jazzpunk""","""purchase""",1.0,0
…,…,…,…,…
47672756,"""call of duty modern warfare 2 …","""play""",8.3,0
38465050,"""lego marvel super heroes""","""play""",53.0,0
66255019,"""dr langeskov the tiger and the…","""purchase""",1.0,0
181010210,"""dota 2""","""play""",0.3,0


# Block building

- Sorted Neighborhood Method (SNM)
- Standard Blocking Method


In [51]:
import polars as pl

class BlockBuilding:
    
    def __init__(self, data, method):
        """
        Initialize the BlockBuilder with data and method.
        :param data: polars DataFrame containing the entity data.
        :param method: The blocking method to use ('sorted_neighborhood' or 'standard_blocking').
        """
        self.data = data
        self.method = method
        self.blocks = None
        self.num_blocks = 0

    def build_blocks(self, window_size=None, columns=None, block_index=1):
        """
        Main function to build blocks using the selected method.
        :param window_size: Window size for sorted neighborhood method.
        :param columns: List of columns to generate the sorting key (for sorted neighborhood) or blocking key (for standard blocking).
        :param block_index: Index of the block to display (optional).
        :return: A specific block based on block_index.
        """
        # Ensure the required columns and parameters are provided
        if columns is None:
            raise ValueError("You must specify the columns for generating sorting or blocking keys.")
        
        # Step 1: Perform block building based on the method selected
        if self.method == 'sorted_neighborhood':
            if window_size is None:
                raise ValueError("Window size must be provided for the sorted neighborhood method.")
            self.sorted_neighborhood(window_size, columns)
        elif self.method == 'standard_blocking':
            self.standard_blocking(columns)
        else:
            raise ValueError("Invalid method. Use 'sorted_neighborhood' or 'standard_blocking'.")

        # Optional Step 2: Display a specific block (e.g., block_index)
        return self.display_block(block_index)

    def sorted_neighborhood(self, window_size, columns):
        """
        Build blocks using the Sorted Neighborhood Method (SNM).
        :param window_size: The size of the sliding window.
        :param columns: List of columns to generate the sorting key.
        """
        # Step 1: Ensure all columns are cast to strings before concatenation
        self.blocks = self.data.clone()

        # Step 2: Use map_elements to concatenate the columns and generate the sorting key
        self.blocks = self.blocks.with_columns(
            pl.struct(columns).map_elements(lambda x: " ".join([str(v) for v in x]), return_dtype=pl.Utf8).alias('sorting_key')
        )

        # Step 3: Sort records by sorting key (SKV)
        self.blocks = self.blocks.sort('sorting_key')

        # Step 4: Generate block IDs with correct length
        num_rows = self.blocks.height
        block_ids = [i // window_size + 1 for i in range(num_rows)]

        # Assign block IDs to the records
        self.blocks = self.blocks.with_columns([pl.Series('block_id', block_ids)])

        # Update the number of blocks
        self.num_blocks = len(set(block_ids))


    def standard_blocking(self, columns: list):
        """
        Perform standard blocking on the data using the specified columns.
        Group the data by unique values in the specified columns and assign block IDs.
        
        :param columns: List of column names to group by for blocking.
        :return: DataFrame with block_id and blocking_index columns added.
        """
        
        # Step 1: Create a distinct DataFrame with unique combinations of the grouping columns
        distinct_groups = self.data.select(columns).unique()
        
        # Step 2: Assign a unique block_id and create blocking_index for each group
        distinct_groups = distinct_groups.with_row_index("block_id")  # Assign block IDs
        distinct_groups = distinct_groups.with_columns([
            pl.concat_list([pl.col(col) for col in columns]).alias("blocking_index")
        ])
        
        self.blocks = self.data.join(distinct_groups, on=columns, how='left').sort('block_id')
        

    def display_block(self, block_index=1):
        """
        Display a specific block by block_id.
        :param block_index: The index of the block to display.
        :return: DataFrame containing the specified block.
        """
        if self.blocks is None:
            raise ValueError("No blocks have been generated. Run block building first.")
        
        return self.blocks.filter(pl.col('block_id') == block_index)
    
    def get_blocks(self):
        """
        Return all generated blocks.
        :return: DataFrame containing all blocks.
        """
        if self.blocks is None:
            raise ValueError("No blocks have been generated. Run block building first.")
        
        return self.blocks


In [71]:
block_builder = BlockBuilding(data, method='standard_blocking')
blocks = block_builder.build_blocks(window_size=10, columns=['game_name', 'purchase_play'])
all_blocks = block_builder.get_blocks()
all_blocks
block_builder.display_block(667)

user_id,game_name,purchase_play,hours_played,0,block_id,blocking_index
i64,str,str,f64,i64,u32,list[str]
69009454,"""the book of unwritten tales""","""play""",0.3,0,667,"[""the book of unwritten tales"", ""play""]"
32467994,"""the book of unwritten tales""","""play""",38.0,0,667,"[""the book of unwritten tales"", ""play""]"
31669242,"""the book of unwritten tales""","""play""",3.2,0,667,"[""the book of unwritten tales"", ""play""]"
42657809,"""the book of unwritten tales""","""play""",10.3,0,667,"[""the book of unwritten tales"", ""play""]"
11373749,"""the book of unwritten tales""","""play""",15.4,0,667,"[""the book of unwritten tales"", ""play""]"
124437057,"""the book of unwritten tales""","""play""",21.0,0,667,"[""the book of unwritten tales"", ""play""]"
156379077,"""the book of unwritten tales""","""play""",0.5,0,667,"[""the book of unwritten tales"", ""play""]"
23225650,"""the book of unwritten tales""","""play""",0.6,0,667,"[""the book of unwritten tales"", ""play""]"
84455820,"""the book of unwritten tales""","""play""",0.3,0,667,"[""the book of unwritten tales"", ""play""]"


# Comparison cleaning
### Tu zrobimy porównania oparte o prawdopodobieństwie na danych kolumnach, np. fuzzy string matching, jakieś grafy prawdopodobieństwa, idk