# Libraries

In [1]:
import re
import pandas as pd

# Load data

In [2]:
# Game system-requirements table (first CSV column becomes the index)
games_requirement_data = pd.read_csv(
    '../tables/system_requirement_website_data.csv',
    index_col=0,
)

# Benchmark score table (first CSV column becomes the index)
benchmark_score_data = pd.read_csv(
    '../tables/benchmark_score_data.csv',
    index_col=0,
)

# Functions to standardize GPU names

In [3]:
def standardize_nvidia_gpus(df, column):
    """Rewrite a column of free-text GPU descriptions as canonical NVIDIA names.

    Each string cell is searched for the first NVIDIA model it mentions and
    replaced by ``NVIDIA <family> <number>[ <suffix>]``, for example
    ``NVIDIA GeForce GTX 1050 Ti`` or ``NVIDIA Quadro 4000``. Cells that are
    not strings, or that mention no recognised NVIDIA model, become ``None``.

    Args:
        df: DataFrame holding the column to rewrite. It is modified in place.
        column: Name of the column containing the GPU descriptions.

    Returns:
        The same ``df`` object, with ``column`` replaced by the standardized
        names.
    """
    # Regex pattern to identify NVIDIA GPU models and components
    pattern = re.compile(r"""
        (?:NVIDIA\s*)?          # Optional "NVIDIA" prefix
        (?:GeForce\s+)?         # Optional "GeForce" followed by whitespace
        (GTX|RTX|GT|Quadro|TITAN)\s*  # Captured model family (GTX, RTX, etc.)
        (?:[-®™]?\s*)?          # Optional special characters
        (\d{2,4})               # Numerical part (e.g., 9800, 1660)
        \s*                     # Optional whitespace after number
        (?:
            (SUPER|Ti|Laptop\sGPU|Max-Q)  # Optional suffix (case-insensitive)
            (?![A-Za-z])        # Reject a suffix that is only the start of a longer word
        )?
    """, re.IGNORECASE | re.VERBOSE)

    # Canonical label per captured family. GeForce families keep their series
    # tag; Quadro and TITAN are product lines themselves, so the family name
    # must not be repeated after the product line.
    family_labels = {
        'GTX': 'GeForce GTX',
        'RTX': 'GeForce RTX',
        'GT': 'GeForce GT',
        'QUADRO': 'Quadro',
        'TITAN': 'TITAN',
    }

    # Canonical spelling of every suffix the pattern can capture.
    suffix_labels = {
        'ti': 'Ti',
        'super': 'SUPER',
        'laptop gpu': 'Laptop GPU',
        'max-q': 'Max-Q',
    }

    def extract_and_standardize(gpu_str):
        # Missing values (NaN/None) and other non-text cells cannot be parsed.
        if not isinstance(gpu_str, str):
            return None

        gpu_str = re.sub(r"VIDEO CARD:\s*", "", gpu_str, flags=re.IGNORECASE)

        # Only the first model mentioned in the text is kept.
        match = pattern.search(gpu_str)
        if match is None:
            return None

        series, number, suffix = match.groups()
        name = f"NVIDIA {family_labels[series.upper()]} {number}"

        if suffix:
            # Collapse any whitespace variant (e.g. a tab in "Laptop GPU")
            # before looking up the canonical spelling.
            suffix_key = ' '.join(suffix.lower().split())
            name += f" {suffix_labels[suffix_key]}"

        return name

    df[column] = df[column].apply(extract_and_standardize)
    return df

# Functions to standardize CPU names

In [4]:
def standardize_intel_cpus(df, column):
    """Rewrite a column of free-text CPU descriptions as canonical Intel names.

    Each string cell is searched first for a Core i3/i5/i7/i9 model and then
    for a Core 2 Duo/Quad model. A hit is replaced by
    ``Intel® Core™ i<tier>-<model><SUFFIX>`` or
    ``Intel® Core™ 2 <Type> <MODEL>`` respectively, with letter case
    normalised so differently-typed spellings of one CPU compare equal.
    Cells that are not strings, or that match neither pattern, become
    ``None``.

    Args:
        df: DataFrame holding the column to rewrite. It is modified in place.
        column: Name of the column containing the CPU descriptions.

    Returns:
        The same ``df`` object, with ``column`` replaced by the standardized
        names.
    """
    # Regex pattern for modern Intel Core processors (i3/i5/i7/i9)
    core_pattern = re.compile(r"""
        (?:Intel\s*Core\s*i?  # Base identifier
        |Core\s*i?            # Alternative without Intel prefix
        |Intel\s*i?)          # Short version
        ([3579])              # Brand tier digit (i3, i5, i7, i9); a single digit
                              # so it cannot swallow the start of the model number
        [- ]?                 # Optional separator
        (\d{3,5})(?!\d)       # Model number (e.g., 920, 10700, 12900); at least
                              # three digits so a clock speed like 2.5 is not a model
        ([A-Za-z]*)           # Suffix letters (e.g., K, F, HK)
    """, re.IGNORECASE | re.VERBOSE)

    # Regex pattern for legacy Core 2 processors
    legacy_pattern = re.compile(r"""
        Intel\s*Core\s*2\s*   # Legacy processor line
        (Duo|Quad)\s*         # Processor type
        ([A-Za-z0-9]+)        # Model designation
    """, re.IGNORECASE | re.VERBOSE)

    # Inner function to process individual CPU strings
    def extract_and_standardize(cpu_str):
        # Missing values (NaN/None) and other non-text cells cannot be parsed.
        if not isinstance(cpu_str, str):
            return None

        # Drop trademark symbols so "Intel® Core™" matches like "Intel Core".
        cleaned_str = re.sub(r'[®™]', '', cpu_str)

        # Try modern Core processors first
        core_match = core_pattern.search(cleaned_str)
        if core_match:
            tier, model, suffix = core_match.groups()
            # Matching is case-insensitive, so normalise the suffix case.
            return f'Intel® Core™ i{tier}-{model}{suffix.upper()}'

        # Try legacy Core 2 processors
        legacy_match = legacy_pattern.search(cleaned_str)
        if legacy_match:
            type_, model = legacy_match.groups()
            return f'Intel® Core™ 2 {type_.title()} {model.upper()}'

        return None

    # Apply standardization to specified column
    df[column] = df[column].apply(extract_and_standardize)

    return df

# Function to standardize RAM capacity

In [5]:
# Amount directly followed by an explicit unit, e.g. "8 GB", "512MB", "1.5 gb"
_RAM_WITH_UNIT = re.compile(r"(\d+(?:\.\d+)?)\s*([MGT])B", re.IGNORECASE)
# First bare number, used when the text carries no recognisable unit
_RAM_BARE_NUMBER = re.compile(r"\d+(?:\.\d+)?")
# Multiplier that converts an amount in the given unit to gigabytes
_GB_PER_UNIT = {'M': 1 / 1024, 'T': 1024}


def standardize_ram(value):
    """Normalise a free-text RAM requirement to the form ``"<amount> GB"``.

    The first amount that carries an explicit MB/GB/TB unit is used; MB and
    TB amounts are converted to gigabytes. If no unit is present, the first
    number in the text is taken to be gigabytes. Only that one amount is
    kept, so extra numbers in the text do not leak into the result.

    Args:
        value: Cell content, normally a string such as ``"8 GB RAM"``.

    Returns:
        The normalised string, or ``value`` unchanged when it is not a
        string (e.g. NaN for a missing cell) or contains no number.
    """
    # Missing cells arrive as float NaN / None; leave them untouched.
    if not isinstance(value, str):
        return value

    with_unit = _RAM_WITH_UNIT.search(value)
    if with_unit:
        amount = with_unit.group(1)
        unit = with_unit.group(2).upper()
        if unit == 'G':
            # Already gigabytes: keep the digits exactly as written.
            return f"{amount} GB"
        return f"{float(amount) * _GB_PER_UNIT[unit]:g} GB"

    bare = _RAM_BARE_NUMBER.search(value)
    return f"{bare.group()} GB" if bare else value

# Apply all changes

In [6]:
# GPU names: both requirement columns, plus the benchmark table's name column
for gpu_column in ('gpu_minimum', 'gpu_recommended'):
    games_requirement_data = standardize_nvidia_gpus(games_requirement_data, gpu_column)
benchmark_score_data = standardize_nvidia_gpus(benchmark_score_data, 'gpu_name')

# CPU names: both requirement columns
for cpu_column in ('cpu_minimum', 'cpu_recommended'):
    games_requirement_data = standardize_intel_cpus(games_requirement_data, cpu_column)

# RAM capacity: both requirement columns, converted cell by cell
for ram_column in ('ram_minimum', 'ram_recommended'):
    games_requirement_data[ram_column] = games_requirement_data[ram_column].apply(standardize_ram)

# Store game requirements and benchmark data

In [7]:
# Write both standardized tables to CSV, keeping the index column.
# The benchmark table is written back to the same path it was loaded from.
for frame, destination in (
    (games_requirement_data, '../tables/games_requirement_data.csv'),
    (benchmark_score_data, '../tables/benchmark_score_data.csv'),
):
    frame.to_csv(destination, index=True)