# Libraries

In [1]:
import re
import pandas as pd

# Load data

In [2]:
# Read the scraped system-requirements table; the first CSV column becomes the index
games_requirements_data = pd.read_csv(
    filepath_or_buffer='../tables/system_requirement_website_data.csv',
    index_col=0,
)

# Functions for standardizing GPU names

In [3]:
# Standardizes NVIDIA GPU names in a DataFrame column by extracting and formatting model information
def standardize_nvidia_gpus(df, column):
    """Rewrite ``df[column]`` so each cell holds one canonical NVIDIA GPU name.

    Each string cell is searched for the first NVIDIA-style model reference
    and replaced by ``"NVIDIA GeForce <SERIES> <number>[ <SUFFIX>]"``.
    Cells that are not strings, or contain no recognizable model, become
    ``None``.

    The series is taken from the text when it is written explicitly as
    GTX, RTX or GT, so "GTX 1060" stays a GTX card. Only when the match
    starts at another keyword (GeForce, Quadro, TITAN) is the series guessed
    from the model number: below 1000 -> GTX, otherwise RTX.

    Args:
        df: DataFrame to modify. The column is overwritten in place.
        column: Name of the column holding raw GPU descriptions.

    Returns:
        The same ``df`` object, to allow reassignment or chaining.
    """
    # Series keywords that are trusted verbatim when present in the text
    explicit_series = {"GTX", "RTX", "GT"}

    # Regex pattern to identify NVIDIA GPU models and components
    pattern = re.compile(r"""
        (?:NVIDIA\s*)?  # Optional "NVIDIA" prefix
        (GeForce|GTX|RTX|GT|Quadro|TITAN)\s*  # GPU series keyword (captured; GTX listed before GT)
        (?:[-®™]?\s*)?  # Optional special characters
        (\d{2,4})\s*  # Model number (e.g., 960, 1660, 3080)
        (SUPER|Ti|Laptop\sGPU|Max-Q)?  # Optional suffixes (SUPER, Ti, etc.)
    """, re.IGNORECASE | re.VERBOSE)

    # Inner function to process individual GPU strings
    def extract_and_standardize(gpu_str):
        if not isinstance(gpu_str, str):
            return None  # Missing values (NaN/None) and other non-strings

        # Remove "VIDEO CARD:" prefix if present
        gpu_str = re.sub(r"VIDEO CARD:\s*", "", gpu_str, flags=re.IGNORECASE)

        # Only the first model mentioned in the text is kept
        match = pattern.search(gpu_str)
        if match is None:
            return None

        series, number, suffix = match.groups()
        series = series.upper()
        if series not in explicit_series:
            # No explicit GTX/RTX/GT in the match: fall back to the numeric guess
            series = "GTX" if int(number) < 1000 else "RTX"

        name = f"NVIDIA GeForce {series} {number}"
        if suffix:
            # Suffixes are upper-cased so "Ti" and "TI" map to the same name
            name += f" {suffix.upper()}"
        return name

    # Apply standardization to specified column
    df[column] = df[column].apply(extract_and_standardize)

    return df

# Functions for standardizing CPU names

In [4]:
# Standardizes Intel CPU names in a DataFrame column by extracting and formatting processor information
def standardize_intel_cpus(df, column):
    """Rewrite ``df[column]`` so each cell holds one canonical Intel CPU name.

    Each string cell is searched first for a Core i-series model
    (output ``"Intel® Core™ i<series>-<model><SUFFIX>"``) and then for a
    Core 2 Duo/Quad model (output ``"Intel® Core™ 2 <Type> <MODEL>"``).
    Matching is case-insensitive, and the letter case of the captured parts
    is normalized so that e.g. "i5-2500k" and "i5-2500K" yield the same
    name. Cells that are not strings, or match neither pattern, become
    ``None``.

    Args:
        df: DataFrame to modify. The column is overwritten in place.
        column: Name of the column holding raw CPU descriptions.

    Returns:
        The same ``df`` object, to allow reassignment or chaining.
    """
    # Regex pattern for modern Intel Core processors (i3/i5/i7/i9)
    core_pattern = re.compile(r"""
        (?:Intel\s*Core\s*i?  # Base identifier
        |Core\s*i?            # Alternative without Intel prefix
        |Intel\s*i?)           # Short version
        (\d{1,2})              # Series digit(s) after the "i" (e.g., 3, 5, 7)
        [- ]?                  # Optional separator
        (\d+)                  # Model number (e.g., 10700, 12900)
        ([A-Za-z]*)            # Suffix letters (e.g., K, F, HK)
    """, re.IGNORECASE | re.VERBOSE)

    # Regex pattern for legacy Core 2 processors
    legacy_pattern = re.compile(r"""
        Intel\s*Core\s*2\s*   # Legacy processor line
        (Duo|Quad)\s*         # Processor type
        ([A-Za-z0-9]+)        # Model designation
    """, re.IGNORECASE | re.VERBOSE)

    # Inner function to process individual CPU strings
    def extract_and_standardize(cpu_str):
        if not isinstance(cpu_str, str):
            return None  # Missing values (NaN/None) and other non-strings

        # Drop trademark symbols so they cannot sit between pattern tokens
        cleaned_str = re.sub(r'[®™]', '', cpu_str)

        # Try modern Core processors first
        core_match = core_pattern.search(cleaned_str)
        if core_match:
            series, model, suffix = core_match.groups()
            # Upper-case the suffix: the match ignores case, the output must not vary with it
            return f'Intel® Core™ i{series}-{model}{suffix.upper()}'

        # Try legacy Core 2 processors
        legacy_match = legacy_pattern.search(cleaned_str)
        if legacy_match:
            type_, model = legacy_match.groups()
            # "duo"/"DUO" -> "Duo", "e8400" -> "E8400"
            return f'Intel® Core™ 2 {type_.capitalize()} {model.upper()}'

        return None

    # Apply standardization to specified column
    df[column] = df[column].apply(extract_and_standardize)

    return df

# Standardize RAM capacity function

In [5]:
# Function to normalize a RAM requirement string to the form "<amount> GB"
def standardize_ram(value):
    """Return ``value`` rewritten as ``"<amount> GB"``.

    The first number that is followed by a TB/GB/MB unit is used and
    converted to gigabytes (1 GB = 1024 MB), so "512 MB" becomes "0.5 GB"
    rather than "512 GB". If no number carries a unit, the first bare number
    is taken to be gigabytes. Only one amount is read, so
    "4 GB (8 GB recommended)" yields "4 GB" instead of the digits being
    concatenated.

    Non-string input (e.g. NaN from an empty CSV cell) and strings without
    any number are returned unchanged.
    """
    if not isinstance(value, str):
        return value  # Missing values must not crash the column-wide apply

    # Prefer a number that carries an explicit unit ("DDR4 16 GB" -> 16, not 4)
    sized = re.search(r"(\d+(?:\.\d+)?)\s*(TB|GB|MB)\b", value, flags=re.IGNORECASE)
    if sized:
        amount = float(sized.group(1))
        unit = sized.group(2).upper()
    else:
        bare = re.search(r"\d+(?:\.\d+)?", value)
        if bare is None:
            return value  # No number found: leave the text as-is
        amount, unit = float(bare.group()), "GB"

    to_gigabytes = {"TB": 1024, "GB": 1, "MB": 1 / 1024}
    # ":g" drops the trailing ".0" so whole amounts print as "8 GB"
    return f"{amount * to_gigabytes[unit]:g} GB"

# Apply all changes

In [6]:
# GPU columns: minimum first, then recommended
for gpu_column in ('gpu_minimum', 'gpu_recommended'):
    games_requirements_data = standardize_nvidia_gpus(games_requirements_data, gpu_column)

# CPU columns: minimum first, then recommended
for cpu_column in ('cpu_minimum', 'cpu_recommended'):
    games_requirements_data = standardize_intel_cpus(games_requirements_data, cpu_column)

# RAM columns: each cell is rewritten individually by standardize_ram
for ram_column in ('ram_minimum', 'ram_recommended'):
    games_requirements_data[ram_column] = games_requirements_data[ram_column].apply(standardize_ram)

# Store game requirements data

In [7]:
# Write the standardized table to disk, keeping the index as the first CSV column
games_requirements_data.to_csv(
    path_or_buf='../tables/games_requirement_data.csv',
    index=True,
)