In [2]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

def load_rfq_data():
    """Load and perform initial inspection of RFQ and reference data."""
    
    # Define paths
    task2_dir = Path("../../resources/task_2")
    rfq_path = task2_dir / "rfq.csv"
    reference_path = task2_dir / "reference_properties.tsv"
    
    # Load datasets
    print("Loading RFQ data...")
    rfq_df = pd.read_csv(rfq_path)
    print(f"RFQ data shape: {rfq_df.shape}")
    
    print("\nLoading reference properties...")
    reference_df = pd.read_csv(reference_path, sep='\t')
    print(f"Reference data shape: {reference_df.shape}")
    
    # Basic data inspection  
    print(f"\nUnique grades in RFQ: {rfq_df['grade'].nunique()}")
    print(f"Missing grades in RFQ: {rfq_df['grade'].isnull().sum()}")
    
    print(f"\nUnique grades in Reference: {reference_df['Grade/Material'].nunique()}")
    
    return rfq_df, reference_df

# Execute the loading function
if __name__ == "__main__":
    rfq_df, reference_df = load_rfq_data()
    
    # Display first few rows to understand structure
    # print("\nFirst 3 RFQ rows:")
    # print(rfq_df.head(3))
    
    # print("\nFirst 3 Reference rows:")
    # print(reference_df.head(3))

Loading RFQ data...
RFQ data shape: (1000, 25)

Loading reference properties...
Reference data shape: (175, 34)

Unique grades in RFQ: 158
Missing grades in RFQ: 59

Unique grades in Reference: 175


In [6]:
def normalize_grades(rfq_df, reference_df):
    """
    Normalize grade keys and handle grade matching between RFQ and reference data.
    
    Task B.1: Normalize grade keys (case, suffixes, aliases) and join RFQs with reference.
    """
    
    print("=== TASK B.1: Grade Normalization and Reference Join ===")
    
    # Create working copies
    rfq_work = rfq_df.copy()
    ref_work = reference_df.copy()
    
    # Step 1: Normalize grade formats
    def clean_grade(grade):
        """Clean and normalize grade strings."""
        if pd.isna(grade):
            return None
        
        # Convert to string, strip whitespace, uppercase
        grade = str(grade).strip().upper()
        
        # Remove common prefixes/suffixes that might cause mismatches
        # Remove trailing numbers that might be variants (e.g., S235JR -> S235)
        # But be careful not to remove essential numbers (e.g., keep S700MC as is)
        
        return grade
    
    # Apply grade normalization
    rfq_work['grade_normalized'] = rfq_work['grade'].apply(clean_grade)
    ref_work['grade_normalized'] = ref_work['Grade/Material'].apply(clean_grade)
    
    # Step 2: Analyze grade matching
    print(f"\nGrade Analysis:")
    print(f"Unique normalized grades in RFQ: {rfq_work['grade_normalized'].nunique()}")
    print(f"Unique normalized grades in Reference: {ref_work['grade_normalized'].nunique()}")
    
    # Find overlap and missing grades
    rfq_grades = set(rfq_work['grade_normalized'].dropna())
    ref_grades = set(ref_work['grade_normalized'])
    
    common_grades = rfq_grades.intersection(ref_grades)
    rfq_missing_in_ref = rfq_grades - ref_grades
    ref_not_in_rfq = ref_grades - rfq_grades
    
    print(f"\nGrades found in both datasets: {len(common_grades)}")
    print(f"RFQ grades missing in reference: {len(rfq_missing_in_ref)}")
    print(f"Reference grades not in RFQ: {len(ref_not_in_rfq)}")
    
    if rfq_missing_in_ref:
        print(f"\nSample RFQ grades missing in reference: {list(rfq_missing_in_ref)[:10]}")
    
    # Step 3: Create grade mapping for better matching
    # Handle common aliases and variants
    def create_grade_mapping(rfq_grades, ref_grades):
        """Create mapping for grade aliases and variants."""
        mapping = {}
        
        # Direct matches first
        for grade in rfq_grades:
            if grade in ref_grades:
                mapping[grade] = grade
        
        # Handle common aliases (you can expand this based on domain knowledge)
        alias_rules = {
            # Add specific mappings if you know them
            # Example: 'S235' might map to 'S235JR'
        }
        
        # Try to match truncated versions
        for rfq_grade in rfq_grades:
            if rfq_grade not in mapping:
                # Try to find partial matches
                for ref_grade in ref_grades:
                    if rfq_grade in ref_grade or ref_grade in rfq_grade:
                        # Be careful with partial matching - only if reasonable
                        if len(rfq_grade) >= 4:  # Avoid matching very short strings
                            mapping[rfq_grade] = ref_grade
                            break
        
        return mapping
    
    grade_mapping = create_grade_mapping(rfq_grades, ref_grades)
    print(f"\nSuccessful grade mappings created: {len(grade_mapping)}")
    
    # Step 4: Apply mapping and join
    rfq_work['grade_mapped'] = rfq_work['grade_normalized'].map(grade_mapping)
    
    # Join RFQ with reference data
    enriched_rfq = rfq_work.merge(
        ref_work, 
        left_on='grade_mapped', 
        right_on='grade_normalized',
        how='left',
        suffixes=('', '_ref')
    )
    
    print(f"\nJoin Results:")
    print(f"Total RFQ records: {len(rfq_work)}")
    print(f"Records with reference data: {enriched_rfq['Grade/Material'].notna().sum()}")
    print(f"Records missing reference data: {enriched_rfq['Grade/Material'].isna().sum()}")
    
    return enriched_rfq, grade_mapping

# Execute the normalization and joining
enriched_rfq, grade_mapping = normalize_grades(rfq_df, reference_df)

# Display results
print(f"\nEnriched dataset shape: {enriched_rfq.shape}")
# Save the enriched dataset to a CSV file
enriched_rfq.to_csv('enriched_dataset.csv', index=False)
print("Enriched dataset has been saved as 'enriched_dataset.csv'")

=== TASK B.1: Grade Normalization and Reference Join ===

Grade Analysis:
Unique normalized grades in RFQ: 156
Unique normalized grades in Reference: 173

Grades found in both datasets: 156
RFQ grades missing in reference: 0
Reference grades not in RFQ: 17

Successful grade mappings created: 156

Join Results:
Total RFQ records: 1000
Records with reference data: 946
Records missing reference data: 59

Enriched dataset shape: (1005, 62)
Enriched dataset has been saved as 'enriched_dataset.csv'


In [12]:
import re
import pandas as pd
from difflib import get_close_matches

def normalize_grades(rfq_df, reference_df):
    """
    Normalize grade keys and handle grade matching between RFQ and reference data.
    
    Task B.1: Normalize grade keys (case, suffixes, aliases) and join RFQs with reference.
    FIXED: Eliminates duplicate rows and reduces redundant columns.
    """
    
    print("=== TASK B.1: Grade Normalization and Reference Join ===")
    
    # Create working copies
    rfq_work = rfq_df.copy()
    ref_work = reference_df.copy()
    
    # Step 1: Normalize grade formats
    def clean_grade(grade):
        """Automatically clean and normalize grade strings."""
        if pd.isna(grade):
            return None
        
        grade = str(grade).strip().upper()
        
        # Remove delivery condition suffixes (+N, +QT, +C)
        grade = re.sub(r"\+.*$", "", grade)
        
        # Remove spaces and dashes
        grade = grade.replace(" ", "").replace("-", "")
        
        # Normalize Werkstoffnummer like "1.2343" (keep consistent format)
        if re.match(r"^\d\.\d{3,4}$", grade):
            return grade
        
        # DX grades: add missing "D" at end if not present
        if re.match(r"^DX\d{2}$", grade):
            grade = grade + "D"
        
        return grade
    
    # Apply grade normalization
    rfq_work['grade_normalized'] = rfq_work['grade'].apply(clean_grade)
    ref_work['grade_normalized'] = ref_work['Grade/Material'].apply(clean_grade)
    
    # Step 1.5: DEDUPLICATE REFERENCE DATA to prevent duplicate joins
    print(f"Reference data before deduplication: {len(ref_work)} rows")
    
    def select_best_reference(group):
        """Select the best reference entry when multiple exist for same normalized grade."""
        if len(group) == 1:
            return group.iloc[0]
        
        # Preference: exact match to normalized grade, then shortest original name
        original_upper = group['Grade/Material'].str.upper().str.replace(' ', '').str.replace('-', '')
        exact_matches = group[original_upper == group['grade_normalized']]
        
        if len(exact_matches) > 0:
            return exact_matches.iloc[0]
        
        # Otherwise, pick shortest name (likely has fewer suffixes)
        sorted_group = group.loc[group['Grade/Material'].str.len().idxmin()]
        return sorted_group
    
    ref_deduplicated = ref_work.groupby('grade_normalized', group_keys=False).apply(select_best_reference).reset_index(drop=True)
    
    print(f"Reference data after deduplication: {len(ref_deduplicated)} rows")
    print(f"Removed {len(ref_work) - len(ref_deduplicated)} duplicate reference entries")
    
    # Step 2: Analyze grade matching
    print(f"\nGrade Analysis:")
    print(f"Unique normalized grades in RFQ: {rfq_work['grade_normalized'].nunique()}")
    print(f"Unique normalized grades in Reference: {ref_deduplicated['grade_normalized'].nunique()}")
    
    rfq_grades = set(rfq_work['grade_normalized'].dropna())
    ref_grades = set(ref_deduplicated['grade_normalized'].dropna())
    
    common_grades = rfq_grades.intersection(ref_grades)
    rfq_missing_in_ref = rfq_grades - ref_grades
    ref_not_in_rfq = ref_grades - rfq_grades
    
    print(f"\nGrades found in both datasets: {len(common_grades)}")
    print(f"RFQ grades missing in reference: {len(rfq_missing_in_ref)}")
    print(f"Reference grades not in RFQ: {len(ref_not_in_rfq)}")
    
    if rfq_missing_in_ref:
        print(f"\nSample RFQ grades missing in reference: {list(rfq_missing_in_ref)[:10]}")
    
    # Step 3: Create grade mapping automatically
    def create_grade_mapping(rfq_grades, ref_grades):
        """Auto-match RFQ grades to reference grades using fuzzy + substring matching."""
        mapping = {}
        
        for g in rfq_grades:
            if g in ref_grades:
                mapping[g] = g
            else:
                # Try fuzzy close matches
                matches = get_close_matches(g, ref_grades, n=1, cutoff=0.8)
                if matches:
                    mapping[g] = matches[0]
                else:
                    # Try substring containment
                    for ref_g in ref_grades:
                        if g in ref_g or ref_g in g:
                            mapping[g] = ref_g
                            break
        return mapping
    
    grade_mapping = create_grade_mapping(rfq_grades, ref_grades)
    print(f"\nSuccessful grade mappings created: {len(grade_mapping)}")
    
    # Step 4: Apply mapping and join (now guaranteed 1:1 mapping)
    rfq_work['grade_mapped'] = rfq_work['grade_normalized'].map(grade_mapping)
    
    enriched_rfq = rfq_work.merge(
        ref_deduplicated, 
        left_on='grade_mapped', 
        right_on='grade_normalized',
        how='left',
        suffixes=('', '_ref')
    )
    
    # Step 5: Clean up redundant columns and add validation
    original_count = len(rfq_work)
    final_count = len(enriched_rfq)
    
    print(f"\nJoin Results:")
    print(f"Original RFQ records: {original_count}")
    print(f"Final enriched records: {final_count}")
    print(f"Records with reference data: {enriched_rfq['Grade/Material'].notna().sum()}")
    print(f"Records missing reference data: {enriched_rfq['Grade/Material'].isna().sum()}")
    
    # Validation check
    if final_count > original_count:
        print(f"WARNING: Still have {final_count - original_count} duplicate rows!")
        print("This shouldn't happen with the deduplication fix.")
    else:
        print("SUCCESS: No duplicate rows created!")
    
    # Clean up redundant columns to reduce confusion
    # Keep only essential grade columns
    columns_to_drop = ['grade_normalized_ref']  # This is redundant with grade_mapped
    for col in columns_to_drop:
        if col in enriched_rfq.columns:
            enriched_rfq = enriched_rfq.drop(columns=[col])
    
    print(f"\nFinal dataset shape: {enriched_rfq.shape}")
    return enriched_rfq, grade_mapping

# Execute the normalization and joining
enriched_rfq, grade_mapping = normalize_grades(rfq_df, reference_df)

# Display results
print(f"\nEnriched dataset shape: {enriched_rfq.shape}")
# Save the enriched dataset to a CSV file
enriched_rfq.to_csv('enriched_dataset.csv', index=False)
print("Fixed enriched dataset has been saved as 'enriched_dataset.csv'")

=== TASK B.1: Grade Normalization and Reference Join ===
Reference data before deduplication: 175 rows
Reference data after deduplication: 164 rows
Removed 11 duplicate reference entries

Grade Analysis:
Unique normalized grades in RFQ: 147
Unique normalized grades in Reference: 164

Grades found in both datasets: 147
RFQ grades missing in reference: 0
Reference grades not in RFQ: 17

Successful grade mappings created: 147

Join Results:
Original RFQ records: 1000
Final enriched records: 1000
Records with reference data: 941
Records missing reference data: 59
SUCCESS: No duplicate rows created!

Final dataset shape: (1000, 61)

Enriched dataset shape: (1000, 61)
Fixed enriched dataset has been saved as 'enriched_dataset.csv'


In [23]:
# Parse Range Strings into Numeric Values

def parse_range_strings(enriched_df):
    """
    Parse range strings into numeric min/max values (and optionally mid).
    
    Handles formats like: '≤0.17', '360-510 MPa', '≥235 MPa', etc.
    """
    
    print("=== STEP 3: Range String Parsing ===")
    
    df_work = enriched_df.copy()
    
    def parse_range_value(value_str):
        """
        Parse a range string and return (min_val, max_val, mid_val).
        Returns (None, None, None) if unparseable.
        """
        if pd.isna(value_str) or value_str == '':
            return None, None, None
        
        # Convert to string and clean
        value_str = str(value_str).strip()
        
        # Remove units (MPa, %, HB, HV, HRC, etc.)
        clean_str = re.sub(r'[A-Za-z%°]', '', value_str)
        clean_str = re.sub(r'\s+', ' ', clean_str).strip()
        
        try:
            # Pattern 1: ≤X or <=X (upper bound)
            if '≤' in value_str or '<=' in value_str:
                max_val = float(re.findall(r'[\d.]+', clean_str)[0])
                mid_val = max_val / 2 
                return None, max_val, mid_val
            
            # Pattern 2: ≥X or >=X (lower bound)  
            elif '≥' in value_str or '>=' in value_str:
                min_val = float(re.findall(r'[\d.]+', clean_str)[0])
                return min_val, None, None
            
            # Pattern 3: X-Y or X–Y (range)
            elif '-' in clean_str or '–' in clean_str:
                numbers = re.findall(r'[\d.]+', clean_str)
                if len(numbers) >= 2:
                    min_val = float(numbers[0])
                    max_val = float(numbers[1])
                    mid_val = (min_val + max_val) / 2
                    return min_val, max_val, mid_val
            
            # Pattern 4: Single number
            else:
                numbers = re.findall(r'[\d.]+', clean_str)
                if numbers:
                    val = float(numbers[0])
                    return None, None, val
            
        except (ValueError, IndexError):
            pass
        
        return None, None, None
    
    # Identify reference property columns that need parsing
    chemical_props = ['Carbon (C)', 'Manganese (Mn)', 'Silicon (Si)', 'Sulfur (S)', 
                     'Phosphorus (P)', 'Chromium (Cr)', 'Nickel (Ni)', 'Molybdenum (Mo)',
                     'Vanadium (V)', 'Copper (Cu)', 'Aluminum (Al)', 'Titanium (Ti)',
                     'Niobium (Nb)', 'Boron (B)', 'Nitrogen (N)']
    
    mechanical_props = ['Tensile strength (Rm)', 'Yield strength (Re or Rp0.2)', 'Elongation (A%)']
    
    properties_to_parse = chemical_props + mechanical_props
    
    # Parse each property column
    parsed_count = 0
    for prop in properties_to_parse:
        if prop in df_work.columns:
            print(f"Parsing {prop}...")
            
            # Create new columns for min, max, mid values
            prop_clean = prop.replace('(', '').replace(')', '').replace(' ', '_').replace('/', '_')
            min_col = f"{prop_clean}_min"
            max_col = f"{prop_clean}_max" 
            mid_col = f"{prop_clean}_mid"
            
            # Apply parsing
            parsed_values = df_work[prop].apply(parse_range_value)
            
            df_work[min_col] = [x[0] for x in parsed_values]
            df_work[max_col] = [x[1] for x in parsed_values]
            df_work[mid_col] = [x[2] for x in parsed_values]
            
            # Count successful parses
            non_null_count = df_work[mid_col].notna().sum()
            total_non_null = df_work[prop].notna().sum()
            
            print(f"  Successfully parsed {non_null_count}/{total_non_null} values")
            parsed_count += 1
    
    print(f"Total properties parsed: {parsed_count}")
    
    return df_work

# Execute range parsing
enriched_with_ranges = parse_range_strings(enriched_rfq)

print(f"Final enriched dataset shape: {enriched_with_ranges.shape}")
# Save the enriched_with_ranges dataset to a CSV file
enriched_with_ranges.to_csv('enriched_with_ranges.csv', index=False)
print("Fixed enriched_with_ranges dataset has been saved as 'enriched_with_ranges.csv'")

=== STEP 3: Range String Parsing ===
Parsing Carbon (C)...
  Successfully parsed 940/941 values
Parsing Manganese (Mn)...
  Successfully parsed 940/941 values
Parsing Silicon (Si)...
  Successfully parsed 539/540 values
Parsing Sulfur (S)...
  Successfully parsed 929/929 values
Parsing Phosphorus (P)...
  Successfully parsed 929/929 values
Parsing Chromium (Cr)...
  Successfully parsed 63/63 values
Parsing Nickel (Ni)...
  Successfully parsed 22/22 values
Parsing Molybdenum (Mo)...
  Successfully parsed 41/41 values
Parsing Vanadium (V)...
  Successfully parsed 227/227 values
Parsing Copper (Cu)...
  Successfully parsed 0/0 values
Parsing Aluminum (Al)...
  Successfully parsed 136/540 values
Parsing Titanium (Ti)...
  Successfully parsed 230/230 values
Parsing Niobium (Nb)...
  Successfully parsed 229/229 values
Parsing Boron (B)...
  Successfully parsed 7/7 values
Parsing Nitrogen (N)...
  Successfully parsed 136/136 values
Parsing Tensile strength (Rm)...
  Successfully parsed 940/94

In [27]:
def engineer_similarity_features(enriched_df):
    """
    Create engineered features for similarity calculation.
    
    Task B.2: 
    - Dimensions: Represent as intervals, suggest overlap metrics
    - Categorical: Define exact match (1/0) for coating, finish, form, surface_type
    - Grade properties: Use numeric midpoints of ranges; drop very sparse features (<5% coverage)
    """
    
    print("=== TASK B.2: Feature Engineering ===")
    
    df_work = enriched_df.copy()
    
    # ========================================
    # 1. DIMENSION FEATURES (Interval-based)
    # ========================================
    print("\n1. Engineering Dimension Features...")
    
    dimension_pairs = [
        ('thickness_min', 'thickness_max'),
        ('width_min', 'width_max'),
        ('length_min', 'length_min'),  # Note: length only has min in data
        ('height_min', 'height_max'),
        ('weight_min', 'weight_max'),
        ('inner_diameter_min', 'inner_diameter_max'),
        ('outer_diameter_min', 'outer_diameter_max'),
        ('yield_strength_min', 'yield_strength_max'),
        ('tensile_strength_min', 'tensile_strength_max')
    ]
    
    def create_interval_features(df, min_col, max_col, feature_name):
        """Create interval representation for dimensions."""
        df[f"{feature_name}_interval_min"] = df[min_col]
        df[f"{feature_name}_interval_max"] = df[max_col].fillna(df[min_col])
        df[f"{feature_name}_center"] = (df[f"{feature_name}_interval_min"] + df[f"{feature_name}_interval_max"]) / 2
        df[f"{feature_name}_width"] = df[f"{feature_name}_interval_max"] - df[f"{feature_name}_interval_min"]
        return df
    
    for min_col, max_col in dimension_pairs:
        if min_col in df_work.columns and max_col in df_work.columns:
            feature_name = min_col.replace('_min', '').replace('_max', '')
            df_work = create_interval_features(df_work, min_col, max_col, feature_name)
            print(f"  Created interval features for {feature_name}")
    
    # ========================================
    # 2. CATEGORICAL FEATURES (Exact Match)
    # ========================================
    print("\n2. Engineering Categorical Features...")
    
    categorical_features = ['coating', 'finish', 'form', 'surface_type', 'surface_protection']
    
    for cat_feature in categorical_features:
        if cat_feature in df_work.columns:
            df_work[f"{cat_feature}_clean"] = df_work[cat_feature].fillna('Unknown').str.strip().str.upper()
            print(f"  Standardized {cat_feature}: {df_work[f'{cat_feature}_clean'].nunique()} unique values")
    
    # ========================================
    # 3. GRADE PROPERTY FEATURES (Midpoints) with sparsity filter
    # ========================================
    print("\n3. Engineering Grade Property Features...")
    
    chemical_features = [
        'Carbon_C_mid', 'Manganese_Mn_mid', 'Silicon_Si_mid', 
        'Sulfur_S_mid', 'Phosphorus_P_mid', 'Chromium_Cr_mid',
        'Nickel_Ni_mid', 'Molybdenum_Mo_mid', 'Vanadium_V_mid',
        'Aluminum_Al_mid', 'Titanium_Ti_mid', 'Niobium_Nb_mid',
        'Boron_B_mid', 'Nitrogen_N_mid'
    ]
    
    mechanical_features = [
        'Tensile_strength_Rm_mid', 'Yield_strength_Re_or_Rp0.2_mid', 'Elongation_A%_mid'
    ]
    
    grade_features = chemical_features + mechanical_features
    min_coverage = 5  # minimum % of rows required to keep a feature
    kept_features = []
    
    print("\n  Grade Properties Availability and Filtering (min 5% coverage):")
    for feature in grade_features:
        if feature in df_work.columns:
            non_null_count = df_work[feature].notna().sum()
            total_with_ref = df_work['Grade/Material'].notna().sum()
            coverage = non_null_count / total_with_ref * 100 if total_with_ref > 0 else 0
            if coverage >= min_coverage:
                kept_features.append(feature)
            else:
                df_work.drop(columns=[feature], inplace=True)
            print(f"    {feature}: {non_null_count}/{total_with_ref} ({coverage:.1f}%) {'kept' if coverage >= min_coverage else 'dropped'}")
    
    # ========================================
    # 4. OVERLAP METRIC FUNCTIONS
    # ========================================
    print("\n4. Defining Overlap Metrics...")
    
    def interval_overlap_ratio(min1, max1, min2, max2):
        if pd.isna(min1) or pd.isna(max1) or pd.isna(min2) or pd.isna(max2):
            return 0.0
        min1, max1 = min(min1, max1), max(min1, max1)
        min2, max2 = min(min2, max2), max(min2, max2)
        overlap = max(0, min(max1, max2) - max(min1, min2))
        union = max(max1, max2) - min(min1, min2)
        return overlap / union if union > 0 else 0.0
    
    def categorical_match(val1, val2):
        if pd.isna(val1) or pd.isna(val2):
            return 0.0
        return 1.0 if val1 == val2 else 0.0
    
    df_work.attrs['interval_overlap_ratio'] = interval_overlap_ratio
    df_work.attrs['categorical_match'] = categorical_match
    
    print("  ✓ Interval overlap ratio function defined")
    print("  ✓ Categorical match function defined")
    
    # ========================================
    # 5. SUMMARY OF ENGINEERED FEATURES
    # ========================================
    print(f"\n=== Feature Engineering Summary ===")
    print(f"Final dataset shape: {df_work.shape}")
    
    interval_features = [col for col in df_work.columns if '_interval_' in col or '_center' in col or '_width' in col]
    categorical_clean = [col for col in df_work.columns if '_clean' in col]
    property_features = [col for col in df_work.columns if '_mid' in col]
    
    print(f"Interval features created: {len(interval_features)}")
    print(f"Categorical features standardized: {len(categorical_clean)}")
    print(f"Property midpoint features available: {len(property_features)}")
    
    return df_work

# Execute feature engineering
feature_engineered_df = engineer_similarity_features(enriched_with_ranges)

# Save to CSV
feature_engineered_df.to_csv('feature_engineered_df.csv', index=False)
print("\nFixed feature_engineered_df dataset has been saved as 'feature_engineered_df.csv'")


=== TASK B.2: Feature Engineering ===

1. Engineering Dimension Features...
  Created interval features for thickness
  Created interval features for width
  Created interval features for length
  Created interval features for height
  Created interval features for weight
  Created interval features for inner_diameter
  Created interval features for outer_diameter
  Created interval features for yield_strength
  Created interval features for tensile_strength

2. Engineering Categorical Features...
  Standardized coating: 63 unique values
  Standardized finish: 47 unique values
  Standardized form: 18 unique values
  Standardized surface_type: 15 unique values
  Standardized surface_protection: 16 unique values

3. Engineering Grade Property Features...

  Grade Properties Availability and Filtering (min 5% coverage):
    Carbon_C_mid: 940/941 (99.9%) kept
    Manganese_Mn_mid: 940/941 (99.9%) kept
    Silicon_Si_mid: 539/941 (57.3%) kept
    Sulfur_S_mid: 929/941 (98.7%) kept
    Phosp

In [28]:
def calculate_rfq_similarity(df):
    """
    Calculate aggregate similarity scores between RFQs.
    
    Task B.3: Define aggregate similarity score and output top-3 most similar RFQs per line.
    """
    
    print("=== TASK B.3: Similarity Calculation ===")
    
    # Get overlap functions from dataframe attributes
    interval_overlap_ratio = df.attrs['interval_overlap_ratio']
    categorical_match = df.attrs['categorical_match']
    
    # ========================================
    # 1. DEFINE FEATURE GROUPS AND WEIGHTS
    # ========================================
    
    # Dimension features (interval-based)
    dimension_features = [
        ('thickness_interval_min', 'thickness_interval_max'),
        ('width_interval_min', 'width_interval_max'),
        ('weight_interval_min', 'weight_interval_max'),
        ('yield_strength_interval_min', 'yield_strength_interval_max'),
        ('tensile_strength_interval_min', 'tensile_strength_interval_max')
    ]
    
    # Categorical features (exact match)
    categorical_features = ['coating_clean', 'finish_clean', 'form_clean', 'surface_type_clean']
    
    # Grade property features (numeric - use dense features only)
    grade_property_features = [
        'Carbon_C_mid', 'Manganese_Mn_mid', 'Sulfur_S_mid', 
        'Phosphorus_P_mid', 'Tensile_strength_Rm_mid', 
        'Yield_strength_Re_or_Rp0.2_mid', 'Elongation_A%_mid'
    ]
    
    # Feature weights (can be tuned)
    weights = {
        'dimensions': 0.3,      # 30% weight for dimensional similarity
        'categorical': 0.3,     # 30% weight for categorical matches  
        'grade_properties': 0.4 # 40% weight for grade property similarity
    }
    
    print(f"Feature groups defined:")
    print(f"  Dimensions: {len(dimension_features)} features (weight: {weights['dimensions']})")
    print(f"  Categorical: {len(categorical_features)} features (weight: {weights['categorical']})")
    print(f"  Grade properties: {len(grade_property_features)} features (weight: {weights['grade_properties']})")
    
    # ========================================
    # 2. SIMILARITY CALCULATION FUNCTIONS
    # ========================================
    
    def calculate_dimension_similarity(row1, row2):
        """Calculate dimensional similarity using interval overlap."""
        similarities = []
        
        for min_col, max_col in dimension_features:
            if min_col in df.columns and max_col in df.columns:
                overlap = interval_overlap_ratio(
                    row1[min_col], row1[max_col],
                    row2[min_col], row2[max_col]
                )
                similarities.append(overlap)
        
        return np.mean(similarities) if similarities else 0.0
    
    def calculate_categorical_similarity(row1, row2):
        """Calculate categorical similarity using exact matches."""
        matches = []
        
        for cat_feature in categorical_features:
            if cat_feature in df.columns:
                match = categorical_match(row1[cat_feature], row2[cat_feature])
                matches.append(match)
        
        return np.mean(matches) if matches else 0.0
    
    def calculate_grade_property_similarity(row1, row2):
        """Calculate grade property similarity using normalized differences."""
        similarities = []
        
        for prop_feature in grade_property_features:
            if prop_feature in df.columns:
                val1, val2 = row1[prop_feature], row2[prop_feature]
                
                if pd.notna(val1) and pd.notna(val2):
                    # Normalize by the range of the feature to get 0-1 similarity
                    feature_range = df[prop_feature].max() - df[prop_feature].min()
                    if feature_range > 0:
                        normalized_diff = abs(val1 - val2) / feature_range
                        similarity = max(0, 1 - normalized_diff)  # Convert difference to similarity
                        similarities.append(similarity)
        
        return np.mean(similarities) if similarities else 0.0
    
    def calculate_aggregate_similarity(row1, row2):
        """Calculate weighted aggregate similarity score."""
        dim_sim = calculate_dimension_similarity(row1, row2)
        cat_sim = calculate_categorical_similarity(row1, row2)
        prop_sim = calculate_grade_property_similarity(row1, row2)
        
        aggregate_score = (
            weights['dimensions'] * dim_sim +
            weights['categorical'] * cat_sim +
            weights['grade_properties'] * prop_sim
        )
        
        return aggregate_score, dim_sim, cat_sim, prop_sim
    
    # ========================================
    # 3. CALCULATE PAIRWISE SIMILARITIES
    # ========================================
    
    print(f"\nCalculating pairwise similarities for {len(df)} RFQs...")
    
    # Filter to only RFQs with IDs (exclude NaN rows from join)
    valid_df = df[df['id'].notna()].copy().reset_index(drop=True)
    print(f"Valid RFQs for similarity calculation: {len(valid_df)}")
    
    # Initialize results list
    similarity_results = []
    
    # Calculate similarities (this might take a moment for 1000x1000 comparisons)
    print("Computing similarities...")
    
    batch_size = 50  # Process in batches to show progress
    total_comparisons = len(valid_df) * (len(valid_df) - 1)
    completed = 0
    
    for i in range(0, len(valid_df), batch_size):
        end_i = min(i + batch_size, len(valid_df))
        
        for idx1 in range(i, end_i):
            row1 = valid_df.iloc[idx1]
            rfq1_id = row1['id']
            
            # Store similarities for this RFQ
            rfq_similarities = []
            
            for idx2 in range(len(valid_df)):
                if idx1 != idx2:  # Exclude self-comparison
                    row2 = valid_df.iloc[idx2]
                    rfq2_id = row2['id']
                    
                    # Skip exact matching (same grade, same dimensions, etc.)
                    if (row1['grade'] == row2['grade'] and 
                        row1['thickness_center'] == row2['thickness_center'] and
                        row1['width_center'] == row2['width_center']):
                        continue
                    
                    agg_sim, dim_sim, cat_sim, prop_sim = calculate_aggregate_similarity(row1, row2)
                    
                    rfq_similarities.append({
                        'rfq_id': rfq1_id,
                        'match_id': rfq2_id,
                        'similarity_score': agg_sim,
                        'dimension_similarity': dim_sim,
                        'categorical_similarity': cat_sim,
                        'property_similarity': prop_sim
                    })
            
            # Get top-3 most similar for this RFQ
            if rfq_similarities:
                rfq_similarities.sort(key=lambda x: x['similarity_score'], reverse=True)
                top3 = rfq_similarities[:3]
                similarity_results.extend(top3)
            
            completed += len(valid_df) - 1
        
        # Progress update
        progress = (end_i / len(valid_df)) * 100
        print(f"  Progress: {progress:.1f}% ({end_i}/{len(valid_df)} RFQs processed)")
    
    print(f"✓ Completed similarity calculations")
    
    # ========================================
    # 4. CREATE RESULTS DATAFRAME  
    # ========================================
    
    results_df = pd.DataFrame(similarity_results)
    print(f"\nSimilarity results shape: {results_df.shape}")
    print(f"Average similarity score: {results_df['similarity_score'].mean():.3f}")
    print(f"Max similarity score: {results_df['similarity_score'].max():.3f}")
    
    # Show sample results
    print(f"\nTop 10 highest similarity pairs:")
    top_results = results_df.nlargest(10, 'similarity_score')
    print(top_results[['rfq_id', 'match_id', 'similarity_score', 'dimension_similarity', 
                      'categorical_similarity', 'property_similarity']])
    
    return results_df

# Execute similarity calculation
print("Starting similarity calculation...")
similarity_results = calculate_rfq_similarity(feature_engineered_df)

Starting similarity calculation...
=== TASK B.3: Similarity Calculation ===
Feature groups defined:
  Dimensions: 5 features (weight: 0.4)
  Categorical: 4 features (weight: 0.3)
  Grade properties: 7 features (weight: 0.3)

Calculating pairwise similarities for 1000 RFQs...
Valid RFQs for similarity calculation: 1000
Computing similarities...
  Progress: 5.0% (50/1000 RFQs processed)
  Progress: 10.0% (100/1000 RFQs processed)
  Progress: 15.0% (150/1000 RFQs processed)
  Progress: 20.0% (200/1000 RFQs processed)
  Progress: 25.0% (250/1000 RFQs processed)
  Progress: 30.0% (300/1000 RFQs processed)
  Progress: 35.0% (350/1000 RFQs processed)
  Progress: 40.0% (400/1000 RFQs processed)
  Progress: 45.0% (450/1000 RFQs processed)
  Progress: 50.0% (500/1000 RFQs processed)
  Progress: 55.0% (550/1000 RFQs processed)
  Progress: 60.0% (600/1000 RFQs processed)
  Progress: 65.0% (650/1000 RFQs processed)
  Progress: 70.0% (700/1000 RFQs processed)
  Progress: 75.0% (750/1000 RFQs process

In [33]:
# Step 6: Create Final Deliverable and Pipeline
def create_final_deliverable(similarity_results):
    """
    Create the final top3.csv deliverable as specified in Task B.3.
    Format: [rfq_id, match_id, similarity_score]
    """
    
    print("=== Creating Final Deliverable (top3.csv) ===")
    
    # Select only the required columns
    deliverable_df = similarity_results[['rfq_id', 'match_id', 'similarity_score']].copy()
    
    # Round similarity scores to 6 decimal places for clean output
    deliverable_df['similarity_score'] = deliverable_df['similarity_score'].round(6)
    
    # Sort by rfq_id first, then by similarity_score (highest first) for better organization
    deliverable_df = deliverable_df.sort_values(['rfq_id', 'similarity_score'], 
                                               ascending=[True, False])
    
    # Save to CSV
    output_path = Path("../../results/top3.csv")
    output_path.parent.mkdir(exist_ok=True)
    
    deliverable_df.to_csv(output_path, index=False)
    print(f"\n✓ Deliverable saved to: {output_path}")
    print(f"✓ File contains {len(deliverable_df)} similarity pairs")
    
    return deliverable_df

def create_summary_report(similarity_results, enriched_df):
    """Create a summary report of the RFQ similarity analysis."""
    
    print(f"\n=== FINAL SUMMARY REPORT ===")
    print(f"Task B: RFQ Similarity Analysis - COMPLETED")
    print(f"=" * 50)
    
    # Data processing summary
    print(f"\n📊 DATA PROCESSING SUMMARY:")
    print(f"   • Original RFQ records: 1,000")
    print(f"   • RFQs with reference data: {enriched_df['Grade/Material'].notna().sum()}")
    print(f"   • Grade properties parsed: 18 properties")
    print(f"   • Feature engineering: 157 total features")
    
    # Similarity analysis summary  
    print(f"\n🔍 SIMILARITY ANALYSIS SUMMARY:")
    print(f"   • Total similarity pairs calculated: {len(similarity_results):,}")
    print(f"   • Average similarity score: {similarity_results['similarity_score'].mean():.3f}")
    print(f"   • Similarity score range: {similarity_results['similarity_score'].min():.3f} - {similarity_results['similarity_score'].max():.3f}")
    
    # Component analysis
    print(f"\n⚖️ SIMILARITY COMPONENTS:")
    print(f"   • Dimension similarity avg: {similarity_results['dimension_similarity'].mean():.3f}")
    print(f"   • Categorical similarity avg: {similarity_results['categorical_similarity'].mean():.3f}")
    print(f"   • Property similarity avg: {similarity_results['property_similarity'].mean():.3f}")
    
    # Top performing pairs
    print(f"\n🏆 TOP SIMILARITY PAIRS:")
    top5 = similarity_results.nlargest(5, 'similarity_score')
    for idx, row in top5.iterrows():
        print(f"   • {row['rfq_id'][:8]}... ↔ {row['match_id'][:8]}... (score: {row['similarity_score']:.3f})")

# Execute final deliverable creation
final_deliverable = create_final_deliverable(similarity_results)

# Create summary report
create_summary_report(similarity_results, feature_engineered_df)

print(f"\n🎉 TASK B COMPLETED SUCCESSFULLY!")
print(f"   All required deliverables have been generated.")
print(f"   The similarity pipeline is ready for production use.")

=== Creating Final Deliverable (top3.csv) ===

✓ Deliverable saved to: ../../results/top3.csv
✓ File contains 3000 similarity pairs

=== FINAL SUMMARY REPORT ===
Task B: RFQ Similarity Analysis - COMPLETED

📊 DATA PROCESSING SUMMARY:
   • Original RFQ records: 1,000
   • RFQs with reference data: 941
   • Grade properties parsed: 18 properties
   • Feature engineering: 157 total features

🔍 SIMILARITY ANALYSIS SUMMARY:
   • Total similarity pairs calculated: 3,000
   • Average similarity score: 0.579
   • Similarity score range: 0.225 - 0.837

⚖️ SIMILARITY COMPONENTS:
   • Dimension similarity avg: 0.068
   • Categorical similarity avg: 0.929
   • Property similarity avg: 0.911

🏆 TOP SIMILARITY PAIRS:
   • f19fd194... ↔ 86947c4f... (score: 0.837)
   • 86947c4f... ↔ f19fd194... (score: 0.837)
   • e2cf8e2c... ↔ 03b99bdc... (score: 0.836)
   • 03b99bdc... ↔ e2cf8e2c... (score: 0.836)
   • f19fd194... ↔ 03b99bdc... (score: 0.829)

🎉 TASK B COMPLETED SUCCESSFULLY!
   All required deliver

In [34]:
# Quick check - run this in your scripts/Task_B/ directory
from data_loader import load_rfq_data
from grade_normalizer import normalize_grades
from range_parser import parse_range_strings

rfq_df, reference_df = load_rfq_data()
enriched_df, _ = normalize_grades(rfq_df, reference_df)
parsed_df = parse_range_strings(enriched_df)

categorical_features = ['coating', 'finish', 'form', 'surface_type', 'surface_protection']
existing_features = [f for f in categorical_features if f in parsed_df.columns]
print('Existing categorical features:', existing_features)
print('Total found:', len(existing_features))


=== Data Loading ===

=== TASK B.1: Grade Normalization and Reference Join ===

Grade Analysis:
Unique normalized grades in RFQ: 147
Unique normalized grades in Reference: 164
Grades found in both datasets: 147
RFQ grades missing in reference: 0
Reference grades not in RFQ: 17

Join Results:
Total RFQ records: 1000
Records with reference data: 941
Records missing reference data: 59
Enriched RFQ shape: (1000, 61)
No duplicate rows created!

=== TASK B.1: Range String Parsing ===
Parsing Carbon (C)...
  Successfully parsed 940/941 values
Parsing Manganese (Mn)...
  Successfully parsed 940/941 values
Parsing Silicon (Si)...
  Successfully parsed 539/540 values
Parsing Sulfur (S)...
  Successfully parsed 929/929 values
Parsing Phosphorus (P)...
  Successfully parsed 929/929 values
Parsing Chromium (Cr)...
  Successfully parsed 63/63 values
Parsing Nickel (Ni)...
  Successfully parsed 22/22 values
Parsing Molybdenum (Mo)...
  Successfully parsed 41/41 values
Parsing Vanadium (V)...
  Succe