### Task 2 RFQs similarity

In [457]:
import pandas as pd
import re

In [None]:
# Load the RFQ table (CSV).
# NOTE(review): this is an absolute, user-specific path, so the cell only runs on the
# author's machine; consider resolving it from a configurable data directory.
rfq_data = pd.read_csv("/Users/hyosangkim/Desktop/repo/vanilla-steel/data/task_2/rfq.csv")

In [None]:
# Load the grade reference properties (tab-separated file).
# NOTE(review): this is an absolute, user-specific path, so the cell only runs on the
# author's machine; consider resolving it from a configurable data directory.
rfq_ref = pd.read_csv("/Users/hyosangkim/Desktop/repo/vanilla-steel/data/task_2/reference_properties.tsv", sep='\t')

#### Task B.1
- Normalize the grade (uppercase, split with suffixes, aliases handling)
- Flag grade without reference.
- Parse range strings to min/max with unit
- Join reference with CSV data.

In [460]:
# normalize the grade key
def split_grade_and_suffix(grade: str):
    """
    Normalize a grade string and split it into a base and a suffix.

    The grade is stripped of surrounding whitespace and upper-cased, then split
    once at the first character that is neither alphanumeric nor a dot (dots are
    kept so keys such as "1.4301" stay intact). The separator itself is dropped;
    any later separators remain inside the suffix.

    Parameters
    ----------
    grade : str
        Raw grade value. Non-string or blank values are treated as missing.

    Returns
    -------
    tuple
        (base, suffix). ``suffix`` is None when the grade contains no separator,
        and "" when the separator is the last character. Missing input yields
        (None, None).
    """
    if not isinstance(grade, str) or not grade.strip():
        return None, None

    # Strip first: otherwise a leading space is taken as the separator and the
    # whole grade ends up in the suffix with an empty base.
    grade = grade.strip().upper()

    for position, ch in enumerate(grade):
        if not ch.isalnum() and ch != ".":
            return grade[:position], grade[position + 1:]
    return grade, None

def _grade_parts(raw_grade):
    """Wrap the (base, suffix) pair in a Series so ``apply`` expands it into two columns."""
    return pd.Series(split_grade_and_suffix(raw_grade))


# RFQ side: the raw "grade" column is replaced by its base and the suffix is stored next to it.
# NOTE: because "grade" is overwritten here, re-running this cell re-splits the base only
# and resets "grade_suffix"; reload rfq_data before re-running.
rfq_data[["grade", "grade_suffix"]] = rfq_data["grade"].apply(_grade_parts)

# Reference side: the key is derived from "Grade/Material", which is left untouched.
rfq_ref[["grade", "grade_suffix"]] = rfq_ref["Grade/Material"].apply(_grade_parts)

In [461]:
# Build the join key (base followed directly by the suffix, missing parts as "")
# on both tables, then flag RFQ rows whose key does not occur in the reference.
for frame in (rfq_data, rfq_ref):
    frame["grade_joined"] = frame["grade"].fillna("") + frame["grade_suffix"].fillna("")

grades_from_ref = set(rfq_ref["grade_joined"].dropna())
is_known_grade = rfq_data["grade_joined"].isin(grades_from_ref)
rfq_data["missing_in_ref"] = ~is_known_grade

In [462]:
# Collect the distinct join keys that were flagged as missing from the reference and print them.
# An empty-string key corresponds to RFQ rows whose grade and suffix were both missing,
# because grade_joined is built with fillna("").
unique_missing = set(rfq_data.loc[rfq_data["missing_in_ref"], "grade_joined"])
print("Unique missing grades:", unique_missing) 

Unique missing grades: {''}


In [463]:
# Reference columns that are NOT parsed as "min - max unit" ranges.
# They are dropped before range parsing and kept as metadata afterwards.
columns_except = [
    # original grade label and the keys derived from it
    "Grade/Material",
    "grade",
    "grade_suffix", 
    "grade_joined",
    # remaining non-range columns
    # NOTE(review): presumably identifiers / free-text descriptions - confirm against the TSV
    "UNS_No", 
    "Steel_No",
    "Standards",
    "Impact toughness (Charpy V-notch)",
    "Fatigue limit",
    "Creep resistance",
    "Source_Pages",
    "Application",
    "Category",
    "Coating"
]

In [464]:
# Reference columns that hold range-style values: everything except the metadata columns.
rfq_ref_ranges = rfq_ref.drop(columns=columns_except)

In [465]:
# Inspect which non-alphanumeric symbols occur in the range values.
# Missing cells are skipped explicitly: the previous `.astype(str).fillna("")` filled
# nothing, because astype(str) had already turned NaN into the text "nan".
all_text = "".join(
    str(cell)
    for cell in rfq_ref_ranges.values.ravel()  # 2D values flattened to 1D
    if not pd.isna(cell)
)
range_symbols = {ch for ch in all_text if not ch.isalnum()}
range_symbols

{' ', '%', '(', ')', '-', '.', '×', '≤', '≥'}

In [466]:
# Names of the reference columns whose values are range strings.
range_columns = rfq_ref_ranges.columns.tolist()

In [467]:
def parse_range_with_unit(value: str):
    """
    Parse a range string into (min, max, unit).

    Supported forms (spaces are ignored, the unit is optional):
      - closed range   '150 - 440 MPa'            -> (150.0, 440.0, 'MPa')
      - lower bound    '≥280 MPa', '>=280', '>280' -> (280.0, None, unit)
      - upper bound    '≤20 mm', '<=20', '<20'     -> (None, 20.0, unit)
      - single number  '280 MPa'                   -> (280.0, 280.0, 'MPa')

    Parameters
    ----------
    value : str
        Raw cell value. Missing values (NaN/None) return (None, None, None).

    Returns
    -------
    tuple
        (min, max, unit). min/max are floats or None. unit is the trailing run of
        letters or '%' in the text, or None. Text that matches none of the
        supported forms yields (None, None, unit).
    """
    if pd.isna(value):
        return (None, None, None)

    text = str(value).strip()

    # The unit is the trailing run of letters / '%'.
    unit_match = re.search(r"([a-zA-Z%]+)$", text)
    unit = unit_match.group(1) if unit_match else None

    # Drop every letter, '%' and space so only digits and operators remain.
    numeric_text = re.sub(r"[a-zA-Z%]", "", text).replace(" ", "")

    number = r"(\d+(?:\.\d+)?)"

    # Closed range "150-440" (hyphen or en dash).
    range_match = re.fullmatch(rf"{number}[-–]{number}", numeric_text)
    if range_match:
        return (float(range_match.group(1)), float(range_match.group(2)), unit)

    # Lower bound. The operator is mandatory here: when it was optional, bare
    # numbers were swallowed by this branch and never reached the single-number case.
    lower_match = re.fullmatch(rf"(?:≥|>=|>){number}", numeric_text)
    if lower_match:
        return (float(lower_match.group(1)), None, unit)

    # Upper bound.
    upper_match = re.fullmatch(rf"(?:≤|<=|<){number}", numeric_text)
    if upper_match:
        return (None, float(upper_match.group(1)), unit)

    # Single number: an exact value, so min == max.
    single_match = re.fullmatch(number, numeric_text)
    if single_match:
        exact = float(single_match.group(1))
        return (exact, exact, unit)

    return (None, None, unit)

In [468]:
def parse_columns_ranges(df: pd.DataFrame, columns: list):
    """
    Expand range-string columns into ``<col>_min``, ``<col>_max`` and ``<col>_unit``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the range-string columns. It is not modified.
    columns : list
        Names of the columns in ``df`` to parse with ``parse_range_with_unit``.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with three extra columns per parsed column.
    """
    # Work on a copy: writing into the caller's frame made the result an alias of
    # the input, so cells that inspected the input afterwards saw the new columns.
    parsed = df.copy()
    for col in columns:
        new_columns = [f"{col}_min", f"{col}_max", f"{col}_unit"]
        parsed[new_columns] = parsed[col].apply(
            lambda x: pd.Series(parse_range_with_unit(x))  # Series expands into the three columns
        )
    return parsed

In [469]:
# Parse the range-valued reference columns: each column listed in range_columns
# gains a <col>_min, <col>_max and <col>_unit column.
rfq_ref_parsed = parse_columns_ranges(rfq_ref_ranges, range_columns)

In [470]:
def _snake_case(column_name: str) -> str:
    """Lower-case a column name and replace spaces with underscores."""
    return column_name.lower().replace(" ", "_")


# Normalize the column names. rename() returns a new frame; assigning to `.columns`
# renamed the frame in place, which also renames rfq_ref_ranges whenever
# rfq_ref_parsed is the same object, and then re-running the parsing cell fails.
rfq_ref_parsed = rfq_ref_parsed.rename(columns=_snake_case)

# Keep only the parsed range columns (those ending in _min, _max or _unit).
rfq_ref_parsed = rfq_ref_parsed[[c for c in rfq_ref_parsed.columns if c.endswith(("_min", "_max", "_unit"))]]

# Non-range (metadata) columns, with the same name normalization.
# The raw grade label and the split parts are dropped: grade_joined is the key that is kept.
rfq_ref_meta = (
    rfq_ref[columns_except]
    .rename(columns=_snake_case)
    .drop(columns=['grade/material', 'grade_suffix', 'grade'])
)


In [471]:
# Put the metadata columns and the parsed range columns side by side (axis=1).
# Both frames are derived from rfq_ref, so rows are aligned on its index.
rfq_ref_b1 = pd.concat([rfq_ref_meta, rfq_ref_parsed], axis=1)

In [472]:
# Left join: every RFQ row is kept and enriched with the reference properties of its grade key.
# Column names present in both frames (other than the key) get the "_rfq" / "_ref" suffix.
# NOTE(review): if grade_joined is not unique in the reference, RFQ rows are duplicated - confirm.
joined_rfq = pd.merge(rfq_data, rfq_ref_b1, on="grade_joined", how="left", suffixes=("_rfq", "_ref"))

After join
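1. Drop columns that are entirely empty, and flag (rather than drop) rows that have no grade; every row has an identifier.
2. Keep nulls as they are for the categorical values.
3. Impute range bounds: if only one of min/max is present, copy it into the missing bound.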

In [473]:
# Null handling for everything except the reference *_min / *_max columns.
columns_remain = [name for name in rfq_ref_parsed.columns if name.endswith(("_min", "_max"))]
columns_to_check = [name for name in joined_rfq.columns if name not in columns_remain]

# Treat empty strings as missing values.
joined_rfq = joined_rfq.replace("", pd.NA)

# Remove, in one pass, every checked column that holds no value at all.
empty_columns = [name for name in columns_to_check if joined_rfq[name].isna().all()]
joined_rfq = joined_rfq.drop(columns=empty_columns)

# Every row has an id, but some have no grade: flag those rows instead of dropping them.
joined_rfq['missing_grade'] = joined_rfq['grade'].isna()

In [474]:
# Impute one-sided ranges: when only one of <feature>_min / <feature>_max is present
# in a row, copy it into the missing bound.
range_feature = sorted({c.rsplit("_", 1)[0] for c in joined_rfq.columns if c.endswith(("_min", "_max"))})

for feat in range_feature:
    min_col, max_col = f"{feat}_min", f"{feat}_max"
    # Some features only have a _min or only a _max column; those are left untouched.
    if min_col in joined_rfq.columns and max_col in joined_rfq.columns:
        joined_rfq[min_col] = joined_rfq[min_col].fillna(joined_rfq[max_col])
        joined_rfq[max_col] = joined_rfq[max_col].fillna(joined_rfq[min_col])

# The two mis-indented fillna lines that followed the loop were removed: they made the
# cell fail with an IndentationError and only repeated the loop body for the last feature.


#### Task B.2
1. Represent the dimensions as intervals and calculate their overlap ratios.
2. Define similarity for categorical values as an exact match.
3. Represent the grade properties by the midpoints of their ranges (e.g. the chemical composition features).

In [None]:
# Get features to calculate the dimensions overlapping
dimension_features_with_ranges = ['thickness_min', 'thickness_max',
       'width_min', 'width_max', 'length_min', 'height_min', 'height_max',
       'weight_min', 'weight_max', 'inner_diameter_min', 'inner_diameter_max',
       'outer_diameter_min', 'outer_diameter_max', 'yield_strength_min',
       'yield_strength_max', 'tensile_strength_min', 'tensile_strength_max']

# Base feature names without the _min/_max suffix, de-duplicated in first-appearance
# order. A plain set would give an order that can change between kernel sessions,
# and with it the order of the derived interval columns.
dimension_features = list(dict.fromkeys(
    feat.rsplit("_", 1)[0] for feat in dimension_features_with_ranges
))

In [476]:
# Work on a copy of the dimension columns so that the interval columns added
# afterwards do not end up in joined_rfq.
rfq_dimensions = joined_rfq[dimension_features_with_ranges].copy()

In [477]:
# Build one "<feature>_interval" column of (min, max) tuples per dimension feature.
for feat in dimension_features:
    min_feat, max_feat = f"{feat}_min", f"{feat}_max"
    has_min = min_feat in rfq_dimensions.columns
    has_max = max_feat in rfq_dimensions.columns

    if not (has_min or has_max):
        continue

    if has_min and has_max:
        intervals = rfq_dimensions.apply(lambda row: (row[min_feat], row[max_feat]), axis=1)
    else:
        # Only one bound exists for this feature: use it for both ends of the interval.
        single_bound = min_feat if has_min else max_feat
        intervals = rfq_dimensions[single_bound].apply(lambda v: (v, v))

    rfq_dimensions[f"{feat}_interval"] = intervals

In [478]:
# Suggestion: calculate the Intersection over Union (IoU)
def calculate_iou(interval1: tuple, interval2: tuple):
    """
    Intersection over Union of two 1-D intervals.

    Parameters
    ----------
    interval1, interval2 : tuple
        (lower, upper) pairs. Reversed bounds are tolerated and swapped.

    Returns
    -------
    float or None
        Overlap length divided by the length of the span covered by both
        intervals, in [0, 1]. None when any bound is missing. Two identical
        point intervals score 1.0. A point compared with a wider interval
        scores 0.0, because the overlap has zero length.
    """
    a1, a2 = interval1
    b1, b2 = interval2

    if pd.isna(a1) or pd.isna(a2) or pd.isna(b1) or pd.isna(b2):
        return None

    # Normalize so that the lower bound always comes first.
    a1, a2 = min(a1, a2), max(a1, a2)
    b1, b2 = min(b1, b2), max(b1, b2)

    span = max(a2, b2) - min(a1, b1)
    if span == 0:
        # All four bounds coincide (e.g. min == max after imputation on both
        # sides): the intervals are identical, so they match perfectly.
        return 1.0

    overlap = max(0, min(a2, b2) - max(a1, b1))
    return overlap / span

The dataframe `rfq_dimensions` represents the dimensions requested in each buyer's RFQ as intervals; the Intersection over Union (IoU) of two such intervals can be calculated with the function `calculate_iou`.

In [None]:
# Similarity definition
# Categorical RFQ attributes that are compared by exact match. The frame is a copy,
# so later changes to it do not affect joined_rfq.
# NOTE(review): 'coating_rfq' presumably is the RFQ-side coating column, renamed by the
# "_rfq" merge suffix - confirm against the merge output.
df_categorical_similarity = joined_rfq[['coating_rfq', 'finish', 'surface_type', 'form']].copy()

def calculate_similarity_in_categoricals(value1: str, value2: str):
    """
    Exact-match similarity of two categorical values.

    Returns 1 when the values are equal, 0 when they differ, and None when
    either value is missing (so the feature can be left out of an average).
    """
    either_missing = pd.isna(value1) or pd.isna(value2)
    if either_missing:
        return None
    return 1 if value1 == value2 else 0
    
categorical_features = df_categorical_similarity.columns.tolist()

def rfq_categorical_similarity(row1, row2, features=categorical_features):
    """
    Average exact-match similarity of two RFQ rows over the categorical features.

    Features for which either row has a missing value are skipped. Returns the mean
    of the remaining 0/1 scores, or None when no feature could be compared.
    """
    per_feature = (
        calculate_similarity_in_categoricals(row1[feat], row2[feat])
        for feat in features
    )
    comparable = [score for score in per_feature if score is not None]
    if not comparable:
        return None
    return sum(comparable) / len(comparable)

The function `rfq_categorical_similarity` scores each categorical feature that is present in both RFQs and returns the average of those scores as the categorical similarity between the two RFQs.

In [None]:
# Grade properties
# Minimum share of rows that must have a midpoint for the feature to be kept.
MIN_COVERAGE = 0.1

# Candidate features: base names of every column that is not a dimension column.
# Sorted so that the *_mid columns are created in the same order on every run;
# iterating a set of strings can give a different order in each kernel session.
non_dimension_columns = [c for c in joined_rfq.columns if c not in dimension_features_with_ranges]
range_feature = sorted({feat.rsplit("_", 1)[0] for feat in non_dimension_columns})

for feat in range_feature:
    min_feat, max_feat = f"{feat}_min", f"{feat}_max"
    mid_feat = f"{feat}_mid"
    # Only features that really have both bounds get a midpoint column.
    if min_feat in joined_rfq.columns and max_feat in joined_rfq.columns:
        joined_rfq[mid_feat] = joined_rfq[[min_feat, max_feat]].mean(axis=1)

# Get the properties columns.
mid_cols = [c for c in joined_rfq.columns if c.endswith("_mid")]
# Despite its name, this holds the share of non-missing values (coverage) per column.
sparsity = joined_rfq[mid_cols].notna().mean()

# Drop features with <10% coverage - Strictly remove the columns.
sparse_cols = sparsity[sparsity < MIN_COVERAGE].index
joined_rfq = joined_rfq.drop(columns=sparse_cols)

Grade properties, especially the chemical compositions, are represented by numerical values: the midpoints of the ranges provided in the reference.

#### Task B.3 Similarity Calculation 