In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

In [2]:
RESOURCES_DIR = Path("../resources")
TASK1_DIR = RESOURCES_DIR / "task_1"
TASK2_DIR = RESOURCES_DIR / "task_2"

In [3]:
# Step 1: Check if data files exist
rfq_path = TASK2_DIR / "rfq.csv"
reference_path = TASK2_DIR / "reference_properties.tsv"

print(f"Checking for data files...")
print(f"Looking in: {TASK2_DIR}")
print(f"rfq.csv exists: {rfq_path.exists()}")
print(f"reference_properties.tsv exists: {reference_path.exists()}")

Checking for data files...
Looking in: ../resources/task_2
rfq.csv exists: True
reference_properties.tsv exists: True


In [6]:
# Step 2: Load and examine rfq.csv

rfq_df = pd.read_csv(rfq_path)
pd.set_option("display.max_columns", None)  # show all columns
pd.set_option("display.width", None)       # don't wrap columns
rfq_df.head()

Unnamed: 0,id,grade,grade_suffix,coating,finish,surface_type,surface_protection,form,thickness_min,thickness_max,width_min,width_max,length_min,height_min,height_max,weight_min,weight_max,inner_diameter_min,inner_diameter_max,outer_diameter_min,outer_diameter_max,yield_strength_min,yield_strength_max,tensile_strength_min,tensile_strength_max
0,8aff426d-b8c0-43aa-ad26-835ef4de6129,S700MC,,,Oiled,,,Coils,6.0,6.0,600.0,1520.0,,,,15000.0,25000.0,610.0,610.0,,,,,760.0,810.0
1,37e624be-b125-464f-85b6-1838530193ef,S250GD,,ZM310,Hot-dip zinc magnesium (+ZM),,,Slit Coils,1.5,1.5,327.0,327.0,,,,,,,,,,,,,
2,b8257184-6307-46ab-b06e-d979336d1263,DX51D,,Z100,Hot-dip Galvanized (+Z/+GI),,Lightly Oiled (L),Coils,0.4,0.4,1000.0,1500.0,,,,,,,,,,,,,
3,63140d1f-dda8-40fe-8931-bcaba65d5772,S235,,,,,,Round Tubes,1.5,1.5,,,4900.0,4900.0,,53800.0,53800.0,,,60.3,,,,,
4,11cffc57-44be-4d79-bfd5-97482be566d3,S235,,,,,,Round Tubes,1.5,1.5,,,6100.0,6100.0,,14500.0,14500.0,,,48.3,,,,,


In [7]:
print(f"Shape: {rfq_df.shape}")

Shape: (1000, 25)


In [8]:
print(f"\nData types:")
print(rfq_df.dtypes)


Data types:
id                       object
grade                    object
grade_suffix            float64
coating                  object
finish                   object
surface_type             object
surface_protection       object
form                     object
thickness_min           float64
thickness_max           float64
width_min               float64
width_max               float64
length_min              float64
height_min              float64
height_max              float64
weight_min              float64
weight_max              float64
inner_diameter_min      float64
inner_diameter_max      float64
outer_diameter_min      float64
outer_diameter_max      float64
yield_strength_min      float64
yield_strength_max      float64
tensile_strength_min    float64
tensile_strength_max    float64
dtype: object


In [9]:
print(f"\nMissing values:")
print(rfq_df.isnull().sum())


Missing values:
id                         0
grade                     59
grade_suffix            1000
coating                  682
finish                   335
surface_type             865
surface_protection       713
form                      73
thickness_min            167
thickness_max            166
width_min                461
width_max                348
length_min               869
height_min               868
height_max               998
weight_min               607
weight_max               369
inner_diameter_min       823
inner_diameter_max       822
outer_diameter_min       977
outer_diameter_max       980
yield_strength_min       964
yield_strength_max       984
tensile_strength_min     951
tensile_strength_max     949
dtype: int64


In [10]:
print("\nBasic statistics:")
print(rfq_df.describe(include='all'))


Basic statistics:
                                          id  grade  grade_suffix coating  \
count                                   1000    941           0.0     318   
unique                                  1000    158           NaN      62   
top     8aff426d-b8c0-43aa-ad26-835ef4de6129  DX51D           NaN    Z275   
freq                                       1     95           NaN      40   
mean                                     NaN    NaN           NaN     NaN   
std                                      NaN    NaN           NaN     NaN   
min                                      NaN    NaN           NaN     NaN   
25%                                      NaN    NaN           NaN     NaN   
50%                                      NaN    NaN           NaN     NaN   
75%                                      NaN    NaN           NaN     NaN   
max                                      NaN    NaN           NaN     NaN   

                             finish  \
count            

In [11]:
print("Unique values per column:")
for col in rfq_df.columns:
    uniques = rfq_df[col].unique()[:5]  # first 5 unique values
    print(f"{col}: {rfq_df[col].nunique()} unique values")
    print(f"  First 5 unique values: {list(uniques)}")

Unique values per column:
id: 1000 unique values
  First 5 unique values: ['8aff426d-b8c0-43aa-ad26-835ef4de6129', '37e624be-b125-464f-85b6-1838530193ef', 'b8257184-6307-46ab-b06e-d979336d1263', '63140d1f-dda8-40fe-8931-bcaba65d5772', '11cffc57-44be-4d79-bfd5-97482be566d3']
grade: 158 unique values
  First 5 unique values: ['S700MC', 'S250GD', 'DX51D', 'S235', 'S355MC']
grade_suffix: 0 unique values
  First 5 unique values: [np.float64(nan)]
coating: 62 unique values
  First 5 unique values: [nan, 'ZM310', 'Z100', 'Z075', 'Z080']
finish: 47 unique values
  First 5 unique values: ['Oiled', 'Hot-dip zinc magnesium (+ZM)', 'Hot-dip Galvanized (+Z/+GI)', nan, 'Galvanized']
surface_type: 14 unique values
  First 5 unique values: [nan, 'Conventional surface (A)', 'Minimized spangle, improved surface (MB)', 'Minimized spangle, conventional surface (MA)', 'Non-specified']
surface_protection: 15 unique values
  First 5 unique values: [nan, 'Lightly Oiled (L)', 'Oiled (O)', 'Phosphated, chemical

In [12]:
# Step 3: Load and examine supplier_data2.xlsx
reference_df = pd.read_csv(reference_path, sep='\t')
reference_df.head()

Unnamed: 0,Grade/Material,UNS_No,Steel_No,Standards,Carbon (C),Manganese (Mn),Silicon (Si),Sulfur (S),Phosphorus (P),Chromium (Cr),Nickel (Ni),Molybdenum (Mo),Vanadium (V),Tungsten (W),Cobalt (Co),Copper (Cu),Aluminum (Al),Titanium (Ti),Niobium (Nb),Boron (B),Nitrogen (N),Tensile strength (Rm),Yield strength (Re or Rp0.2),Elongation (A%),Reduction of area (Z%),"Hardness (HB, HV, HRC)",Impact toughness (Charpy V-notch),Fatigue limit,Creep resistance,Source_Pages,Application,Category,Nb + V + Ti (Others),Coating
0,S235JR,,,EN 10025-2:2019,≤0.17,≤1.40,≤0.40,≤0.035,≤0.035,,,,,,,,,,,,≤0.012,360-510 MPa,≥235 MPa,≥26%,,,27J at 20°C,,,Standard Specifications,General structural steels,Structural Steel,,
1,S275JR,,,EN 10025-2:2019,≤0.21,≤1.50,≤0.40,≤0.035,≤0.035,,,,,,,,,,,,≤0.012,410-560 MPa,≥275 MPa,≥23%,,,27J at 20°C,,,Standard Specifications,General structural steels,Structural Steel,,
2,S355JR,,,EN 10025-2:2019,≤0.24,≤1.60,≤0.55,≤0.035,≤0.035,,,,,,,,,,,,≤0.012,470-630 MPa,≥355 MPa,≥22%,,,27J at 20°C,,,Standard Specifications,General structural steels,Structural Steel,,
3,S420M,,,EN 10025-3:2019,≤0.20,1.00-1.70,≤0.50,≤0.025,≤0.030,≤0.30,≤0.50,≤0.10,,,,,,,,,≤0.015,520-680 MPa,≥420 MPa,≥19%,,,27J at -20°C,,,Standard Specifications,Thermomechanically rolled steels,High Strength Steel,,
4,S460M,,,EN 10025-3:2019,≤0.20,1.00-1.70,≤0.50,≤0.025,≤0.030,≤0.30,≤0.50,≤0.10,,,,,,,,,≤0.015,540-720 MPa,≥460 MPa,≥17%,,,27J at -20°C,,,Standard Specifications,Thermomechanically rolled steels,High Strength Steel,,


In [14]:
print(f"Shape: {reference_df.shape}")

Shape: (175, 34)


In [15]:
print(f"\nData types:")
print(reference_df.dtypes)


Data types:
Grade/Material                        object
UNS_No                               float64
Steel_No                             float64
Standards                             object
Carbon (C)                            object
Manganese (Mn)                        object
Silicon (Si)                          object
Sulfur (S)                            object
Phosphorus (P)                        object
Chromium (Cr)                         object
Nickel (Ni)                           object
Molybdenum (Mo)                       object
Vanadium (V)                          object
Tungsten (W)                         float64
Cobalt (Co)                          float64
Copper (Cu)                           object
Aluminum (Al)                         object
Titanium (Ti)                         object
Niobium (Nb)                          object
Boron (B)                             object
Nitrogen (N)                          object
Tensile strength (Rm)                 obje

In [16]:
print(f"\nMissing values:")
print(reference_df.isnull().sum())


Missing values:
Grade/Material                         0
UNS_No                               175
Steel_No                             175
Standards                              0
Carbon (C)                             0
Manganese (Mn)                         0
Silicon (Si)                          41
Sulfur (S)                            12
Phosphorus (P)                        12
Chromium (Cr)                        133
Nickel (Ni)                          159
Molybdenum (Mo)                      145
Vanadium (V)                         150
Tungsten (W)                         175
Cobalt (Co)                          175
Copper (Cu)                          174
Aluminum (Al)                         81
Titanium (Ti)                        146
Niobium (Nb)                         149
Boron (B)                            167
Nitrogen (N)                         158
Tensile strength (Rm)                  0
Yield strength (Re or Rp0.2)          11
Elongation (A%)                       21

In [17]:
print("\nBasic statistics:")
print(reference_df.describe(include='all'))


Basic statistics:
       Grade/Material  UNS_No  Steel_No                 Standards Carbon (C)  \
count             175     0.0       0.0                       175        175   
unique            175     NaN       NaN                        25         53   
top            S235JR     NaN       NaN  Automotive specification      ≤0.12   
freq                1     NaN       NaN                        67         52   
mean              NaN     NaN       NaN                       NaN        NaN   
std               NaN     NaN       NaN                       NaN        NaN   
min               NaN     NaN       NaN                       NaN        NaN   
25%               NaN     NaN       NaN                       NaN        NaN   
50%               NaN     NaN       NaN                       NaN        NaN   
75%               NaN     NaN       NaN                       NaN        NaN   
max               NaN     NaN       NaN                       NaN        NaN   

       Manganese (Mn

In [18]:
print("Unique values per column:")
for col in reference_df.columns:
    uniques = reference_df[col].unique()[:5]  # first 5 unique values
    print(f"{col}: {reference_df[col].nunique()} unique values")
    print(f"  First 5 unique values: {list(uniques)}")

Unique values per column:
Grade/Material: 175 unique values
  First 5 unique values: ['S235JR', 'S275JR', 'S355JR', 'S420M', 'S460M']
UNS_No: 0 unique values
  First 5 unique values: [np.float64(nan)]
Steel_No: 0 unique values
  First 5 unique values: [np.float64(nan)]
Standards: 25 unique values
  First 5 unique values: ['EN 10025-2:2019', 'EN 10025-3:2019', 'EN 10149-2:2013', 'EN 10130:2006', 'EN 10111:2008']
Carbon (C): 53 unique values
  First 5 unique values: ['≤0.17', '≤0.21', '≤0.24', '≤0.20', '≤0.12']
Manganese (Mn): 50 unique values
  First 5 unique values: ['≤1.40', '≤1.50', '≤1.60', '1.00-1.70', '≤1.70']
Silicon (Si): 19 unique values
  First 5 unique values: ['≤0.40', '≤0.55', '≤0.50', '≤0.60', nan]
Sulfur (S): 9 unique values
  First 5 unique values: ['≤0.035', '≤0.025', '≤0.015', '≤0.045', '≤0.030']
Phosphorus (P): 12 unique values
  First 5 unique values: ['≤0.035', '≤0.030', '≤0.025', '≤0.045', '≤0.120']
Chromium (Cr): 23 unique values
  First 5 unique values: [nan, '≤0