In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

In [64]:
# Input locations, given relative to the notebook's working directory.
# The resources root holds one sub-folder per task.
RESOURCES_DIR = Path("../resources/resources")
TASK1_DIR = RESOURCES_DIR.joinpath("task_1")
TASK2_DIR = RESOURCES_DIR.joinpath("task_2")

In [66]:
# Step 1: Confirm both supplier workbooks are present before trying to load them
supplier1_path = TASK1_DIR / "supplier_data_1.xlsx"
supplier2_path = TASK1_DIR / "supplier_data_2.xlsx"

# Query the filesystem once per workbook, then report the findings.
supplier1_found = supplier1_path.exists()
supplier2_found = supplier2_path.exists()

print("Checking for data files...")
print(f"Looking in: {TASK1_DIR}")
print(f"supplier_data_1.xlsx exists: {supplier1_found}")
print(f"supplier_data_2.xlsx exists: {supplier2_found}")

Checking for data files...
Looking in: ../resources/resources/task_1
supplier_data_1.xlsx exists: True
supplier_data_2.xlsx exists: True


In [67]:
# Step 2: Load supplier_data_1.xlsx and preview it
# The workbook is read with pandas' default options (first sheet, inferred dtypes).
supplier1_df = pd.read_excel(supplier1_path)

# Last expression of the cell: rendered as the preview table (first five rows).
supplier1_df.head(n=5)

Unnamed: 0,Werksgüte,Bestellgütentext,Nenndicke NNN.NN mm mit Dezimalpunkt,Breite,Länge,Gewicht (kg),Cluster,Si-Gehalt,Mn-Gehalt,P-Gehalt,...,Mo-Gehalt,V-Gehalt,Cu-Gehalt,Nb-Gehalt,Ti-Gehalt,Al-Gehalt,B-Gehalt,Streckgrenze,Zugfestigkeit,Dehnung
0,G2UB5,SZBS800,320,856.0,787.0,16.49,WB-G,,,,...,,,,,,,,,,
1,G2UJ5,SZBS800,339,918.0,707.0,17.16,WB-G,,,,...,,,,,,,,,,
2,C3318,LICRO 500,452,1839.0,300.0,18.7,WB-U,0.254,1.278,0.008,...,0.009,20.0,290.0,12.0,320.0,320.0,25.0,0.0,0.0,0.0
3,C3U15,S380MC mod. 4,532,1160.0,461.0,22.011,WB-U,0.225,1.063,0.01,...,0.006,30.0,150.0,320.0,989.0,430.0,1.0,0.0,0.0,0.0
4,G3UB5,SZBE800,451,727.2,557.0,14.02,Spaltband,,,,...,,,,,,,,,,


In [41]:
# Report the (rows, columns) dimensions of the first supplier table.
supplier1_shape = supplier1_df.shape
print(f"Shape: {supplier1_shape}")

Shape: (86, 23)


In [None]:
# Show the dtype pandas inferred for each column of the first supplier table.
supplier1_dtypes = supplier1_df.dtypes
print("\nData types:", supplier1_dtypes, sep="\n")


Data types:
Werksgüte                                object
Bestellgütentext                         object
Nenndicke NNN.NN mm mit Dezimalpunkt      int64
Breite                                   object
Länge                                   float64
Gewicht (kg)                            float64
Cluster                                  object
Si-Gehalt                                object
Mn-Gehalt                                object
P-Gehalt                                 object
S-Gehalt                                 object
Cr-Gehalt                                object
Ni-Gehalt                                object
Mo-Gehalt                                object
V-Gehalt                                float64
Cu-Gehalt                               float64
Nb-Gehalt                               float64
Ti-Gehalt                               float64
Al-Gehalt                               float64
B-Gehalt                                float64
Streckgrenze               

In [36]:
# Count the missing entries in each column of the first supplier table.
supplier1_null_counts = supplier1_df.isna().sum()
print("\nMissing values:", supplier1_null_counts, sep="\n")


Missing values:
Werksgüte                               20
Bestellgütentext                        16
Nenndicke NNN.NN mm mit Dezimalpunkt     0
Breite                                   0
Länge                                    0
Gewicht (kg)                             0
Cluster                                 12
Si-Gehalt                               31
Mn-Gehalt                               31
P-Gehalt                                31
S-Gehalt                                38
Cr-Gehalt                               31
Ni-Gehalt                               39
Mo-Gehalt                               38
V-Gehalt                                38
Cu-Gehalt                               38
Nb-Gehalt                               38
Ti-Gehalt                               38
Al-Gehalt                               38
B-Gehalt                                38
Streckgrenze                            38
Zugfestigkeit                           38
Dehnung                              

In [37]:
# Peek at the raw contents of every column of the first supplier table:
# for each one, print up to five distinct non-null values.
print("\nSample values per column:")
for col in supplier1_df.columns:
    non_null_values = supplier1_df[col].dropna()
    distinct_values = non_null_values.unique()
    unique_vals = distinct_values[:5]  # keep only the first five
    print(f"{col}: {unique_vals}")


Sample values per column:
Werksgüte: ['G2UB5' 'G2UJ5' 'C3318' 'C3U15' 'G3UB5']
Bestellgütentext: ['SZBS800' 'LICRO 500' 'S380MC mod. 4' 'SZBE800' '2A Lager']
Nenndicke NNN.NN mm mit Dezimalpunkt: [320 339 452 532 451]
Breite: ['856.0' '918.0' '1,839.0' '1,160.0' '727.2']
Länge: [787. 707. 300. 461. 557.]
Gewicht (kg): [16.49  17.16  18.7   22.011 14.02 ]
Cluster: ['WB-G' 'WB-U' 'Spaltband' 'ELO verzinkt' 'WBU']
Si-Gehalt: ['0.2540' '0.2250' '10' '0.2330' '0.1190']
Mn-Gehalt: ['1.2780' '1.0630' 'A' '1.0290' '0.7440']
P-Gehalt: ['0.0080' '0.0100' '610' '0.0090' '0.0130']
S-Gehalt: ['0.0010' '0.0020' 'technologische Werte (WBB)' '0.0040' '0']
Cr-Gehalt: ['0.4340' '0.0480' 'Standstelle/Verfärbungen (KB2)' '0.0600' '0']
Ni-Gehalt: ['0.0350' '0.0300' 'zu dick / Überdicke (WBB)' '0.0630' '0.0390']
Mo-Gehalt: ['0.0090' '0.0060'
 'Erledigen von Materialfehlern zu Material: (2336173004) Typ: (RG) Fehl'
 '0.0180' '0.0050']
V-Gehalt: [ 20.  30. 638.  10.   0.]
Cu-Gehalt: [290. 150. 776. 500. 780.

In [None]:
# Step 3: Load supplier_data_2.xlsx and preview it
# The workbook is read with pandas' default options (first sheet, inferred dtypes).
supplier2_df = pd.read_excel(supplier2_path)

# Last expression of the cell: rendered as the preview table (first five rows).
supplier2_df.head(n=5)

Unnamed: 0,PRODUCT_TYPE,ORDER_ID,SITE,MATERIAL_NAME,MATERIAL_NUMBER,MATERIAL_QUALITY_NORM,SURFACE_COATING,DEFECT_NOTES,NOMINAL_THICKNESS_MM,WIDTH_MM,...,HEIGHT_MM,MASS_MIN_KG,NUMBER_OF_COILS,DELIVERY_EARLIEST,DELIVERY_LATEST,INCO_TERM,BUY_NOW_EUR_PER_TON,MIN/MAX_BID_EUR_PER_TON,CO2_PER_TON_MAX_KG,VALID_UNTIL
0,SHEET,436765,1 company gmbh,S235JR,1.0038,DIN EN 10025,,DEKL-S235JR / D2A EID,11.859,1509,...,,2091,,,,FCA,600.0,,,20/02/2025 11:00
1,SHEET,436754,1 company gmbh,S355MC,1.0976,DIN EN 10149,,,8.057,1011,...,,2411,,,,FCA,600.0,,,20/02/2025 11:00
2,SHEET,436755,1 company gmbh,,1.0976,DIN EN 10149,,DEKL-S355MC / D2A WEH,8.057,1010,...,,2251,,,,FKA,600.0,,,2025-02-20 11:00
3,SHEET,436757,1 company gmbh,S355MC,1.0976,,,DEKL-S355MC / D2A WEH,8.057,1011,...,,2401,,,,FCA,600.0,,,2025-02-20 11:00
4,SHEET,436758,1 company gmbh,S355MC,1.0976,DN EN 10149,,DEKL-S355MC / D2A WEH,8.057,1010,...,,2401,,,,FCA,600.0,,,2025-02-20 11:00


In [68]:
# Report the (rows, columns) dimensions of the second supplier table.
supplier2_shape = supplier2_df.shape
print(f"Shape: {supplier2_shape}")

Shape: (136, 21)


In [69]:
# Show the dtype pandas inferred for each column of the second supplier table.
supplier2_dtypes = supplier2_df.dtypes
print("\nData types:", supplier2_dtypes, sep="\n")


Data types:
PRODUCT_TYPE                object
ORDER_ID                     int64
SITE                        object
MATERIAL_NAME               object
MATERIAL_NUMBER            float64
MATERIAL_QUALITY_NORM       object
SURFACE_COATING            float64
DEFECT_NOTES                object
NOMINAL_THICKNESS_MM       float64
WIDTH_MM                     int64
LENGTH_MM                  float64
HEIGHT_MM                  float64
MASS_MIN_KG                  int64
NUMBER_OF_COILS            float64
DELIVERY_EARLIEST          float64
DELIVERY_LATEST            float64
INCO_TERM                   object
BUY_NOW_EUR_PER_TON        float64
MIN/MAX_BID_EUR_PER_TON    float64
CO2_PER_TON_MAX_KG         float64
VALID_UNTIL                 object
dtype: object


In [70]:
# Count the missing entries in each column of the second supplier table.
supplier2_null_counts = supplier2_df.isna().sum()
print("\nMissing values:", supplier2_null_counts, sep="\n")


Missing values:
PRODUCT_TYPE                18
ORDER_ID                     0
SITE                         0
MATERIAL_NAME               20
MATERIAL_NUMBER             45
MATERIAL_QUALITY_NORM       17
SURFACE_COATING            136
DEFECT_NOTES                20
NOMINAL_THICKNESS_MM         0
WIDTH_MM                     0
LENGTH_MM                   81
HEIGHT_MM                  136
MASS_MIN_KG                  0
NUMBER_OF_COILS            136
DELIVERY_EARLIEST          136
DELIVERY_LATEST            136
INCO_TERM                   28
BUY_NOW_EUR_PER_TON         97
MIN/MAX_BID_EUR_PER_TON     11
CO2_PER_TON_MAX_KG         136
VALID_UNTIL                  0
dtype: int64


In [71]:
# Peek at the raw contents of every column of the second supplier table:
# for each one, print up to five distinct non-null values.
print("\nSample values per column:")
for col in supplier2_df.columns:
    non_null_values = supplier2_df[col].dropna()
    distinct_values = non_null_values.unique()
    unique_vals = distinct_values[:5]  # keep only the first five
    print(f"{col}: {unique_vals}")


Sample values per column:
PRODUCT_TYPE: ['SHEET' 'COIL_STRIP' 'SHET' 'COILS_STRIP']
ORDER_ID: [436765 436754 436755 436757 436758]
SITE: ['1 company gmbh']
MATERIAL_NAME: ['S235JR' 'S355MC' 'S355JR' 'DC06' 'S460MC']
MATERIAL_NUMBER: [1.0038 1.0976 1.0045 1.0873 1.033 ]
MATERIAL_QUALITY_NORM: ['DIN EN 10025' 'DIN EN 10149' 'DN EN 10149' 'DIN EN 10130' 'DN EN 10025']
SURFACE_COATING: []
DEFECT_NOTES: ['DEKL-S235JR  / D2A EID' 'DEKL-S355MC  / D2A WEH'
 'DEKL-S355JR  / D2A KRA' 'DC06  / D2A ZUB' 'DC06  / D2A UKK']
NOMINAL_THICKNESS_MM: [11.859  8.057 12.008  0.8    0.652]
WIDTH_MM: [1509 1011 1010 1508 1385]
LENGTH_MM: [3008. 2355. 2356. 2506. 2005.]
HEIGHT_MM: []
MASS_MIN_KG: [2091 2411 2251 2401 3541]
NUMBER_OF_COILS: []
DELIVERY_EARLIEST: []
DELIVERY_LATEST: []
INCO_TERM: ['FCA' 'FKA']
BUY_NOW_EUR_PER_TON: [600. 640.]
MIN/MAX_BID_EUR_PER_TON: [570. 610. 620. 550. 490.]
CO2_PER_TON_MAX_KG: []
VALID_UNTIL: ['20/02/2025 11:00' '2025-02-20 11:00' '2025-02-25 16:00'
 '25/02/2025 16:00' '202