In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

In [16]:
# Input locations, all relative to the directory the notebook is run from.
# Each task keeps its files in its own sub-folder of the shared resources root.
RESOURCES_DIR = Path("..") / "resources"
TASK1_DIR = RESOURCES_DIR.joinpath("task_1")
TASK2_DIR = RESOURCES_DIR.joinpath("task_2")

In [17]:
# Step 1: Check if data files exist
# Both supplier workbooks are looked up directly inside TASK1_DIR; this cell
# only reports whether each one is present, it does not open them.
supplier1_path = TASK1_DIR.joinpath("supplier_data1.xlsx")
supplier2_path = TASK1_DIR.joinpath("supplier_data2.xlsx")

print("Checking for data files...")
print(f"Looking in: {TASK1_DIR}")
for workbook_path in (supplier1_path, supplier2_path):
    # .name is the bare file name, so the message reads "<file> exists: <bool>".
    print(f"{workbook_path.name} exists: {workbook_path.exists()}")

Checking for data files...
Looking in: ../resources/task_1
supplier_data1.xlsx exists: True
supplier_data2.xlsx exists: True


In [18]:
# Step 2: Load and examine supplier_data1.xlsx
# Read the workbook with pandas' defaults, then show the first five rows
# as the cell's output.
supplier1_df = pd.read_excel(io=supplier1_path)
supplier1_df.head(n=5)

Unnamed: 0,Quality/Choice,Grade,Finish,Thickness (mm),Width (mm),Description,Gross weight (kg),RP02,RM,Quantity,AG,AI
0,3rd,C200S,gebeizt und geglüht,2.77,1100,Längs- oder Querisse,13983,333.6,606.2,0.0,16.11,0.0054
1,3rd,C300S,ungebeizt,2.65,1075,Längs- oder Querisse,13047,717.7,0.0,0.0,16.11,0.0046
2,3rd,C100S,gebeizt und geglüht,2.2,1100,Kantenfehler - FS-Kantenrisse,14155,368.9,0.0,10.84,0.0,0.0061
3,2nd,C100S,gebeizt,2.86,1100,Längs- oder Querisse,11381,368.9,601.7,22.87,0.0,0.0062
4,1st,C300S,ungebeizt,2.88,1050,Sollmasse (Gewicht) unterschritten,10072,0.0,1213.0,22.87,0.0,0.0041


In [19]:
# Report the frame's dimensions as a (rows, columns) tuple.
print("Shape:", supplier1_df.shape)

Shape: (50, 12)


In [20]:
# Show the dtype pandas inferred for every column, under a heading that is
# preceded by a blank line.
print("\nData types:", supplier1_df.dtypes, sep="\n")


Data types:
Quality/Choice        object
Grade                 object
Finish                object
Thickness (mm)       float64
Width (mm)             int64
Description           object
Gross weight (kg)      int64
RP02                 float64
RM                   float64
Quantity             float64
AG                   float64
AI                   float64
dtype: object


In [26]:
# Count null cells per column (isna is the pandas alias of isnull).
supplier1_missing = supplier1_df.isna().sum()
print("\nMissing values:")
print(supplier1_missing)


Missing values:
Quality/Choice       0
Grade                0
Finish               0
Thickness (mm)       0
Width (mm)           0
Description          0
Gross weight (kg)    0
RP02                 0
RM                   0
Quantity             0
AG                   0
AI                   0
dtype: int64


In [25]:
# Summary statistics across every column; include="all" makes describe()
# cover the object columns as well as the numeric ones.
supplier1_summary = supplier1_df.describe(include="all")
print("\nBasic statistics:")
print(supplier1_summary)


Basic statistics:
       Quality/Choice  Grade               Finish  Thickness (mm)  \
count              50     50                   50       50.000000   
unique              3      3                    3             NaN   
top               3rd  C100S  gebeizt und geglüht             NaN   
freq               19     26                   19             NaN   
mean              NaN    NaN                  NaN        2.508800   
std               NaN    NaN                  NaN        0.273112   
min               NaN    NaN                  NaN        2.010000   
25%               NaN    NaN                  NaN        2.292500   
50%               NaN    NaN                  NaN        2.510000   
75%               NaN    NaN                  NaN        2.750000   
max               NaN    NaN                  NaN        2.980000   

         Width (mm)           Description  Gross weight (kg)       RP02  \
count     50.000000                    50          50.000000   50.00000   
un

In [27]:
# For each column, print how many distinct values it holds and a preview of
# up to five of them, in order of first appearance.
print("Unique values per column:")
for col, series in supplier1_df.items():
    distinct_count = series.nunique()
    preview = list(series.unique()[:5])  # first 5 unique values
    print(f"{col}: {distinct_count} unique values")
    print(f"  First 5 unique values: {preview}")

Unique values per column:
Quality/Choice: 3 unique values
  First 5 unique values: ['3rd', '2nd', '1st']
Grade: 3 unique values
  First 5 unique values: ['C200S', 'C300S', 'C100S']
Finish: 3 unique values
  First 5 unique values: ['gebeizt und geglüht', 'ungebeizt', 'gebeizt']
Thickness (mm): 39 unique values
  First 5 unique values: [np.float64(2.77), np.float64(2.65), np.float64(2.2), np.float64(2.86), np.float64(2.88)]
Width (mm): 5 unique values
  First 5 unique values: [np.int64(1100), np.int64(1075), np.int64(1050), np.int64(1000), np.int64(1150)]
Description: 3 unique values
  First 5 unique values: ['Längs- oder Querisse', 'Kantenfehler - FS-Kantenrisse', 'Sollmasse (Gewicht) unterschritten']
Gross weight (kg): 50 unique values
  First 5 unique values: [np.int64(13983), np.int64(13047), np.int64(14155), np.int64(11381), np.int64(10072)]
RP02: 4 unique values
  First 5 unique values: [np.float64(333.6), np.float64(717.7), np.float64(368.9), np.float64(0.0)]
RM: 4 unique values
 

In [28]:
# Step 3: Load and examine supplier_data2.xlsx
# Read the second workbook with pandas' defaults, then show the first five
# rows as the cell's output.
supplier2_df = pd.read_excel(io=supplier2_path)
supplier2_df.head(n=5)

Unnamed: 0,Material,Description,Article ID,Weight (kg),Quantity,Reserved
0,HDC,Material is Oiled,23048203,24469,52,NOT RESERVED
1,S235JR,Material is Oiled,23040547,16984,41,NOT RESERVED
2,S235JR,Material is Painted,23046057,9162,28,NOT RESERVED
3,DX51D +AZ150,Material is Oiled,23041966,12119,66,VANILLA
4,HDC,Material is Painted,23043884,17260,26,NOT RESERVED


In [29]:
# Report the frame's dimensions as a (rows, columns) tuple.
print("Shape:", supplier2_df.shape)

Shape: (50, 6)


In [30]:
# Show the dtype pandas inferred for every column, under a heading that is
# preceded by a blank line.
print("\nData types:", supplier2_df.dtypes, sep="\n")


Data types:
Material       object
Description    object
Article ID      int64
Weight (kg)     int64
Quantity        int64
Reserved       object
dtype: object


In [31]:
# Count null cells per column (isna is the pandas alias of isnull).
supplier2_missing = supplier2_df.isna().sum()
print("\nMissing values:")
print(supplier2_missing)


Missing values:
Material       0
Description    0
Article ID     0
Weight (kg)    0
Quantity       0
Reserved       0
dtype: int64


In [32]:
# Summary statistics across every column; include="all" makes describe()
# cover the object columns as well as the numeric ones.
supplier2_summary = supplier2_df.describe(include="all")
print("\nBasic statistics:")
print(supplier2_summary)


Basic statistics:
       Material            Description    Article ID   Weight (kg)   Quantity  \
count        50                     50  5.000000e+01     50.000000  50.000000   
unique        4                      3           NaN           NaN        NaN   
top      S235JR  Material is not Oiled           NaN           NaN        NaN   
freq         19                     18           NaN           NaN        NaN   
mean        NaN                    NaN  2.304538e+07  16307.520000  57.920000   
std         NaN                    NaN  2.876349e+03   5669.798014  25.197052   
min         NaN                    NaN  2.304027e+07   5302.000000  10.000000   
25%         NaN                    NaN  2.304345e+07  11367.750000  33.250000   
50%         NaN                    NaN  2.304551e+07  16660.000000  62.000000   
75%         NaN                    NaN  2.304778e+07  21278.000000  74.500000   
max         NaN                    NaN  2.304991e+07  24951.000000  98.000000   

        

In [33]:
# For each column, print how many distinct values it holds and a preview of
# up to five of them, in order of first appearance.
print("Unique values per column:")
for col, series in supplier2_df.items():
    distinct_count = series.nunique()
    preview = list(series.unique()[:5])  # first 5 unique values
    print(f"{col}: {distinct_count} unique values")
    print(f"  First 5 unique values: {preview}")

Unique values per column:
Material: 4 unique values
  First 5 unique values: ['HDC', 'S235JR', 'DX51D +AZ150', 'DX51D +Z140']
Description: 3 unique values
  First 5 unique values: ['Material is Oiled', 'Material is Painted', 'Material is not Oiled']
Article ID: 50 unique values
  First 5 unique values: [np.int64(23048203), np.int64(23040547), np.int64(23046057), np.int64(23041966), np.int64(23043884)]
Weight (kg): 50 unique values
  First 5 unique values: [np.int64(24469), np.int64(16984), np.int64(9162), np.int64(12119), np.int64(17260)]
Quantity: 38 unique values
  First 5 unique values: [np.int64(52), np.int64(41), np.int64(28), np.int64(66), np.int64(26)]
Reserved: 2 unique values
  First 5 unique values: ['NOT RESERVED', 'VANILLA']
