In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the cleaned weld database and print its column/dtype summary.
data = pd.read_csv('../data/welddb_cleaned.csv')
data.info()

# Missingness map: one row per sample, one column per feature; cells where
# the value is missing are drawn in the contrasting colour.
missing_mask = data.isnull()
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(missing_mask, cbar=False, cmap='viridis', ax=ax)
ax.set_xlabel('Features')
ax.set_ylabel('Samples')
ax.set_title('Missing Values in the Dataset')
plt.show()

In [None]:
# Each sample falls into one of four scenarios, depending on which target
# values are present:
# - all_targets: every target value is present
# - all_targets_except_charpy: only Charpy impact toughness is missing
# - charpy_only: Charpy impact toughness is the only target present
# - other: any other combination (for example only 2 target values present)

target_labels = ['Yield strength / MPa',
                 'Ultimate tensile strength / MPa',
                 'Elongation / %',
                 'Reduction of Area / %',
                 'Charpy impact toughness / J']

# Every scenario maps to a *list* of column labels (possibly empty), so all
# entries can be used interchangeably as column selectors.
scenarios = {
    'all_targets': target_labels,
    'all_targets_except_charpy': target_labels[:-1],
    'charpy_only': target_labels[-1:],
    'other': []
}

# Take an explicit copy: target_df is then independent of `data`, which avoids
# pandas' SettingWithCopyWarning if target_df is modified later on.
# Missing entries are already NaN after read_csv, so no fillna is needed.
target_df = data[target_labels].copy()
print(target_df.head())

# Count how many samples fall into each missing-target scenario.
# Start every known scenario at zero so the key order follows `scenarios`.
scenarios_count = dict.fromkeys(scenarios, 0)

# Vectorised classification: one boolean mask per scenario instead of a
# Python-level loop over rows.
missing_per_row = target_df.isna().sum(axis=1)
charpy_missing = target_df['Charpy impact toughness / J'].isna()
# "Charpy only" means every target column except one is missing.
all_but_one = target_df.shape[1] - 1

is_all_targets = missing_per_row == 0
is_all_except_charpy = (missing_per_row == 1) & charpy_missing
is_charpy_only = (missing_per_row == all_but_one) & ~charpy_missing
# The three masks above are mutually exclusive; everything else is "other".
is_other = ~(is_all_targets | is_all_except_charpy | is_charpy_only)

# Cast to plain int so the printed dict shows ordinary Python integers.
scenarios_count['all_targets'] = int(is_all_targets.sum())
scenarios_count['all_targets_except_charpy'] = int(is_all_except_charpy.sum())
scenarios_count['charpy_only'] = int(is_charpy_only.sum())
scenarios_count['other'] = int(is_other.sum())

print(scenarios_count)

# Pie chart of the share of samples in each scenario.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    scenarios_count.values(),
    labels=scenarios_count.keys(),
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('Data Distribution by Scenario')
plt.show()




In [None]:
# Correlation heatmap of the numeric features.
# 'number' selects every numeric dtype (any integer/float width) rather than
# only int64/float64, so no numeric column is silently left out.
quantitative = data.select_dtypes(include='number')
correlations = quantitative.corr()

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(correlations, cmap='viridis', annot=True, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

