In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the cleaned weld dataset and print its column / dtype / non-null summary.
weld_csv = '../data/welddb_cleaned.csv'
data = pd.read_csv(weld_csv)
data.info()

# Map of missing values: one row per sample, one column per feature; the
# cells flagged by isnull() show up in the contrasting colour.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(data.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_xlabel('Features')
ax.set_ylabel('Samples')
ax.set_title('Missing Values in the Dataset')
plt.show()

In [None]:
# Each sample can report up to five mechanical-property targets. Rows are
# grouped into four scenarios according to which targets are present:
# - all targets are present
# - all targets are present except Charpy impact toughness
# - only Charpy impact toughness is present
# - anything else (e.g. only two targets present)

target_labels = ['Yield strength / MPa',
                 'Ultimate tensile strength / MPa',
                 'Elongation / %',
                 'Reduction of Area / %',
                 'Charpy impact toughness / J']
charpy_label = target_labels[-1]

# Scenario name -> target columns expected to be present. Only the keys are
# used in this cell.
# NOTE(review): 'charpy_only' maps to a bare string while the other entries
# are lists; left unchanged here in case another cell relies on it.
scenarios = {
    'all_targets': target_labels,
    'all_targets_except_charpy': target_labels[:-1],
    'charpy_only': target_labels[-1],
    'other': []
}

# Explicit copy: later cells add a column to target_df, and working on a
# bare column selection of `data` triggers pandas' SettingWithCopyWarning.
target_df = data[target_labels].copy()
print(target_df.head())

# Classify every row at once from its number of missing targets instead of
# looping over rows with iterrows().
missing_per_row = target_df.isna().sum(axis=1)
charpy_missing = target_df[charpy_label].isna()

has_all = missing_per_row == 0
has_all_but_charpy = (missing_per_row == 1) & charpy_missing
# "Charpy only" means every other target is missing and Charpy is present.
has_only_charpy = (missing_per_row == len(target_labels) - 1) & ~charpy_missing

scenario_masks = {
    'all_targets': has_all,
    'all_targets_except_charpy': has_all_but_charpy,
    'charpy_only': has_only_charpy,
    'other': ~(has_all | has_all_but_charpy | has_only_charpy),
}

# Same key order as `scenarios`; plain ints so the dict prints cleanly.
scenarios_count = {name: int(scenario_masks[name].sum()) for name in scenarios}

print(scenarios_count)

# Share of the dataset that falls into each scenario. The values/labels are
# materialised as lists rather than passed as dict views.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(list(scenarios_count.values()),
       labels=list(scenarios_count.keys()),
       autopct='%1.1f%%',
       startangle=140)
ax.set_title('Data Distribution by Scenario')
plt.show()




In [None]:
# Pairwise correlation restricted to the int64 / float64 columns.
numeric_dtypes = ['int64', 'float64']
quantitative = data.select_dtypes(include=numeric_dtypes)
correlations = quantitative.corr()

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(correlations, cmap='viridis', annot=True, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()



In [None]:
# Count of missing entries per column, keeping only the columns that have at
# least one gap, ordered from the most to the least incomplete.
missing_values = (
    data.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
print(missing_values)

In [None]:
# Clustering: check whether the dataset contains groups of samples with
# similar characteristics.
# (KMeans, StandardScaler and PCA are imported in the notebook's import cell.)

N_COMPONENTS = 3    # PCA dimensions kept; the 3-D scatter below needs exactly 3
MAX_CLUSTERS = 10   # largest k tried by the elbow search
N_CLUSTERS = 3      # k retained after reading the elbow plot
RANDOM_STATE = 0    # fixed seed so the run is reproducible

# Drop the target columns and impute the remaining missing values with 0.
# NOTE(review): the scaler and PCA need numeric input -- this assumes every
# remaining column of `data` is numeric; confirm against the cleaned CSV.
features = data.drop(columns=target_labels).fillna(0)

# Standardise BEFORE the PCA. PCA follows variance, so on raw features the
# components are driven by whichever columns have the largest numeric range.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

pca = PCA(n_components=N_COMPONENTS, random_state=RANDOM_STATE)
features_pca = pca.fit_transform(features_scaled)

# Elbow method: inertia for k = 1..MAX_CLUSTERS. n_init is pinned because
# its default changed between scikit-learn releases.
cluster_range = range(1, MAX_CLUSTERS + 1)
inertia = []
for k in cluster_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=RANDOM_STATE)
    kmeans.fit(features_pca)
    inertia.append(kmeans.inertia_)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(cluster_range, inertia, marker='o')
ax.set_title('Elbow Method')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia')
plt.show()

# k = N_CLUSTERS was picked from the elbow plot; re-check the plot whenever
# the preprocessing above changes.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=RANDOM_STATE)
clusters = kmeans.fit_predict(features_pca)

# 3-D view of the clusters, drawn in the same PCA space they were fitted in.
fig = plt.figure(figsize=(12, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(features_pca[:, 0], features_pca[:, 1], features_pca[:, 2],
           c=clusters, cmap='viridis')
ax.set_xlabel('PCA 1')
ax.set_ylabel('PCA 2')
ax.set_zlabel('PCA 3')
ax.set_title('Clusters')
plt.show()

# Attach the cluster label to the targets. assign() builds a new frame, so
# the cell can be re-run safely and no view of `data` is mutated.
target_df = target_df.assign(Cluster=clusters)

# Distribution of each of the 5 targets within each cluster. The targets are
# continuous, so a boxplot per cluster is used; a countplot with the target
# as hue would create one bar and one legend entry per distinct value.
fig = plt.figure(figsize=(12, 6))
for i, target in enumerate(target_labels):
    ax = fig.add_subplot(2, 3, i + 1)
    sns.boxplot(x='Cluster', y=target, data=target_df, ax=ax)
    ax.set_title(target)
fig.tight_layout()
plt.show()









In [None]:
# Compare how each target is distributed across the clusters.
fig = plt.figure(figsize=(12, 6))

for position, target in enumerate(target_labels, start=1):
    ax = fig.add_subplot(2, 3, position)
    # Keep only the rows where this target has a value.
    target_by_cluster = target_df[[target, 'Cluster']].dropna()
    sns.histplot(
        target_by_cluster,
        x=target,
        hue='Cluster',
        multiple='stack',
        palette='viridis',
        kde=False,
        ax=ax,
    )
    ax.set_title(f'Distribution of {target}')
    ax.set_xlabel(target)
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()
