# Essai modélisation
### Imports

In [112]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

import warnings
warnings.filterwarnings("ignore")

In [113]:
# Load the preprocessed dataset through a path relative to the notebook rather
# than an absolute, user-specific Windows path, so the cell runs on any machine.
# NOTE(review): assumes the notebook sits next to the `data` folder, like the
# "../data/welddb.data" read performed later in this notebook — confirm.
df = pd.read_csv('../data/preprocessed_data.csv')

In [114]:
# Sanity check: (number of rows, number of columns) of the loaded frame
df.shape

(1463, 33)

In [115]:
# List every column name of the loaded frame
df.columns

Index(['C concentration (weight%)', 'Si concentration (weight%)',
       'Mn concentration (weight%)', 'S concentration (weight%)',
       'P concentration (weight%)', 'V concentration (weight%)',
       'O concentration (ppm/weight)', 'Ti concentration (ppm/weight)',
       'N concentration (ppm/weight)', 'Al concentration (ppm/weight)',
       'Current (A)', 'Voltage (V)', 'Heat input (kJ/mm)',
       'Interpass temperature (deg C)',
       'Post weld heat treatment temperature (deg C)',
       'Post weld heat treatment time (hours)', 'Yield strength (MPa)',
       'Ultimate tensile strength (MPa)', 'Elongation (%)',
       'Reduction of Area (%)', 'Charpy temperature (deg C)',
       'Charpy impact toughness (J)', 'AC', 'electrode positive',
       'Type of weld_GMAA', 'Type of weld_GTAA', 'Type of weld_MMA',
       'Type of weld_NGGMA', 'Type of weld_NGSAW', 'Type of weld_SA',
       'Type of weld_SAA', 'Type of weld_ShMA', 'Type of weld_TSA'],
      dtype='object')

## Stratégie cible
On se rend compte qu'il n'y a pas de variable cible claire. Je propose donc d'établir une définition de la cible basée sur une combinaison des seuils sur les tests. Cela me permettra d'avoir une variable cible booléenne. 
 
Voici la liste des colonnes représentant les tests : 

In [116]:
# Columns holding the mechanical test measurements used to build the target
col_test = [
    'Yield strength (MPa)',
    'Ultimate tensile strength (MPa)',
    'Elongation (%)',
    'Reduction of Area (%)',
    'Charpy impact toughness (J)',
]

In [117]:
# Count, for every row, how many of the test columns actually hold a value
dfd = df.copy()
dfd['nb'] = 0
Lt = []
for test_col in col_test:
    flag_col = test_col + '_test'
    # 1 when the measurement is present, 0 when it is missing
    dfd[flag_col] = dfd[test_col].notna().astype(int)
    Lt.append(flag_col)
    dfd['nb'] += dfd[flag_col]

dfd['nb'].value_counts()


nb
2    622
4    500
6    134
0     82
1     63
3     52
5     10
Name: count, dtype: int64

Sur 1463 pièces, 82 n'ont aucun test. Vu leur faible nombre, on peut sûrement les supprimer.

In [118]:
# Frame dimensions, followed by the number of distinct values per test column
print(df.shape,'\n')
for name, n_distinct in df[col_test].nunique().items():
    print(name,':',n_distinct)

(1463, 33) 

Yield strength (MPa) : 325
Ultimate tensile strength (MPa) : 303
Elongation (%) : 135
Reduction of Area (%) : 122
Charpy temperature (deg C) : 138
Charpy impact toughness (J) : 143


Sur plus de 1400 lignes, on se rend compte que les tests prennent relativement peu de valeurs distinctes. Il serait donc intéressant d'appliquer un algorithme des plus proches voisins (KNN) afin d'imputer les valeurs manquantes. De plus, les variables sont déjà prêtes pour un KNN puisqu'elles sont normalisées.
## Application KNN

In [119]:
# Quantitative features: every float64 column that is not a test measurement
quanti = [
    name for name in df.columns
    if df[name].dtype == np.float64 and name not in col_test
]

In [121]:
# Result frame: every non-test column of df; the imputed test columns are
# added to it one by one below. (The loop always wrote into `df2`, but the
# frame was never created, so the cell failed on a fresh kernel.)
df2 = df.drop(columns=col_test)

for c in col_test:
    # Quantitative features used for the neighbour search, plus the single
    # test column `c` whose missing values are to be imputed
    col = quanti + [c]

    # KNN imputation on the features + `c`; every column with missing values
    # is filled in the same pass, but only `c` is kept afterwards
    imputer = KNNImputer(n_neighbors=5)
    imputed = imputer.fit_transform(df[col])

    # fit_transform returns a bare array: rebuild a labelled frame
    # (reset_index keeps the extra 'index' column of the original cell)
    dfknn = pd.DataFrame(imputed, columns=col).reset_index()

    # Rows are in the same order as df, so the imputed column is copied by
    # position and cannot be misaligned by index labels
    df2[c] = dfknn[c].to_numpy()

In [135]:
# Display the intermediate frame left by the last loop iteration
# (quantitative features plus the last imputed test column)
dfknn

Unnamed: 0,index,C concentration (weight%),Si concentration (weight%),Mn concentration (weight%),S concentration (weight%),P concentration (weight%),V concentration (weight%),O concentration (ppm/weight),Ti concentration (ppm/weight),N concentration (ppm/weight),Al concentration (ppm/weight),Current (A),Voltage (V),Heat input (kJ/mm),Interpass temperature (deg C),Post weld heat treatment temperature (deg C),Post weld heat treatment time (hours),Charpy impact toughness (J)
0,0,-1.794624,-0.241903,-1.474090,-0.150353,-0.062002,1.542981e-17,-1.143271e-16,0.0,0.000000,8.694071e-17,-0.633763,-0.553201,-0.561308,-0.117894,-0.146947,1.374196,-0.266594
1,1,-1.794624,-0.241903,-1.474090,-0.150353,-0.062002,1.542981e-17,-1.143271e-16,0.0,0.000000,8.694071e-17,-0.633763,-0.553201,-0.561308,-0.117894,1.009610,-0.538309,0.336142
2,2,-1.794624,-0.157085,-0.467487,-0.234530,0.034196,1.542981e-17,-1.143271e-16,0.0,0.000000,8.694071e-17,-0.633763,-0.553201,-0.561308,-0.117894,-0.146947,1.374196,-0.266594
3,3,-1.794624,-0.157085,-0.467487,-0.234530,0.034196,1.542981e-17,-1.143271e-16,0.0,0.000000,8.694071e-17,-0.633763,-0.553201,-0.561308,-0.117894,1.009610,-0.538309,0.336142
4,4,-1.474696,0.182188,0.592095,-0.234530,0.034196,1.542981e-17,-1.143271e-16,0.0,0.000000,8.694071e-17,-0.633763,-0.553201,-0.561308,-0.117894,-0.146947,1.374196,-0.567962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1458,1458,1.084728,0.267007,-0.732382,-0.150353,0.130393,2.066602e-01,-4.697737e-01,0.0,3.215642,8.694071e-17,0.774825,0.137475,0.184398,-0.117894,1.640459,-0.538309,-1.112099
1459,1459,1.084728,0.182188,-0.811851,-0.150353,0.130393,1.662291e-01,-4.968866e-01,0.0,3.025960,8.694071e-17,0.774825,0.137475,0.184398,-0.117894,1.640459,-0.538309,-1.112099
1460,1460,0.536280,0.267007,-0.864830,-0.150353,0.178492,1.864446e-01,-2.077631e-03,0.0,2.983809,8.694071e-17,0.774825,0.137475,0.184398,-0.117894,1.640459,-0.538309,-0.860959
1461,1461,0.627688,0.097370,-0.838341,-0.150353,0.130393,-1.572193e-01,-3.070968e-01,0.0,-0.156478,8.694071e-17,0.774825,0.137475,0.184398,-0.117894,1.640459,-0.538309,0.336142


### Normalisation des seuils 

In [131]:
#Récupération des mean/std
data = pd.read_table("../data/welddb.data",sep = " ", header=None)
data.replace('N', None, inplace=True)
data.columns = ["C concentration (weight%)","Si concentration (weight%)", "Mn concentration (weight%)","S concentration (weight%)", "P concentration (weight%)", "Ni concentration (weight%)", "Cr concentration (weight%)", "Mo concentration (weight%)", "V concentration (weight%)", "Cu concentration (weight%)", "Co concentration (weight%)", "W concentration (weight%)", "O concentration (ppm/weight)", "Ti concentration (ppm/weight)", "N concentration (ppm/weight)", "Al concentration (ppm/weight)", "B concentration (ppm/weight)", "Nb concentration (ppm/weight)", "Sn concentration (ppm/weight)", "As concentration (ppm/weight)", "Sb concentration (ppm/weight)", "Current (A)", "Voltage (V)", "AC or DC", "Electrode positive or negative", "Heat input (kJ/mm)", "Interpass temperature (deg C)", "Type of weld", "Post weld heat treatment temperature (deg C)", "Post weld heat treatment time (hours)", "Yield strength (MPa)", "Ultimate tensile strength (MPa)", "Elongation (%)", "Reduction of Area (%)", "Charpy temperature (deg C)", "Charpy impact toughness (J)", "Hardness (kg/mm2)", "50 FATT", "Primary ferrite in microstructure (%)", "Ferrite with second phase (%)", "Acicular ferrite (%)", "Martensite(%)", "Ferrite with carbide aggreagate (%)", "Weld ID"]
data = data.replace("<","",regex=True)
data['N concentration (ppm/weight)'] = data['N concentration (ppm/weight)'].str.split("tot").str[0]
data['Hardness (kg/mm2)'] = data['Hardness (kg/mm2)'].str.split("(").str[0]
data['Hardness (kg/mm2)'] = data['Hardness (kg/mm2)'].str.split("H").str[0]
data['Interpass temperature (deg C)'] = data['Interpass temperature (deg C)'].replace('150-200','175')
for i, column in enumerate(data.columns):
    if i not in [23,24,27,43]:
        data[column] = data[column].astype(float)

data.columns = data.columns.map(str)
data = data[col_test]
means = data.mean()
stds = data.std()

### Définition des seuils  
Vu que je ne me suis pas spécialisée en ingénierie mécanique, je me suis aidée de ChatGPT pour les définir.

In [132]:
Résistance à la traction (Ultimate Tensile Strength) :

Seuil : UTS ≥ 500 MPa
Justification : Une UTS élevée signifie que la soudure peut supporter des forces importantes sans se rompre, un indicateur de robustesse.
Résistance à la limite d'élasticité (Yield Strength) :

Seuil : Yield Strength ≥ 350 MPa
Justification : Une limite d'élasticité élevée indique que la soudure peut supporter des contraintes importantes avant de commencer à se déformer de façon permanente (déformation plastique).
Allongement (Elongation) :

Seuil : Elongation ≥ 20%
Justification : Un allongement supérieur signifie une meilleure ductilité, ce qui rend la soudure moins susceptible de se fissurer sous des charges fluctuantes ou des contraintes soudaines.
Ténacité à l'impact (Charpy Impact Toughness) :

Seuil : Charpy Impact Toughness ≥ 50 J
Justification : Une ténacité élevée est nécessaire pour résister aux chocs et éviter les ruptures fragiles.
Dureté (Hardness) :

Plage : 200 ≤ Hardness ≤ 350 HV (kg/mm²)
Justification : Une dureté trop faible peut indiquer un manque de résistance à l'usure, tandis qu'une dureté trop élevée peut rendre la soudure fragile.



508.55717948717944

In [None]:
# Minimum acceptable value for each test column, in physical units; each
# threshold is stored in a one-element list.
# NOTE(review): the notebook text describes the variables of df as normalised,
# so these raw thresholds presumably still have to be rescaled before being
# compared with df — confirm.
Seuils = dict([
    ('Yield strength (MPa)', [350]),
    ('Ultimate tensile strength (MPa)', [500]),
    ('Elongation (%)', [20]),
    ('Reduction of Area (%)', [20]),
    ('Charpy impact toughness (J)', [50]),
])

In [134]:
# Column names of df, displayed again as a reminder
df.columns

Index(['C concentration (weight%)', 'Si concentration (weight%)',
       'Mn concentration (weight%)', 'S concentration (weight%)',
       'P concentration (weight%)', 'V concentration (weight%)',
       'O concentration (ppm/weight)', 'Ti concentration (ppm/weight)',
       'N concentration (ppm/weight)', 'Al concentration (ppm/weight)',
       'Current (A)', 'Voltage (V)', 'Heat input (kJ/mm)',
       'Interpass temperature (deg C)',
       'Post weld heat treatment temperature (deg C)',
       'Post weld heat treatment time (hours)', 'Yield strength (MPa)',
       'Ultimate tensile strength (MPa)', 'Elongation (%)',
       'Reduction of Area (%)', 'Charpy temperature (deg C)',
       'Charpy impact toughness (J)', 'AC', 'electrode positive',
       'Type of weld_GMAA', 'Type of weld_GTAA', 'Type of weld_MMA',
       'Type of weld_NGGMA', 'Type of weld_NGSAW', 'Type of weld_SA',
       'Type of weld_SAA', 'Type of weld_ShMA', 'Type of weld_TSA'],
      dtype='object')

In [133]:
# Display the list of test columns
# NOTE(review): the saved output lists 'Charpy temperature (deg C)', which the
# col_test definition in this notebook does not contain — the output looks
# stale; restart the kernel and re-run all cells to refresh it.
col_test

['Yield strength (MPa)',
 'Ultimate tensile strength (MPa)',
 'Elongation (%)',
 'Reduction of Area (%)',
 'Charpy temperature (deg C)',
 'Charpy impact toughness (J)']