In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import sys

# Put the parent directory on the import path so the project-local `adelle`
# package imported just below can be resolved. The path is relative, so this
# depends on the directory the kernel is started from.
sys.path.append("../")
from adelle.utils import (
    model_generator,
    evaluate,
    get_Xy,
    preprocess,
    create_pca_analysis_pipeline,
    analyze_pca_feature_importance,
    preprocess_and_convert_to_df,
    print_linear_regression_pvalues,
    print_logistic_regression_pvalues,
)

In [2]:
# Load the predictive-maintenance dataset into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_csv("../data/predictive_maintenance.csv")

In [3]:
# Display the loaded frame as the cell's output; the rendered table is
# truncated in the middle (the `...` row), so only the first and last rows show.
df

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure
...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,No Failure
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,No Failure
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,No Failure
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,No Failure


In [4]:
# Build the train/test feature and target sets with "Target" as the label.
# "UDI", "Product ID" and "Failure Type" are dropped from the features.
# NOTE(review): "Failure Type" presumably restates the label (it reads
# "No Failure" wherever Target is 0 in the preview), so dropping it would
# avoid leakage — confirm.
# NOTE(review): the split itself (ratio, shuffling, random seed) happens inside
# get_Xy and is not visible here — confirm it is seeded for reproducibility.
X_train, X_test, y_train, y_test = get_Xy(
    df, "Target", drop_cols=["UDI", "Product ID", "Failure Type"]
)

In [5]:
# Plain-text dump of the training features. The output shows the six remaining
# feature columns (Type plus five numeric sensor readings), wrapped across two
# column groups, with a non-sequential row index.
# NOTE(review): ending the cell with `X_train.head()` instead of print() would
# give the rich table rendering and a shorter output.
print(X_train)

     Type  Air temperature [K]  Process temperature [K]  \
4632    M                302.9                    311.2   
4368    L                302.0                    310.0   
5006    M                303.7                    312.8   
8179    L                299.8                    311.3   
3164    L                300.4                    309.7   
...   ...                  ...                      ...   
854     M                296.3                    307.3   
461     L                297.4                    308.7   
7983    L                301.0                    312.2   
3337    L                301.6                    310.8   
8809    L                297.3                    308.5   

      Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  
4632                    1330         53.7              200  
4368                    1664         28.6              190  
5006                    1366         47.3               49  
8179                    1560         35.4      

In [6]:
# Build the preprocessor from the training data; it is reused later by
# preprocess_and_convert_to_df.
# NOTE(review): label_cols=["Target"] is passed here while the PCA pipeline
# cell passes label_cols=None for the same X_train/y_train — confirm which of
# the two is intended, since the helper's handling of label_cols is not visible.
preprocessor = preprocess(X_train, y_train, label_cols=["Target"])

In [7]:
# Build a PCA analysis pipeline with 5 components on the training data, then
# print the top contributing features for each component.
# NOTE(review): label_cols=None here differs from the label_cols=["Target"]
# used when building `preprocessor` — confirm the two are meant to differ.
# NOTE(review): n_components=5 is a hard-coded choice; the reason for 5 is not
# recorded in the notebook.
pca_pipeline = create_pca_analysis_pipeline(
    X_train, y_train, label_cols=None, n_components=5
)
analyze_pca_feature_importance(X_train, pca_pipeline)

Top contributing features by component:
Component 1:
  ss_encode_X__Torque [Nm]: -0.996
  minmax_encode_X__Rotational speed [rpm]: 0.092
  ohe__if-Type-c-L: 0.006
  ohe__if-Type-c-M: -0.006
  minmax_encode_X__Process temperature [K]: 0.003

Component 2:
  ohe__if-Type-c-L: -0.741
  ohe__if-Type-c-M: 0.667
  ohe__if-Type-c-H: 0.074
  ss_encode_X__Torque [Nm]: -0.008
  minmax_encode_X__Tool wear [min]: -0.002

Component 3:
  ohe__if-Type-c-H: 0.813
  ohe__if-Type-c-M: -0.470
  ohe__if-Type-c-L: -0.343
  minmax_encode_X__Air temperature [K]: -0.032
  minmax_encode_X__Process temperature [K]: -0.024

Component 4:
  minmax_encode_X__Air temperature [K]: -0.767
  minmax_encode_X__Process temperature [K]: -0.632
  minmax_encode_X__Tool wear [min]: -0.104
  ohe__if-Type-c-H: -0.032
  ohe__if-Type-c-M: 0.019

Component 5:
  minmax_encode_X__Tool wear [min]: 0.995
  minmax_encode_X__Air temperature [K]: -0.080
  minmax_encode_X__Process temperature [K]: -0.066
  ohe__if-Type-c-M: 0.004
  ohe__if

In [8]:
# Apply the previously built `preprocessor` to the training data and get back
# the transformed features (as a DataFrame, per the helper's name) together
# with the processed target; both feed the p-value cells.
X_train_preprocessed_df, y_train_preprocessed = preprocess_and_convert_to_df(
    X_train, y_train, preprocessor
)

In [9]:
# Print per-feature p-values from a linear regression of the target on the
# preprocessed training features.
# NOTE(review): in the output, `const` and the three one-hot `Type` columns all
# report the exact same p-value. That pattern suggests the design matrix is
# rank-deficient (the three indicators presumably sum to the constant column),
# so those four p-values should not be interpreted individually — confirm in
# the helper, and consider dropping one Type level.
print_linear_regression_pvalues(X_train_preprocessed_df, y_train_preprocessed)


Linear Regression P-Values:
ss_encode_X__Torque [Nm]                    5.108440e-178
minmax_encode_X__Rotational speed [rpm]     2.967915e-123
minmax_encode_X__Tool wear [min]             5.382447e-24
minmax_encode_X__Air temperature [K]         6.337117e-20
minmax_encode_X__Process temperature [K]     2.897920e-11
ohe__if-Type-c-H                             4.654097e-02
ohe__if-Type-c-M                             4.654097e-02
ohe__if-Type-c-L                             4.654097e-02
const                                        4.654097e-02
dtype: float64


In [10]:
# Print per-feature p-values from a logistic regression of the target on the
# preprocessed training features.
# NOTE(review): the output reports NaN p-values for `const` and all three
# one-hot `Type` columns. This looks like the dummy-variable trap (a full set
# of indicators alongside an intercept makes the design matrix singular);
# dropping one Type level or the constant should yield finite p-values —
# confirm against the helper's implementation.
print_logistic_regression_pvalues(X_train_preprocessed_df, y_train_preprocessed)

Optimization terminated successfully.
         Current function value: 0.096335
         Iterations 9

Logistic Regression P-Values:
ss_encode_X__Torque [Nm]                    1.488773e-102
minmax_encode_X__Rotational speed [rpm]      2.432644e-83
minmax_encode_X__Tool wear [min]             1.856158e-26
minmax_encode_X__Air temperature [K]         1.361908e-19
minmax_encode_X__Process temperature [K]     3.944958e-11
const                                                 NaN
ohe__if-Type-c-H                                      NaN
ohe__if-Type-c-L                                      NaN
ohe__if-Type-c-M                                      NaN
dtype: float64
