In [1]:
import datetime
import subprocess
import numpy as np
import pandas as pd
import seaborn as sns 
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import feature_column
from sklearn.model_selection import train_test_split

In [2]:
# IF DATA IS IN YOUR DRIVE
data = pd.read_excel('BLACK_BELT_DATABASE_CASE_COMPLETE_TESTE.xlsx', header=0)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6264 entries, 0 to 6263
Data columns (total 35 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   PART_NUMBER           6264 non-null   object        
 1   REV                   6264 non-null   object        
 2   DESCRIPTION           6264 non-null   object        
 3   CONFIGURATION         6264 non-null   object        
 4   RELEASED_DATE         6264 non-null   datetime64[ns]
 5   OBJECT_ID_3D          6262 non-null   float64       
 6   FILE_NAME_3D          0 non-null      float64       
 7   CLASS_3D              6264 non-null   object        
 8   DRAWING_CODE_3D       6259 non-null   object        
 9   ATP_3D                6262 non-null   object        
 10  QTN_REV_3D            6264 non-null   int64         
 11  MEAN_SIZE_3D          1697 non-null   object        
 12  OBJECT_ID_2D          6264 non-null   int64         
 13  FILE_NAME_2D      

In [3]:
# Função para pré-processar os dados
def preProcessingDataBase(data):

    to_drop = ['NEW_DEV',
               'HOV',
               'RTF',
               'COMPLEXITY',
               'CREATED_ON',
               'COMPLETED_ON',
               'CLASS_2D',
               'PART_NUMBER',
               'REV',
               'DESCRIPTION',
               'CONFIGURATION',
               'RELEASED_DATE',
               'OBJECT_ID_3D',
               'FILE_NAME_3D',
               'CLASS_3D',
               'DRAWING_CODE_3D',
               'ATP_3D',
               'OBJECT_ID_2D',
               'FILE_NAME_2D',
               'MEAN_SIZE_3D',
               'MEAN_SIZE_2D', 
               'QTY_SHEETS',
               'QTY_DIMENSIONS',
               'QTY_VIEWS',
               'QTY_PART_LIST',
               'QTY_TEXT_INFORMATION',
               'TRIM_AND_FINISH',
               'NEW_DESIGN']

    data.drop(to_drop, inplace=True, axis=1)

    # QTN_REV_3D
    data = data[~data['QTN_REV_3D'].isnull()]
    data['QTN_REV_3D'] = data['QTN_REV_3D'].dropna()
    data['QTN_REV_3D'] = (data['QTN_REV_3D']-data['QTN_REV_3D'].min())/(data['QTN_REV_3D'].max()-data['QTN_REV_3D'].min())

    # WORKFLOW
    data = data[~data['WORKFLOW'].isnull()]
    data['WORKFLOW'] = data['WORKFLOW'].dropna()

    # DRAWING_CODE
    data = data[~data['DRAWING_CODE'].isnull()]
    data['DRAWING_CODE'] = data['DRAWING_CODE'].dropna()
    #data = data.join(pd.get_dummies(data.pop('DRAWING_CODE')))

    # ATP
    data = data[~data['ATP'].isnull()]
    data['ATP'] = data['ATP'].dropna()
    #data = data.join(pd.get_dummies(data.pop('ATP')))

    # QTN_REV_2D
    data = data[~data['QTN_REV_2D'].isnull()]
    data['ATP'] = data['QTN_REV_2D'].dropna()
    data['QTN_REV_2D'] = (data['QTN_REV_2D']-data['QTN_REV_2D'].min())/(data['QTN_REV_2D'].max()-data['QTN_REV_2D'].min())

    # QTY_ECN_2D
    data = data[~data['QTY_ECN_2D'].isnull()]
    data['QTY_ECN_2D'] = data['QTY_ECN_2D'].dropna()
    data.loc[(data.QTY_ECN_2D != 0), 'QTY_ECN_2D'] = "RUIM"
    data.loc[(data.QTY_ECN_2D == 0), 'QTY_ECN_2D'] = "BOM"
    data['QTY_ECN_2D'] = pd.Series(np.searchsorted(['BOM', 'RUIM'], data.QTY_ECN_2D.values), data.index)
    #data['QTY_ECN_2D'] = (data['QTY_ECN_2D']-data['QTY_ECN_2D'].min())/(data['QTY_ECN_2D'].max()-data['QTY_ECN_2D'].min())

    # LEAD_TIME
    data = data[~data['LEAD_TIME'].isnull()]
    data['LEAD_TIME'] = data['LEAD_TIME'].dropna()
    data['LEAD_TIME'] = data['LEAD_TIME'][data['LEAD_TIME'] > 0]
    data['LEAD_TIME'] = data['LEAD_TIME'].astype('int')
    data['LEAD_TIME'] = (data['LEAD_TIME']-data['LEAD_TIME'].min())/(data['LEAD_TIME'].max()-data['LEAD_TIME'].min())

    # DROP ANY ROW NULL
    #data = data.dropna()

    return data

In [4]:
data = preProcessingDataBase(data)
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5793 entries, 0 to 6136
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   QTN_REV_3D    5793 non-null   float64
 1   DRAWING_CODE  5793 non-null   object 
 2   ATP           5793 non-null   int64  
 3   QTN_REV_2D    5793 non-null   float64
 4   QTY_ECN_2D    5793 non-null   int64  
 5   WORKFLOW      5793 non-null   object 
 6   LEAD_TIME     5793 non-null   float64
dtypes: float64(3), int64(2), object(2)
memory usage: 362.1+ KB


Unnamed: 0,QTN_REV_3D,DRAWING_CODE,ATP,QTN_REV_2D,QTY_ECN_2D,WORKFLOW,LEAD_TIME
0,0.111111,PL,1,0.0,0,SJ_Process-163227,0.082192
1,0.111111,DA,1,0.0,0,SJ_Process-163703,0.065332
2,0.111111,NM,1,0.0,0,SJ_Process-163703,0.065332
3,0.222222,IN,2,0.125,1,GG_Process-029086,0.062171
4,0.111111,TA,1,0.0,0,SJ_Process-163420,0.076923


In [5]:
# Post pré-processing
data, validation_data = train_test_split(data, test_size=0.3)
validation_data, test_data = train_test_split(validation_data, test_size=0.5)

In [None]:
## Contar saídas
data["QTY_ECN_2D"].value_counts()

In [None]:
## Balancear saídas
data = data.groupby('QTY_ECN_2D').sample(n=5600)

In [None]:
## Visualizar
sample_data = data.sample(frac=0.3)
sns.pairplot(sample_data, hue="QTY_ECN_2D", palette="tab10")

In [6]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('QTY_ECN_2D')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [None]:
to_feature = ['QTN_REV_3D', 
              'MEAN_SIZE_3D', 
              'QTN_REV_2D', 
              'MEAN_SIZE_2D', 
              'QTY_SHEETS', 
              'QTY_DIMENSIONS', 
              'QTY_VIEWS', 
              'QTY_PART_LIST', 
              'QTY_TEXT_INFORMATION', 
              'LEAD_TIME', 
              'TRIM_AND_FINISH', 
              'LEAD_TO_RELEASE']

In [7]:
feature_columns = []

to_feature = ['QTN_REV_3D', 
              'QTN_REV_2D', 
              'LEAD_TIME']

for header in to_feature:
  feature_columns.append(feature_column.numeric_column(header))


feature_columns.append(feature_column.embedding_column(feature_column.categorical_column_with_vocabulary_list('ATP', data.ATP.unique()), dimension=8))
feature_columns.append(feature_column.embedding_column(feature_column.categorical_column_with_vocabulary_list('WORKFLOW', data.DRAWING_CODE.unique()), dimension=8))
feature_columns.append(feature_column.embedding_column(feature_column.categorical_column_with_vocabulary_list('DRAWING_CODE', data.DRAWING_CODE.unique()), dimension=8))

In [8]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [9]:
batch_size = 8
train_ds = df_to_dataset(data, shuffle=True, batch_size=batch_size)
validation_ds = df_to_dataset(validation_data, shuffle=True, batch_size=batch_size)
test_ds = df_to_dataset(test_data, shuffle=False, batch_size=batch_size)

2022-10-07 11:43:54.364040: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: system has unsupported display driver / cuda driver combination
2022-10-07 11:43:54.364074: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: matheus-notebook
2022-10-07 11:43:54.364082: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: matheus-notebook
2022-10-07 11:43:54.364236: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 510.85.2
2022-10-07 11:43:54.364268: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 515.65.1
2022-10-07 11:43:54.364278: E tensorflow/stream_executor/cuda/cuda_diagnostics.cc:313] kernel version 515.65.1 does not match DSO version 510.85.2 -- cannot find working devices in this configuration
2022-10-07 11:43:54.365624: I tensorflow/core/platform/cpu_feature_guard.cc:193]

In [13]:
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(16, activation='relu'),
  layers.Dropout(.5),
  layers.Dense(8, activation='relu'),
  layers.Dropout(.25),
  layers.Dense(1, activation='relu'),
  layers.Dropout(0.125),
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])


log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

subprocess.run("tensorboard --logdir /home/matheus/Devtools/safran-black-belt/logs/fit")

model.fit(train_ds,
          validation_data=train_ds,
          epochs=10000, 
          callbacks=[tensorboard_callback])



Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
Epoch 65/10000
Epoch 66/10000
Epoch 67/10000
Epoc

KeyboardInterrupt: 

In [24]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.9597238302230835
