In [1]:
import pandas as pd

In [2]:
data_df = pd.read_csv('ai4i2020.csv')
data_df.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


## Prepare data file

Create data file for training and evaluation

In [3]:
data_columns = ['Machine failure',
                'Air temperature [K]', 
                'Process temperature [K]', 
                'Rotational speed [rpm]', 
                'Torque [Nm]', 
                'Tool wear [min]']

rename_columns = {'Machine failure': 'y',
                  'Air temperature [K]': 'air_temperature',
                  'Process temperature [K]': 'process_temperature',
                  'Rotational speed [rpm]': 'rotational_speed',
                  'Torque [Nm]': 'torque',
                  'Tool wear [min]': 'tool_wear',
                  'H': 'high',
                  'L': 'low',
                  'M': 'medium'}


 
feature_df = pd.concat([data_df[data_columns], pd.get_dummies(data_df['Type'])], axis=1)
feature_df.rename(columns=rename_columns, inplace=True)
feature_df.tail()

Unnamed: 0,y,air_temperature,process_temperature,rotational_speed,torque,tool_wear,high,low,medium
9995,0,298.8,308.4,1604,29.5,14,0,0,1
9996,0,298.9,308.4,1632,31.8,17,1,0,0
9997,0,299.0,308.6,1645,33.4,22,0,0,1
9998,0,299.0,308.7,1408,48.5,25,1,0,0
9999,0,299.0,308.7,1500,40.2,30,0,0,1


## Upsample

In [5]:
!pip install -U imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.8.0-py3-none-any.whl (206 kB)
[K     |████████████████████████████████| 206 kB 15.8 MB/s eta 0:00:01
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.8.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/amazonei_mxnet_p36/bin/python -m pip install --upgrade pip' command.[0m


In [6]:
from imblearn.over_sampling import SMOTE

In [7]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(feature_df.iloc[:,1:], feature_df['y'])

In [8]:
from collections import Counter
print('Original dataset shape %s' % Counter(feature_df['y']))
print('Resampled dataset shape %s' % Counter(y_res))

Original dataset shape Counter({0: 9661, 1: 339})
Resampled dataset shape Counter({0: 9661, 1: 9661})


In [9]:
feature_res_df = pd.concat([y_res, X_res], axis=1)
feature_res_df.tail()

Unnamed: 0,y,air_temperature,process_temperature,rotational_speed,torque,tool_wear,high,low,medium
19317,1,300.499654,309.9,1396,45.901385,210,1,0,0
19318,1,298.416432,308.583621,2677,10.704919,85,0,0,1
19319,1,299.552915,311.008801,1717,27.882287,221,0,0,0
19320,1,301.258978,310.516465,1688,30.454791,226,0,0,1
19321,1,302.020253,310.550316,1369,46.797472,71,0,0,1


## Min/Max Scalar

In [10]:
from sklearn.preprocessing import MinMaxScaler

In [11]:
scalar_columns = [
               'air_temperature',
               'process_temperature',
               'rotational_speed',
               'torque',
               'tool_wear']

scalar = MinMaxScaler()
scalar.fit(feature_res_df[scalar_columns])

for scale_column, scale_min, scale_max in zip(scalar_columns, scalar.data_min_, scalar.data_max_):
    print(f'{scale_column} - min: {scale_min} -- max: {scale_max}')

air_temperature - min: 295.3 -- max: 304.5
process_temperature - min: 305.7 -- max: 313.8
rotational_speed - min: 1168.0 -- max: 2886.0
torque - min: 3.8 -- max: 76.6
tool_wear - min: 0.0 -- max: 253.0


In [12]:
scaled_df = pd.DataFrame(scalar.transform(feature_res_df[scalar_columns]), columns=scalar_columns)
for column in scalar_columns:
    feature_res_df[column] = scaled_df[column]

feature_res_df.describe()

Unnamed: 0,y,air_temperature,process_temperature,rotational_speed,torque,tool_wear,high,low,medium
count,19322.0,19322.0,19322.0,19322.0,19322.0,19322.0,19322.0,19322.0,19322.0
mean,0.5,0.560535,0.550039,0.202576,0.564789,0.493991,0.053928,0.552634,0.188593
std,0.500013,0.212056,0.163763,0.170538,0.193109,0.275642,0.225882,0.497235,0.391195
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.388189,0.432099,0.110012,0.442244,0.256917,0.0,0.0,0.0
50%,0.5,0.576087,0.567901,0.150175,0.587912,0.501976,0.0,1.0,0.0
75%,1.0,0.75,0.660174,0.229919,0.700907,0.762846,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Upload training data
Now that we've created our dataset, we'll need to upload it to S3, so that Amazon SageMaker training can use it.

In [15]:
import boto3
import sagemaker
import s3fs


role = sagemaker.get_execution_role()
sess = sagemaker.Session()

bucket = sess.default_bucket()
prefix = "marcus-machine-failure"


s3 = s3fs.S3FileSystem(anon=False)

key = "ai4i2020_prep.csv"
with s3.open(f'{bucket}/{prefix}/{key}','w') as f:
    feature_res_df.to_csv(f, index=False)