## Data Preparation for Machine Learning

In [1]:
import numpy as np # numpy is THE toolbox for scientific computing with python
import pandas as pd # pandas provides THE data structure and data analysis tools for data scientists 

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels on every figure drawn below.
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12

# seaborn plotting 
import seaborn as sns

In [2]:
# Read the raw ai4i2020 CSV; the path is relative to the notebook's working directory.
RAW_CSV_PATH = "../kaggle-data-sets/ai4i2020.csv"
data = pd.read_csv(RAW_CSV_PATH)

### Feature Engineering

In [3]:
# Engineered feature 1: mechanical power = angular speed * torque.
# The 2*pi/60 factor converts the [rpm] column to rad/s, so with torque in [Nm]
# the resulting 'Power' column is in watts.
speed_rpm = data['Rotational speed [rpm]']
torque_nm = data['Torque [Nm]']
data['Power'] = 2 * np.pi * speed_rpm * torque_nm / 60

# Engineered feature 2: process temperature minus air temperature, in kelvin.
process_temp_k = data['Process temperature [K]']
air_temp_k = data['Air temperature [K]']
data['temp_diff'] = process_temp_k - air_temp_k

In [5]:
# List the column labels; the engineered 'Power' and 'temp_diff' columns appear at the end.
data.columns

Index(['UDI', 'Product ID', 'Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF',
       'Power', 'temp_diff'],
      dtype='object')

In [6]:
# Drop columns that are not used as model inputs:
#   * 'UDI' and 'Product ID' are high-cardinality identifier columns.
#   * 'TWF', 'HDF', 'PWF', 'OSF', 'RNF' -- presumably the individual failure-mode
#     flags behind 'Machine failure'; keeping them would leak the target (TODO confirm).
# NOTE(review): 'Process temperature [K]' is NOT dropped here, although together with
# 'Air temperature [K]' and 'temp_diff' (= process - air) it is exactly collinear.
# Remove one of the three if the downstream model is sensitive to multi-collinearity.
data = data.drop(columns=['UDI', 'Product ID', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'])

In [7]:
# Preview the first four rows to confirm which columns remain after the drop.
data.head(4)

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,Power,temp_diff
0,M,298.1,308.6,1551,42.8,0,0,6951.59056,10.5
1,L,298.2,308.7,1408,46.3,3,0,6826.722724,10.5
2,L,298.1,308.5,1498,49.4,5,0,7749.387543,10.4
3,L,298.2,308.6,1433,39.5,7,0,5927.504659,10.4


In [8]:
# One-hot encode the categorical 'Type' column into Type_H / Type_L / Type_M.
# The column is named explicitly so that any other string-typed column that might
# reach this cell is not silently expanded into dummy columns as well.
# NOTE(review): all three indicator columns are kept (no drop_first), so they always
# sum to 1; consider drop_first=True if the downstream model is a linear one.
data = pd.get_dummies(data, columns=['Type'])
data.head(4)

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,Power,temp_diff,Type_H,Type_L,Type_M
0,298.1,308.6,1551,42.8,0,0,6951.59056,10.5,False,False,True
1,298.2,308.7,1408,46.3,3,0,6826.722724,10.5,False,True,False
2,298.1,308.5,1498,49.4,5,0,7749.387543,10.4,False,True,False
3,298.2,308.6,1433,39.5,7,0,5927.504659,10.4,False,True,False


In [9]:
# Cast the one-hot indicator columns from booleans to 0/1 integers.
type_dummy_cols = ['Type_H', 'Type_L', 'Type_M']
data[type_dummy_cols] = data[type_dummy_cols].astype('int')
data.head(4)

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,Power,temp_diff,Type_H,Type_L,Type_M
0,298.1,308.6,1551,42.8,0,0,6951.59056,10.5,0,0,1
1,298.2,308.7,1408,46.3,3,0,6826.722724,10.5,0,1,0
2,298.1,308.5,1498,49.4,5,0,7749.387543,10.4,0,1,0
3,298.2,308.6,1433,39.5,7,0,5927.504659,10.4,0,1,0


In [11]:
# Persist the prepared frame next to the raw file. The header row is written by
# default; the row index is omitted so it is not saved as an extra unnamed column.
PREPARED_CSV_PATH = "../kaggle-data-sets/ai4i2020_prepared.csv"
data.to_csv(PREPARED_CSV_PATH, index=False)