In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [2]:
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

In [3]:
df=pd.read_csv("/data/predictive_maintenance.csv")
df = df.drop(["UDI","Product ID"],axis=1)
df.sample(6).style.set_properties(
    **{
        'background-color': 'purple',
        'color': 'white',
        'border-color': 'White'
    })

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
1323,L,298.9,310.1,1361,51.7,192,0,No Failure
4302,L,301.7,310.1,2695,12.6,10,0,No Failure
2544,L,299.2,309.0,1951,21.4,102,0,No Failure
9376,H,297.7,308.4,1738,27.7,96,0,No Failure
828,L,296.5,307.3,1566,37.2,164,0,No Failure
8447,L,298.6,310.1,1439,49.9,17,0,No Failure


In [4]:
## Converting temperature in centigrate from Kelvin [1 K = -272.15 °C  ] 

df["Air temperature [K]"] = df["Air temperature [K]"] - 272.15
df["Process temperature [K]"] = df["Process temperature [K]"] - 272.15

# Renaming temperature in Centigrate(°C) from Kelvin (K)
df.rename(columns={"Air temperature [K]" : "Air temperature [°C]","Process temperature [K]" : "Process temperature [°C]"},inplace=True)

In [5]:
df["Temperature difference [°C]"] = df["Process temperature [°C]"] - df["Air temperature [°C]"]
df.sample(5)

Unnamed: 0,Type,Air temperature [°C],Process temperature [°C],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,Temperature difference [°C]
3910,L,30.25,39.25,1831,26.9,121,0,No Failure,9.0
6006,M,28.45,38.35,2156,18.0,16,0,No Failure,9.9
2765,M,27.75,37.25,1399,54.8,3,0,No Failure,9.5
1338,L,26.45,37.95,1721,27.2,5,0,No Failure,11.5
8800,L,25.25,36.35,1475,47.2,69,0,No Failure,11.1


In [6]:
display(df.shape)
display(df.size)

(10000, 9)

90000

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Type                         10000 non-null  object 
 1   Air temperature [°C]         10000 non-null  float64
 2   Process temperature [°C]     10000 non-null  float64
 3   Rotational speed [rpm]       10000 non-null  int64  
 4   Torque [Nm]                  10000 non-null  float64
 5   Tool wear [min]              10000 non-null  int64  
 6   Target                       10000 non-null  int64  
 7   Failure Type                 10000 non-null  object 
 8   Temperature difference [°C]  10000 non-null  float64
dtypes: float64(4), int64(3), object(2)
memory usage: 703.2+ KB


In [8]:
df.describe().style.background_gradient(cmap="magma")

Unnamed: 0,Air temperature [°C],Process temperature [°C],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Temperature difference [°C]
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,27.85493,37.85556,1538.7761,39.98691,107.951,0.0339,10.00063
std,2.000259,1.483734,179.284096,9.968934,63.654147,0.180981,1.001094
min,23.15,33.55,1168.0,3.8,0.0,0.0,7.6
25%,26.15,36.65,1423.0,33.2,53.0,0.0,9.3
50%,27.95,37.95,1503.0,40.1,108.0,0.0,9.8
75%,29.35,38.95,1612.0,46.8,162.0,0.0,11.0
max,32.35,41.65,2886.0,76.6,253.0,1.0,12.1


# Feature Selection

In [9]:
import category_encoders as ce
encoder = ce.OrdinalEncoder(cols=['Type','Failure Type'])
df = encoder.fit_transform(df)
df.head(2)

Unnamed: 0,Type,Air temperature [°C],Process temperature [°C],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,Temperature difference [°C]
0,1,25.95,36.45,1551,42.8,0,0,1,10.5
1,2,26.05,36.55,1408,46.3,3,0,1,10.5


In [10]:
from sklearn.preprocessing import LabelEncoder
scaler = LabelEncoder()
df['Failure Type'] = scaler.fit_transform(df['Failure Type'])

In [11]:
X = df.drop(columns="Failure Type" , axis=1)
y = df["Failure Type"]

In [12]:
df['Failure Type'].value_counts()

0    9652
5     112
1      95
3      78
2      45
4      18
Name: Failure Type, dtype: int64

In [13]:
df_copy=df.copy()

In [14]:
df_copy.drop(['Type', 'Failure Type'], axis=1, inplace=True)

In [15]:
df_copy.head()

Unnamed: 0,Air temperature [°C],Process temperature [°C],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Temperature difference [°C]
0,25.95,36.45,1551,42.8,0,0,10.5
1,26.05,36.55,1408,46.3,3,0,10.5
2,25.95,36.35,1498,49.4,5,0,10.4
3,26.05,36.45,1433,39.5,7,0,10.4
4,26.05,36.55,1408,40.0,9,0,10.5


In [16]:
df_copy.Target.value_counts()

0    9661
1     339
Name: Target, dtype: int64

In [17]:
df_copy.to_csv('/data/binaryclassification_modelinput.csv')