In [69]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [71]:
# Load the step-1 training CSV (semicolon-separated) and preview its first ten rows.
data = pd.read_csv(
    'data_train_proc_step1_outlines.csv',
    sep=';',
)
data.head(n=10)

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0
5,5,M24300,M,298.4,308.9,1429,42.1,65,0,0,0,0,0,0
6,6,L56736,L,299.6,311.0,1413,42.9,156,0,0,0,0,0,0
7,7,L55488,L,298.7,310.1,1609,38.1,67,0,0,0,0,0,0
8,8,L56397,L,297.7,308.8,1578,35.2,13,0,0,0,0,0,0
9,9,L55067,L,300.5,312.3,1447,53.3,98,0,0,0,0,0,0


In [73]:
# Remove the Product ID column: it is considered uninformative for modelling.
data = data.drop('Product ID', axis='columns')

In [75]:
# Next column to process is Type: show how many rows fall into each category,
# most frequent first (the defaults, spelled out explicitly).
data['Type'].value_counts(sort=True, ascending=False)

Type
L    95304
M    32135
H     8921
Name: count, dtype: int64

In [77]:
# Replace the Type letters with integer codes using LabelEncoder.
# The codes follow the sorted class order, so here H -> 0, L -> 1, M -> 2.
# NOTE(review): the codes are alphabetical, not a ranking of the categories —
# confirm that downstream models do not rely on their numeric order.
encoder = LabelEncoder().fit(data['Type'])
encoded_type_col = encoder.transform(data['Type'])
data['Type'] = encoded_type_col
# Same counts as before encoding, now keyed by the integer codes.
data['Type'].value_counts()

Type
1    95304
2    32135
0     8921
Name: count, dtype: int64

In [79]:
# Give the five failure-mode flag columns descriptive names, keeping the
# original abbreviation in brackets (covered in more detail in sprint 2).
failure_column_names = {
    'TWF': 'Tool Wear Failure [TWF]',
    'HDF': 'Heat Dissipation Failure [HDF]',
    'PWF': 'Power Failure [PWF]',
    'OSF': 'Overstrain Failure [OSF]',
    'RNF': 'Random Failure [RNF]',
}
data = data.rename(failure_column_names, axis='columns')
data.head(n=10)

Unnamed: 0,id,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,Tool Wear Failure [TWF],Heat Dissipation Failure [HDF],Power Failure [PWF],Overstrain Failure [OSF],Random Failure [RNF]
0,0,1,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,2,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,1,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,1,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,2,298.0,309.0,1641,35.4,34,0,0,0,0,0,0
5,5,2,298.4,308.9,1429,42.1,65,0,0,0,0,0,0
6,6,1,299.6,311.0,1413,42.9,156,0,0,0,0,0,0
7,7,1,298.7,310.1,1609,38.1,67,0,0,0,0,0,0
8,8,1,297.7,308.8,1578,35.2,13,0,0,0,0,0,0
9,9,1,300.5,312.3,1447,53.3,98,0,0,0,0,0,0


In [81]:
# Additional features that were agreed on in sprint 2.
# Sum_Parametr: row-wise total of the five failure-mode indicator columns.
failure_flag_columns = [
    'Tool Wear Failure [TWF]',
    'Heat Dissipation Failure [HDF]',
    'Power Failure [PWF]',
    'Overstrain Failure [OSF]',
    'Random Failure [RNF]',
]
# skipna=False keeps missing values propagating exactly as chained `+` would.
data['Sum_Parametr'] = data[failure_flag_columns].sum(axis='columns', skipna=False)
# Ratio: rotational speed divided by torque, and its mean over the whole frame.
data['Ratio'] = data['Rotational speed [rpm]'].div(data['Torque [Nm]'])
average_ratio = data['Ratio'].mean()
# NOTE(review): the printed label reads "average deviation coefficient", but the
# value is the mean of Ratio — confirm whether the wording should be changed.
print(f"Средний коэффициент отклонения: {average_ratio:.2f}")

Средний коэффициент отклонения: 40.49


In [83]:
# One more sprint-2 feature: the signed difference between each row's Ratio
# and the mean Ratio computed above.
data['Deviation from Average'] = data['Ratio'].sub(average_ratio)

In [85]:
# Preview the first ten rows of the frame after the feature additions.
data.head(n=10)

Unnamed: 0,id,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,Tool Wear Failure [TWF],Heat Dissipation Failure [HDF],Power Failure [PWF],Overstrain Failure [OSF],Random Failure [RNF],Sum_Parametr,Ratio,Deviation from Average
0,0,1,300.6,309.6,1596,36.1,140,0,0,0,0,0,0,0,44.210526,3.721024
1,1,2,302.6,312.1,1759,29.1,200,0,0,0,0,0,0,0,60.446735,19.957233
2,2,1,299.3,308.5,1805,26.5,25,0,0,0,0,0,0,0,68.113208,27.623705
3,3,1,301.0,310.9,1524,44.3,197,0,0,0,0,0,0,0,34.401806,-6.087696
4,4,2,298.0,309.0,1641,35.4,34,0,0,0,0,0,0,0,46.355932,5.86643
5,5,2,298.4,308.9,1429,42.1,65,0,0,0,0,0,0,0,33.942993,-6.546509
6,6,1,299.6,311.0,1413,42.9,156,0,0,0,0,0,0,0,32.937063,-7.552439
7,7,1,298.7,310.1,1609,38.1,67,0,0,0,0,0,0,0,42.230971,1.741469
8,8,1,297.7,308.8,1578,35.2,13,0,0,0,0,0,0,0,44.829545,4.340043
9,9,1,300.5,312.3,1447,53.3,98,0,0,0,0,0,0,0,27.148218,-13.341284


In [87]:
# Sprint 4 showed that the Tool wear [min] feature affects the target variable:
# failures occur more often when the equipment wear is either very low (new equipment)
# or very high (worn-out equipment)

In [89]:
# Add the categorical feature Twear_cat derived from Tool wear [min]:
#   1 - low wear  (strictly below LOW_WEAR_BELOW_MIN)
#   2 - high wear (strictly above HIGH_WEAR_ABOVE_MIN)
#   0 - neither
LOW_WEAR_BELOW_MIN = 30
HIGH_WEAR_ABOVE_MIN = 170

tool_wear = data['Tool wear [min]']
data['Twear_cat'] = 0
# Assign through the boolean masks directly. Going through `data[mask].index`
# copies the filtered frame and re-selects rows by label, which would also hit
# unrelated rows if the index ever contained duplicate labels.
data.loc[tool_wear > HIGH_WEAR_ABOVE_MIN, 'Twear_cat'] = 2
data.loc[tool_wear < LOW_WEAR_BELOW_MIN, 'Twear_cat'] = 1

In [91]:
# Preview the first ten rows, now including the Twear_cat column.
data.head(n=10)

Unnamed: 0,id,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,Tool Wear Failure [TWF],Heat Dissipation Failure [HDF],Power Failure [PWF],Overstrain Failure [OSF],Random Failure [RNF],Sum_Parametr,Ratio,Deviation from Average,Twear_cat
0,0,1,300.6,309.6,1596,36.1,140,0,0,0,0,0,0,0,44.210526,3.721024,0
1,1,2,302.6,312.1,1759,29.1,200,0,0,0,0,0,0,0,60.446735,19.957233,2
2,2,1,299.3,308.5,1805,26.5,25,0,0,0,0,0,0,0,68.113208,27.623705,1
3,3,1,301.0,310.9,1524,44.3,197,0,0,0,0,0,0,0,34.401806,-6.087696,2
4,4,2,298.0,309.0,1641,35.4,34,0,0,0,0,0,0,0,46.355932,5.86643,0
5,5,2,298.4,308.9,1429,42.1,65,0,0,0,0,0,0,0,33.942993,-6.546509,0
6,6,1,299.6,311.0,1413,42.9,156,0,0,0,0,0,0,0,32.937063,-7.552439,0
7,7,1,298.7,310.1,1609,38.1,67,0,0,0,0,0,0,0,42.230971,1.741469,0
8,8,1,297.7,308.8,1578,35.2,13,0,0,0,0,0,0,0,44.829545,4.340043,1
9,9,1,300.5,312.3,1447,53.3,98,0,0,0,0,0,0,0,27.148218,-13.341284,0


In [93]:
# Persist the preprocessed frame as a semicolon-separated CSV, omitting the index.
data.to_csv(
    'data_train_proc_step2.csv',
    sep=';',
    index=False,
)