In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter


In [18]:
data = pd.read_csv('processed_train.csv', sep=';') 

X = data.drop('Machine failure', axis=1)  
y = data['Machine failure']

print("До SMOTE:", Counter(y))

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("После SMOTE:", Counter(y_resampled))

X_resampled_df = pd.DataFrame(X_resampled)
y_resampled_df = pd.DataFrame(y_resampled, columns=['Machine failure'])

balanced_data = pd.concat([X_resampled_df, y_resampled_df], axis=1)
balanced_data.to_csv("balanced_dataset.csv", index=False)


До SMOTE: Counter({0: 134281, 1: 2148})
После SMOTE: Counter({0: 134281, 1: 134281})


In [19]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.width', 1900)        

In [20]:
data = pd.read_csv('balanced_dataset.csv') 

print(data.head())

   id  Air temperature [K]  Process temperature [K]  Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Tool Wear Failure [TWF]  Heat Dissipation Failure [HDF]  Power Failure [PWF]  Overstrain Failure [OSF]  Random Failure [RNF]  Type_H  Type_L  Type_M  Machine failure
0   0                300.6                    309.6                    1596         36.1              140                        0                               0                    0                         0                     0   False    True   False                0
1   1                302.6                    312.1                    1759         29.1              200                        0                               0                    0                         0                     0   False   False    True                0
2   2                299.3                    308.5                    1802         26.5               25                        0                               0                   

In [21]:
X = data.drop('Machine failure', axis=1)  
y = data['Machine failure']

X.columns = X.columns.astype(str)  
X.columns = X.columns.str.replace(r'[\[\]<]', '', regex=True)
X.columns = X.columns.str.replace(' ', '_')


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
#print(data.head())

### Проверим сразу несколько, чтобы удобно было сравнить


In [24]:
# Список моделей
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "SVM": SVC(random_state=42)
}

#### Сравнение моделей

In [25]:
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)  
    results.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": report['weighted avg']['precision'],
        "Recall": report['weighted avg']['recall'],
        "F1-Score": report['weighted avg']['f1-score']
    })

[LightGBM] [Info] Number of positive: 107425, number of negative: 107424
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004683 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1552
[LightGBM] [Info] Number of data points in the train set: 214849, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initscore=0.000009
[LightGBM] [Info] Start training from score 0.000009


#### Добавление ансамбля моделей

In [26]:
model1 = LogisticRegression(random_state=42)
model2 = RandomForestClassifier(random_state=42)
model3 = XGBClassifier(eval_metric='logloss', random_state=42)

ensemble = VotingClassifier(estimators=[
    ('lr', model1), ('rf', model2), ('xgb', model3)], voting='soft')
ensemble.fit(X_train, y_train)

# Прогнозирование ансамбля
y_pred_ensemble = ensemble.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
ensemble_report = classification_report(y_test, y_pred_ensemble, output_dict=True)
results.append({
    "Model": "Voting Ensemble",
    "Accuracy": ensemble_accuracy,
    "Precision": ensemble_report['weighted avg']['precision'],
    "Recall": ensemble_report['weighted avg']['recall'],
    "F1-Score": ensemble_report['weighted avg']['f1-score']
})

In [27]:
bagging_model = BaggingClassifier(estimator=RandomForestClassifier(random_state=42), n_estimators=50, random_state=42)
bagging_model.fit(X_train, y_train)
y_pred_bagging = bagging_model.predict(X_test)
bagging_accuracy = accuracy_score(y_test, y_pred_bagging)
bagging_report = classification_report(y_test, y_pred_bagging, output_dict=True)
results.append({
    "Model": "Bagging",
    "Accuracy": bagging_accuracy,
    "Precision": bagging_report['weighted avg']['precision'],
    "Recall": bagging_report['weighted avg']['recall'],
    "F1-Score": bagging_report['weighted avg']['f1-score']
})

In [28]:
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_report = classification_report(y_test, y_pred_gb, output_dict=True)
results.append({
    "Model": "Gradient Boosting",
    "Accuracy": gb_accuracy,
    "Precision": gb_report['weighted avg']['precision'],
    "Recall": gb_report['weighted avg']['recall'],
    "F1-Score": gb_report['weighted avg']['f1-score']
})

In [29]:
stacking_model = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(random_state=42)),
        ('xgb', XGBClassifier(eval_metric='logloss', random_state=42)),
        ('lgbm', LGBMClassifier(random_state=42))
    ],
    final_estimator=LogisticRegression(random_state=42)
)
stacking_model.fit(X_train, y_train)
y_pred_stacking = stacking_model.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_pred_stacking)
stacking_report = classification_report(y_test, y_pred_stacking, output_dict=True)
results.append({
    "Model": "Stacking",
    "Accuracy": stacking_accuracy,
    "Precision": stacking_report['weighted avg']['precision'],
    "Recall": stacking_report['weighted avg']['recall'],
    "F1-Score": stacking_report['weighted avg']['f1-score']
})

[LightGBM] [Info] Number of positive: 107425, number of negative: 107424
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004672 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1552
[LightGBM] [Info] Number of data points in the train set: 214849, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500002 -> initscore=0.000009
[LightGBM] [Info] Start training from score 0.000009
[LightGBM] [Info] Number of positive: 85940, number of negative: 85939
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1551
[LightGBM] [Info] Number of data points in the train set: 171879, number of used features: 14
[LightGBM] [Info

In [30]:
adaboost_model = AdaBoostClassifier(estimator=RandomForestClassifier(random_state=42), n_estimators=50, random_state=42)
adaboost_model.fit(X_train, y_train)
y_pred_adaboost = adaboost_model.predict(X_test)
adaboost_accuracy = accuracy_score(y_test, y_pred_adaboost)
adaboost_report = classification_report(y_test, y_pred_adaboost, output_dict=True)
results.append({
    "Model": "AdaBoost",
    "Accuracy": adaboost_accuracy,
    "Precision": adaboost_report['weighted avg']['precision'],
    "Recall": adaboost_report['weighted avg']['recall'],
    "F1-Score": adaboost_report['weighted avg']['f1-score']
})



In [31]:
results_df = pd.DataFrame(results)
print(results_df)

                 Model  Accuracy  Precision    Recall  F1-Score
0        Random Forest  0.970622   0.970630  0.970622  0.970621
1              XGBoost  0.973712   0.973836  0.973712  0.973710
2             LightGBM  0.978199   0.978303  0.978199  0.978198
3  Logistic Regression  0.881388   0.882379  0.881388  0.881311
4                  SVM  0.900061   0.901690  0.900061  0.899960
5      Voting Ensemble  0.965800   0.965893  0.965800  0.965798
6              Bagging  0.963696   0.963710  0.963696  0.963696
7    Gradient Boosting  0.908774   0.909335  0.908774  0.908743
8             Stacking  0.982388   0.982389  0.982388  0.982388
9             AdaBoost  0.970659   0.970669  0.970659  0.970659


### 1. **Лучшие результаты**  
- **Stacking** показал **наилучший результат** по всем метрикам (Accuracy = 0.982388, Precision = 0.982389, Recall = 0.982388, F1-Score = 0.982388).  
  Ансамблирование методом Stacking эффективно использует сильные стороны отдельных моделей.  

### 2. **Высокая производительность**  
Следующие модели продемонстрировали близкие к оптимальным результаты:  
- **LightGBM** (Accuracy = 0.978199, F1-Score = 0.978198)  
- **XGBoost** (Accuracy = 0.973712, F1-Score = 0.973710)  
- **Random Forest** (Accuracy = 0.970622, F1-Score = 0.970621)  
- **AdaBoost** (Accuracy = 0.970659, F1-Score = 0.970659)

**Вывод**: Ансамблирование методом бустинга (XGBoost, LightGBM, AdaBoost) и случайные леса (Random Forest) дают стабильные и высокие результаты.