In [3]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-macosx_10_15_x86_64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-macosx_10_15_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.5.0


In [7]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.3


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter

In [11]:
data = pd.read_csv('data_train_proc_FINAL.csv', sep=';') 

X = data.drop('Machine failure', axis=1)  
y = data['Machine failure']

print("До SMOTE:", Counter(y))

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("После SMOTE:", Counter(y_resampled))

X_resampled_df = pd.DataFrame(X_resampled)
y_resampled_df = pd.DataFrame(y_resampled, columns=['Machine failure'])

balanced_data = pd.concat([X_resampled_df, y_resampled_df], axis=1)
balanced_data.to_csv("balanced_dataset.csv", index=False)

До SMOTE: Counter({0: 134238, 1: 2122})
После SMOTE: Counter({0: 134238, 1: 134238})


In [13]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.width', 1900)

In [15]:
data = pd.read_csv('balanced_dataset.csv') 
print(data.head())

   Sum_Parameter  Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Air temperature [K]  Process temperature [K]  Machine failure
0              0                    1596         36.1              140                300.6                    309.6                0
1              0                    1759         29.1              200                302.6                    312.1                0
2              0                    1805         26.5               25                299.3                    308.5                0
3              0                    1524         44.3              197                301.0                    310.9                0
4              0                    1641         35.4               34                298.0                    309.0                0


In [17]:
X = data.drop('Machine failure', axis=1)  
y = data['Machine failure']

X.columns = X.columns.astype(str)  
X.columns = X.columns.str.replace(r'[\[\]<]', '', regex=True)
X.columns = X.columns.str.replace(' ', '_')

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [29]:
# Проверим сразу несколько, чтобы удобно было сравнить
# Список моделей
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "SVM": SVC(random_state=42)
}

In [31]:
# Сравнение моделей
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)  
    results.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": report['weighted avg']['precision'],
        "Recall": report['weighted avg']['recall'],
        "F1-Score": report['weighted avg']['f1-score']
    })

[LightGBM] [Info] Number of positive: 107390, number of negative: 107390
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1279
[LightGBM] [Info] Number of data points in the train set: 214780, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [33]:
# Добавление ансамбля моделей
model1 = LogisticRegression(random_state=42)
model2 = RandomForestClassifier(random_state=42)
model3 = XGBClassifier(eval_metric='logloss', random_state=42)

ensemble = VotingClassifier(estimators=[
    ('lr', model1), ('rf', model2), ('xgb', model3)], voting='soft')
ensemble.fit(X_train, y_train)

# Прогнозирование ансамбля
y_pred_ensemble = ensemble.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
ensemble_report = classification_report(y_test, y_pred_ensemble, output_dict=True)
results.append({
    "Model": "Voting Ensemble",
    "Accuracy": ensemble_accuracy,
    "Precision": ensemble_report['weighted avg']['precision'],
    "Recall": ensemble_report['weighted avg']['recall'],
    "F1-Score": ensemble_report['weighted avg']['f1-score']
})

In [35]:
bagging_model = BaggingClassifier(estimator=RandomForestClassifier(random_state=42), n_estimators=50, random_state=42)
bagging_model.fit(X_train, y_train)
y_pred_bagging = bagging_model.predict(X_test)
bagging_accuracy = accuracy_score(y_test, y_pred_bagging)
bagging_report = classification_report(y_test, y_pred_bagging, output_dict=True)
results.append({
    "Model": "Bagging",
    "Accuracy": bagging_accuracy,
    "Precision": bagging_report['weighted avg']['precision'],
    "Recall": bagging_report['weighted avg']['recall'],
    "F1-Score": bagging_report['weighted avg']['f1-score']
})

In [37]:
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_report = classification_report(y_test, y_pred_gb, output_dict=True)
results.append({
    "Model": "Gradient Boosting",
    "Accuracy": gb_accuracy,
    "Precision": gb_report['weighted avg']['precision'],
    "Recall": gb_report['weighted avg']['recall'],
    "F1-Score": gb_report['weighted avg']['f1-score']
})

In [39]:
stacking_model = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(random_state=42)),
        ('xgb', XGBClassifier(eval_metric='logloss', random_state=42)),
        ('lgbm', LGBMClassifier(random_state=42))
    ],
    final_estimator=LogisticRegression(random_state=42)
)
stacking_model.fit(X_train, y_train)
y_pred_stacking = stacking_model.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_pred_stacking)
stacking_report = classification_report(y_test, y_pred_stacking, output_dict=True)
results.append({
    "Model": "Stacking",
    "Accuracy": stacking_accuracy,
    "Precision": stacking_report['weighted avg']['precision'],
    "Recall": stacking_report['weighted avg']['recall'],
    "F1-Score": stacking_report['weighted avg']['f1-score']
})

[LightGBM] [Info] Number of positive: 107390, number of negative: 107390
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1279
[LightGBM] [Info] Number of data points in the train set: 214780, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 85912, number of negative: 85912
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000717 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1279
[LightGBM] [Info] Number of data points in the train set: 171824, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0

In [41]:
adaboost_model = AdaBoostClassifier(estimator=RandomForestClassifier(random_state=42), n_estimators=50, random_state=42)
adaboost_model.fit(X_train, y_train)
y_pred_adaboost = adaboost_model.predict(X_test)
adaboost_accuracy = accuracy_score(y_test, y_pred_adaboost)
adaboost_report = classification_report(y_test, y_pred_adaboost, output_dict=True)
results.append({
    "Model": "AdaBoost",
    "Accuracy": adaboost_accuracy,
    "Precision": adaboost_report['weighted avg']['precision'],
    "Recall": adaboost_report['weighted avg']['recall'],
    "F1-Score": adaboost_report['weighted avg']['f1-score']
})



In [43]:
results_df = pd.DataFrame(results)
print(results_df)

                 Model  Accuracy  Precision    Recall  F1-Score
0        Random Forest  0.987224   0.987229  0.987224  0.987224
1              XGBoost  0.978732   0.978909  0.978732  0.978730
2             LightGBM  0.981060   0.981204  0.981060  0.981059
3  Logistic Regression  0.856172   0.865172  0.856172  0.855280
4                  SVM  0.892338   0.897123  0.892338  0.892013
5      Voting Ensemble  0.979105   0.979279  0.979105  0.979103
6              Bagging  0.983779   0.983781  0.983779  0.983779
7    Gradient Boosting  0.912917   0.914811  0.912917  0.912818
8             Stacking  0.990986   0.990987  0.990986  0.990986
9             AdaBoost  0.988007   0.988010  0.988007  0.988007


In [45]:
# Так же наилучший результат показал Stacking