In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split as tts

# Load the requested SMART columns for one drive model from a folder of daily CSV files.
def data_selection(filespath, features, model):
    """Concatenate the rows of one drive model from every CSV in a folder.

    Parameters
    ----------
    filespath : str
        Folder that directly contains the ``*.csv`` files.
    features : iterable of int
        SMART attribute numbers; each one selects the column
        ``smart_<n>_raw``.
    model : str
        Only rows whose ``model`` column equals this value are kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``serial_number``, ``failure`` followed by the
        requested ``smart_<n>_raw`` columns, with a fresh 0..n-1 index.
        Rows follow the sorted file-name order; no explicit sort on the
        ``date`` column is performed.

    Raises
    ------
    ValueError
        If the folder contains no ``*.csv`` file.
    KeyError
        If a file lacks one of the required columns.
    """
    columns = ["date", 'serial_number', "failure"] + [
        "smart_{}_raw".format(f) for f in features
    ]

    # Sorted file names define the row order of the result.
    all_files = sorted(glob.glob(filespath + "/*.csv"))
    if not all_files:
        # pd.concat([]) would fail with the much less helpful
        # "No objects to concatenate".
        raise ValueError("No CSV files found in {!r}".format(filespath))

    # Only parse the columns that are actually used; a callable is used so
    # that a missing column still surfaces as a KeyError on selection below.
    wanted = set(columns) | {"model"}

    li = []
    for filename in all_files:
        # The first line of every file is its header.
        df = pd.read_csv(filename, index_col=None, header=0,
                         usecols=lambda name: name in wanted)
        li.append(df.loc[df["model"] == model, columns])

    # ignore_index already yields a clean 0..n-1 index.
    return pd.concat(li, axis=0, ignore_index=True)

In [14]:
# Label the run-up to every failure and balance the two classes.
def data_clean(n_day, frame, random_state=None):
    """Relabel pre-failure rows as failures and return a class-balanced sample.

    For every drive (``serial_number``) that has at least one row with
    ``failure == 1``, the first ``n_day + 1`` rows of that drive *in the row
    order of* ``frame`` are labelled ``failure = 1``.  Pass the frame
    newest-first so that these rows are the failure day and the ``n_day``
    records before it.

    Parameters
    ----------
    n_day : int
        Number of additional rows per failed drive to relabel (expected >= 0).
    frame : pandas.DataFrame
        Must contain ``serial_number``, ``failure`` and ``date`` columns.
        It is not modified.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the healthy sample can be made
        reproducible.  ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        All rows labelled as failure plus an equally sized random sample of
        healthy rows (or every healthy row when there are fewer of them),
        sorted by ``date`` with a fresh 0..n-1 index.
    """
    # Drives that fail at some point in the frame.
    failed_serials = (
        frame.loc[frame['failure'] == 1, 'serial_number'].dropna().unique()
    )
    on_failed_drive = frame['serial_number'].isin(failed_serials)

    # 0-based position of each row among the rows of its own drive, counted
    # in frame order.  This replaces a full Python scan of the frame for
    # every failed drive.
    position_in_drive = frame.groupby('serial_number').cumcount()
    relabel = (on_failed_drive & (position_in_drive <= n_day)).to_numpy()

    # Work on a copy so the caller's frame keeps its original labels.
    labeled = frame.copy()
    labeled.loc[relabel, 'failure'] = 1

    fail = labeled[labeled['failure'] == 1]
    healthy = labeled[labeled['failure'] == 0]

    # Draw as many healthy rows as there are failure rows; cap at the number
    # available instead of raising when healthy rows are scarce.
    succ = healthy.sample(n=min(len(fail), len(healthy)),
                          random_state=random_state)

    result = pd.concat([fail, succ])
    return result.sort_values(['date'], ignore_index=True)

In [None]:
#Source folder (the model is trained and tuned on the first quarter of 2016)
filespath = r'/Users/penghanqiu/Desktop/data_Q1_2016'
#SMART attribute numbers to use as features
features = [5, 187, 188, 197, 198]
#Drive model to keep
model = "ST4000DM000"
#Look back one week; 3 days was also tried (original author's note: %95.8->%99)
pre_day = 7

#Select the required data; [::-1] reverses the row order because data_clean
#relabels the first pre_day + 1 rows it meets for each failed drive
dframe = data_selection(filespath, features, model)[::-1]

dframe = data_clean(pre_day, dframe)

#Write the balanced dataset to the working directory (original author's note: this step took about 20 min)
dframe.to_csv('clean_data1.csv', encoding='utf-8', index=False)

In [15]:
#在新的平衡数据集上训练测试
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split as tts

# Balanced dataset to train and test on.
# NOTE(review): presumably the file written by the preparation cell, which
# uses a relative path ('clean_data1.csv') - confirm both point to the same file.
filepath = r'/Users/penghanqiu/Desktop/clean_data1.csv'

# SMART attribute numbers and the matching raw-value column names.
features = [5, 187, 188, 197, 198]
columns = ["smart_{}_raw".format(f) for f in features]

# Missing values are replaced by 0 before the arrays are extracted.
df = pd.read_csv(filepath).fillna(0)

# Feature matrix and failure labels as NumPy arrays.
X = df[columns].values
y = df['failure'].values

In [16]:
# Print a confusion matrix in a readable, labelled layout.
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """Pretty-print a confusion matrix to stdout.

    Parameters
    ----------
    cm : 2-D array indexable as ``cm[i, j]``
        Row ``i`` is the actual class, column ``j`` the predicted class, both
        in the order given by ``labels``.
    labels : sequence of str
        Class names used for the row and column headings.
    hide_zeroes : bool
        Blank out cells whose value is 0.
    hide_diagonal : bool
        Blank out the cells on the main diagonal.
    hide_threshold : number or None
        When given, blank out every cell that is not strictly greater than
        this value.  ``0`` is a valid threshold; only ``None`` disables it.
    """
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth

    # Header: caption line, then one right-aligned column title per label.
    print("    " + empty_cell, end=" ")
    print('PREDICTED:')
    print("     " + empty_cell, end=" ")
    for label in labels:
        print("%*s" % (columnwidth, label), end=" ")
    print()

    # One line per actual class.
    print('ACTUAL: ')
    for i, label1 in enumerate(labels):
        print("    %*s" % (columnwidth, label1), end=" ")
        for j in range(len(labels)):
            value = cm[i, j]
            hidden = (
                (hide_zeroes and float(value) == 0)
                or (hide_diagonal and i == j)
                # "is not None" so that a threshold of 0 is honoured.
                or (hide_threshold is not None and not value > hide_threshold)
            )
            cell = empty_cell if hidden else "%*.1f" % (columnwidth, value)
            print(cell, end=" ")
        print()

In [17]:
# GBDT 单次检验
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Gradient-boosted trees with learning rate 0.06; this fitted model is reused further down on the raw data
my_classifier = GradientBoostingClassifier(learning_rate=0.06)
 
# Single random 80/20 split of the balanced data with a fixed seed
# NOTE(review): the balanced set can hold several rows of the same drive, so one drive may appear in both splits - confirm this is acceptable
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state = 98)

#Fit the scaler on the training split only
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)

#Scale the test split with the parameters learned from the training split (keeps the evaluation unbiased)
X_test_scaled = scaler.transform(X_test)

my_classifier.fit(X_train_scaled, y_train)

y_pred = my_classifier.predict(X_test_scaled)

# Accuracy on the balanced training split and on the balanced test split
print("平衡训练集上准确率:", my_classifier.score(X_train_scaled, y_train))
print("平衡测试集上准确率:", my_classifier.score(X_test_scaled, y_test))
print("\n")

#With labels=[1,0]: row 1 / column 1 is TP, row 1 / column 2 is FN, row 2 / column 1 is FP
confMat = confusion_matrix(y_test, y_pred,labels=[1,0]) 

#Print the confusion matrix
print_cm(confMat, ["Failed", "Running"])

平衡训练集上准确率: 0.7651483493522775
平衡测试集上准确率: 0.7612687813021702


            PREDICTED:
              Failed Running 
ACTUAL: 
     Failed   156.0   143.0 
    Running     0.0   300.0 


In [18]:
# k-fold交叉验证: 选择最优模型 (GBDT)

#from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
#from sklearn.experimental import enable_hist_gradient_boosting
#from sklearn.ensemble import HistGradientBoostingClassifier #基于LightGBM

def train_test_classifier(my_classifier, k_fold):
    """Average a classifier's metrics over ``k_fold`` random train/test splits.

    Each round draws a fresh random 80/20 split of the module-level ``X`` and
    ``y`` (seeds 10, 20, ..., ``10 * k_fold``), so the rounds are repeated
    hold-out splits rather than disjoint folds.  A ``RobustScaler`` is fitted
    on the training part only and applied to both parts before the classifier
    is refitted.

    Parameters
    ----------
    my_classifier : estimator with ``fit`` and ``predict``
        Refitted in every round; it is left fitted on the last split.
    k_fold : int
        Number of rounds to average over.

    Returns
    -------
    tuple
        ``(precision, recall, test accuracy, train accuracy, F1)``, each the
        mean over all rounds.  Precision, recall and F1 are measured on the
        test part.
    """
    metric_names = ("precision", "recall", "test_acc", "train_acc", "f1")
    history = {name: [] for name in metric_names}

    for seed in range(10, k_fold * 10 + 1, 10):
        X_train, X_test, y_train, y_test = tts(
            X, y, test_size=0.2, random_state=seed
        )

        # Scaling parameters come from the training part only.
        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        my_classifier.fit(X_train_scaled, y_train)

        test_pred = my_classifier.predict(X_test_scaled)
        train_pred = my_classifier.predict(X_train_scaled)

        history["precision"].append(precision_score(y_test, test_pred))  # TP/(TP + FP)
        history["recall"].append(recall_score(y_test, test_pred))  # TP/(TP + FN)
        history["test_acc"].append(accuracy_score(y_test, test_pred))  # (TP+TN)/(TP+TN+FP+FN)
        history["train_acc"].append(accuracy_score(y_train, train_pred))
        history["f1"].append(f1_score(y_test, test_pred))  # 2 * (precision * recall) / (precision + recall)

    return tuple(sum(history[name]) / k_fold for name in metric_names)

# Alternative classifiers, currently disabled; only the gradient-boosting model is evaluated.
#my_classifier2 = GaussianNB()
#my_classifier3 = RandomForestClassifier()
my_classifier1 = GradientBoostingClassifier(learning_rate=0.06)
#my_classifier4 = HistGradientBoostingClassifier()
# Number of random train/test splits that train_test_classifier averages over.
k_fold = 10

# NOTE(review): placeholder values only - every name is overwritten by the call on the next statement.
precision, recall, test_acc, train_acc, f1 = [0 for _ in range(5)]

precision, recall, test_acc, train_acc, f1 = train_test_classifier(my_classifier1, k_fold)

# Report the averaged metrics; the first line printed reads "In the processed, balanced dataset:".
print("处理后的平衡数据集里:")
print("Precision:", precision, "\n" "recall:", recall, "\n" "Test Accuracy:",
      test_acc,"\n" "Training Accuracy:", train_acc,"\n" "F1 Score:",f1)

处理后的平衡数据集里:
Precision: 0.9643695291986862 
recall: 0.5380592323399906 
Test Accuracy: 0.7564273789649415 
Training Accuracy: 0.7659841203510237 
F1 Score: 0.6901928247456602


In [19]:
#Baseline检测方法: When the RAW value for one of these five attributes is greater than zero, we predict a hard drive will fail.
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
def baseline_classifier(X,y):
    """Score the rule "any non-zero SMART value means the drive will fail".

    Parameters
    ----------
    X : 2-D array-like, one row per sample
        Raw SMART attribute values.
    y : 1-D array-like
        True labels (1 = failed, 0 = running).

    Returns
    -------
    tuple
        ``(precision, recall, accuracy, f1, confusion_matrix)``.  The
        confusion matrix uses ``labels=[1, 0]``: row 0 is ``[TP, FN]`` and
        row 1 is ``[FP, TN]``.
    """
    # Vectorised replacement for a per-row Python loop: a row is predicted
    # as failing (1) as soon as any of its values differs from 0.
    y_pred_base = (np.asarray(X) != 0).any(axis=1).astype(int)

    precision = precision_score(y, y_pred_base) #TP/(TP + FP)
    recall = recall_score(y, y_pred_base) #TP/(TP + FN)
    acc = accuracy_score(y, y_pred_base) #(TP+TN)/(TP+TN+FP+ FN)
    f1 = f1_score(y, y_pred_base) #2 * (precision * recall) / (precision + recall)

    # Row 1 / column 1 is TP, row 1 / column 2 is FN, row 2 / column 1 is FP.
    confMat = confusion_matrix(y, y_pred_base,labels=[1,0])

    return precision, recall, acc, f1, confMat

In [20]:
#Evaluate the model on the second quarter of 2016
filespath1 = r'/Users/penghanqiu/Desktop/data_Q2_2016'
#SMART attribute numbers to use as features
features = [5, 187, 188, 197, 198]
#Drive model to keep
model = "ST4000DM000"

#Select the required data (replaces the earlier contents of dframe)
dframe = data_selection(filespath1, features, model)

In [21]:
# SMART attribute numbers and the matching raw-value column names.
features = [5, 187, 188, 197, 198]
columns = ["smart_{}_raw".format(f) for f in features]

# Raw (unbalanced) frame with missing values replaced by 0.
df = dframe.fillna(0)
X_raw = df[columns].values
y_raw = df['failure'].values

# Score the "any non-zero attribute" baseline on the raw data.
(preci_base, recall_base, acc_base,
 f1_base, confMat_base) = baseline_classifier(X_raw, y_raw)

# Printed in order: precision, recall, accuracy, F1.
print("原始数据集里的Baseline查准率:", preci_base)
print("原始数据集里的Baseline召回率:", recall_base)
print("原始数据集里的Baseline准确率:", acc_base)
print("原始数据集里的Baseline F1值:", f1_base)
print("\n")

# Confusion matrix of the baseline.
print_cm(confMat_base, ["Failed", "Running"])

原始数据集里的Baseline查准率: 0.0009105323474465848
原始数据集里的Baseline召回率: 0.7631578947368421
原始数据集里的Baseline准确率: 0.9389188290192892
原始数据集里的Baseline F1值: 0.0018188945511564095


            PREDICTED:
              Failed Running 
ACTUAL: 
     Failed   174.0    54.0 
    Running 190923.0 2935459.0 


In [22]:
# Reuse the classifier and scaler fitted in the single-split GBDT experiment.

# Apply the already-fitted RobustScaler (it limits the influence of outliers).
X_scaled = scaler.transform(X_raw)

# Predict once; every metric below is derived from this single pass.
y_pred_raw = my_classifier.predict(X_scaled)

# With labels=[1, 0]: row 1 / column 1 is TP, row 1 / column 2 is FN,
# row 2 / column 1 is FP.
confMat_raw1 = confusion_matrix(y_raw, y_pred_raw,labels=[1, 0])
# Print the confusion matrix.
print_cm(confMat_raw1, ["Failed", "Running"])
print("\n")

# Use the sklearn metrics already used elsewhere in the notebook instead of
# hand-computed ratios, which divide by zero when nothing is predicted positive.
precision1 = precision_score(y_raw, y_pred_raw)  # TP/(TP + FP)
recall1 = recall_score(y_raw, y_pred_raw)  # TP/(TP + FN)
f_1 = f1_score(y_raw, y_pred_raw)  # 2 * (precision * recall) / (precision + recall)

# Printed in order: precision, recall, accuracy, F1.
print("原始数据集里的GBDT的查准率:", precision1)
print("原始数据集里的GBDT的召回率:", recall1)
# accuracy_score on the existing predictions avoids the second full
# prediction pass that my_classifier.score(X_scaled, y_raw) would trigger.
print("模型在原始测试数据集里的准确率:", accuracy_score(y_raw, y_pred_raw))
print("原始数据集里的GBDT的F1值", f_1)

            PREDICTED:
              Failed Running 
ACTUAL: 
     Failed   166.0    62.0 
    Running 31540.0 3094842.0 


原始数据集里的GBDT的查准率: 0.005235602094240838
原始数据集里的GBDT的召回率: 0.7280701754385965
模型在原始测试数据集里的准确率: 0.9898925673493016
原始数据集里的GBDT的F1值 0.010396442662992424
