In [1]:
'''
Editor: @Hyunhomo
Source: https://github.com/ashishpatel26/Predictive_Maintenance_using_Machine-Learning_Microsoft_Casestudy

Load the merged and labeled features and train a machine learning model.
'''
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
from sklearn.ensemble import GradientBoostingClassifier

## Load features from csv files
# Timestamp layout used by the 'datetime' column of both CSV files.
_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"

final_feat = pd.read_csv('final_feat.csv')
labeled_features = pd.read_csv('labeled_features.csv')

# Parse the textual 'datetime' column of each table into real timestamps so
# that it can be compared against the split thresholds.
for _frame in (final_feat, labeled_features):
    _frame['datetime'] = pd.to_datetime(_frame['datetime'], format=_DATETIME_FORMAT)

# make test and training splits
# Each entry is a [last_train_date, first_test_date] pair: rows strictly
# before the first timestamp form the training set, rows strictly after the
# second timestamp form the test set.
threshold_dates = [
    [pd.to_datetime('2015-07-31 01:00:00'), pd.to_datetime('2015-08-01 01:00:00')],
    [pd.to_datetime('2015-08-31 01:00:00'), pd.to_datetime('2015-09-01 01:00:00')],
    [pd.to_datetime('2015-09-30 01:00:00'), pd.to_datetime('2015-10-01 01:00:00')],
]

# Not filled in within this section; kept for code that may follow.
test_results = []
models = []

# Columns that must not be fed to the model: the timestamp, the machine
# identifier and the target label itself.
non_feature_columns = ['datetime', 'machineID', 'failure']

# The drop does not depend on the split, so do it once up front.
# ``columns=`` is used instead of a positional axis (``drop(labels, 1)``),
# which pandas 2.0 no longer accepts.
# NOTE(review): the captured output shows an 'Unnamed: 0' column in train_X,
# which looks like a CSV row index being used as a feature -- confirm whether
# it should be excluded as well.
feature_frame = labeled_features.drop(columns=non_feature_columns)

# NOTE: train_y, train_X and test_X are reassigned on every pass, so once the
# loop finishes they hold the split for the LAST threshold pair only.
for last_train_date, first_test_date in threshold_dates:
    print ("last_train_date", last_train_date)
    print ("first_test_date", first_test_date)

    # Row selectors for this split, computed once and reused below.
    train_mask = labeled_features['datetime'] < last_train_date
    test_mask = labeled_features['datetime'] > first_test_date

    # split out training and test data
    train_y = labeled_features.loc[train_mask, 'failure']
    print ("train_y", train_y)

    # One-hot encode any categorical feature columns.
    # NOTE(review): train and test are encoded independently, so their column
    # sets can differ if a category is missing from one side -- verify before
    # predicting on test_X.
    train_X = pd.get_dummies(feature_frame.loc[train_mask])
    print ("train_X", train_X)

    test_X = pd.get_dummies(feature_frame.loc[test_mask])


# Load a previously trained model from disk when one exists; otherwise train
# a new GradientBoostingClassifier on train_X / train_y and cache it on disk.
# NOTE(review): a cached model is reused as-is, even if the training data or
# the split dates have changed since it was written -- delete the file to
# force retraining.
filename = 'GBC_pdm_model.sav'
try:
    # SECURITY: unpickling can execute arbitrary code; only load a model file
    # that was produced by this script / comes from a trusted location.
    # The context manager guarantees the handle is closed after loading.
    with open(filename, 'rb') as model_file:
        print("Trained model already exists")
        my_model = pickle.load(model_file)

except IOError:
    # The cached model could not be opened (typically: it does not exist yet),
    # so train from scratch.
    my_model = GradientBoostingClassifier(random_state=42, learning_rate=0.01, n_estimators=100, verbose=1)
    print("Training in progress...")
    my_model.fit(train_X, train_y)
    # save trained model; closing the file via ``with`` ensures the pickle is
    # fully flushed to disk.
    with open(filename, 'wb') as model_file:
        pickle.dump(my_model, model_file)



last_train_date 2015-07-31 01:00:00
first_test_date 2015-08-01 01:00:00
train_y 0         none
1         none
2         none
3         none
4         none
          ... 
290102    none
290103    none
290104    none
290105    none
290106    none
Name: failure, Length: 167922, dtype: object
train_X         Unnamed: 0  voltmean_3h  rotatemean_3h  pressuremean_3h  \
0                0   180.133784     440.608320        94.137969   
1                1   176.364293     439.349655       101.553209   
2                2   160.384568     424.385316        99.598722   
3                3   170.472461     442.933997       102.380586   
4                4   163.263806     468.937558       102.726648   
...            ...          ...            ...              ...   
290102      290102   181.789600     443.816655        95.819894   
290103      290103   173.083263     438.704284        94.424390   
290104      290104   174.048390     433.016353        99.925802   
290105      290105   167.883990 

train_X         Unnamed: 0  voltmean_3h  rotatemean_3h  pressuremean_3h  \
0                0   180.133784     440.608320        94.137969   
1                1   176.364293     439.349655       101.553209   
2                2   160.384568     424.385316        99.598722   
3                3   170.472461     442.933997       102.380586   
4                4   163.263806     468.937558       102.726648   
...            ...          ...            ...              ...   
290590      290590   155.079380     450.187759       101.838938   
290591      290591   185.390529     435.256907        92.831119   
290592      290592   166.113912     444.157407       103.054512   
290593      290593   157.015479     468.556698       106.294096   
290594      290594   181.756173     417.257999       103.719925   

        vibrationmean_3h  voltsd_3h  rotatesd_3h  pressuresd_3h  \
0              41.551544  21.322735    48.770512       2.135684   
1              36.105580  18.952210    51.329636     