In [27]:
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

In [2]:
df_deviatoric = pd.read_csv("data/deviatoric.csv")
df_deviatoric_nsw = pd.read_csv("data/deviatoric_nsw.csv")
df_total = pd.read_csv("data/total.csv")
df_total_nsw = pd.read_csv("data/total_nsw.csv")

In [4]:
df_total.columns

Index(['velocity:0', 'velocity:1', 'force:0', 'force:1',
       'total displacement:0', 'total displacement:1', 'temperature',
       'temp_power', 'temp_pressure', 'temp_density', 'plastic strain',
       'plastic strain-rate', 'strain-rate II log10', 'strain-rate XX',
       'elastic_strain XX', 'strain-rate ZZ', 'elastic_strain ZZ',
       'strain-rate XZ', 'elastic_strain XZ', 'strain I', 'strain II',
       'strain XX', 'strain ZZ', 'strain XZ', 'stress I', 'stress II',
       'stress XX', 'stress ZZ', 'stress XZ', 'density', 'thermal_stress',
       'energy_total_vol_dev', 'energy_volumetric', 'energy_deviatoric',
       'energy_thermal', 'energy_elastic', 'coordinate:0', 'coordinate:1'],
      dtype='object')

In [5]:
threshold = 0.15
df_deviatoric['plastic strain'] = np.where(df_deviatoric['plastic strain'] > threshold, 1,0)
df_deviatoric_nsw['plastic strain'] = np.where(df_deviatoric_nsw['plastic strain'] > threshold, 1,0)
df_total['plastic strain'] = np.where(df_total['plastic strain'] > threshold, 1,0)
df_total_nsw['plastic strain'] = np.where(df_total_nsw['plastic strain'] > threshold, 1,0)

In [6]:
def drop_and_split_coloumns(df):
    
    dropped_cols = ['plastic strain', 'plastic strain-rate', 'strain-rate II log10', 'strain-rate XX',
           'elastic_strain XX', 'strain-rate ZZ', 'elastic_strain ZZ',
           'strain-rate XZ', 'elastic_strain XZ', 'strain I', 'strain II',
           'strain XX', 'strain ZZ', 'strain XZ', 'coordinate:0', 'coordinate:1', 'force:0', 'force:1', 
           'total displacement:0', 'total displacement:1', 'stress I', 'stress II', 'energy_volumetric', 
           'energy_deviatoric','energy_thermal', 'energy_elastic', 'thermal_stress',
           'energy_total_vol_dev', 'temp_power', 'temp_pressure', 'temp_density']

    Y = df['plastic strain']
    df = df.drop(dropped_cols, axis=1)
    X = df
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)
    
    return X_train, X_test, y_train, y_test, df

In [7]:
X_train_dev, X_test_dev, Y_train_dev, Y_test_dev, df_deviatoric = drop_and_split_coloumns(df_deviatoric)
X_train_dev_nsw, X_test_dev_nsw, Y_train_dev_nsw, Y_test_dev_nsw, df_deviatoric_nsw = drop_and_split_coloumns(df_deviatoric_nsw)
X_train_tot, X_test_tot, Y_train_tot, Y_test_tot, df_total = drop_and_split_coloumns(df_total)
X_train_tot_nsw, X_test_tot_nsw, Y_train_tot_nsw, Y_test_tot_nsw, df_total_nsw = drop_and_split_coloumns(df_total_nsw)

In [8]:
xtrains = [X_train_dev, X_train_dev_nsw, X_train_tot, X_train_tot_nsw]
xtests = [X_test_dev, X_test_dev_nsw, X_test_tot, X_test_tot_nsw]
ytrains = [Y_train_dev, Y_train_dev_nsw, Y_train_tot, Y_train_tot_nsw]
ytests = [Y_test_dev, Y_test_dev_nsw, Y_test_tot, Y_test_tot_nsw]
models_name = ['Deviatoric', 'Deviatoric_nsw', 'Total', 'Total_nsw']

In [16]:
df_total.columns

Index(['velocity:0', 'velocity:1', 'temperature', 'stress XX', 'stress ZZ',
       'stress XZ', 'density'],
      dtype='object')

In [17]:
def scale_feature(features):    
    mms = StandardScaler()
    scaled_features = mms.fit_transform(features)
    return scaled_features, mms

In [18]:
for i in range(len(models_name)):
    xtrains[i] = scale_feature(xtrains[i])
    xtests[i] = scale_feature(xtests[i])

### Cross validation

In [23]:
class Create_ensemble(object):
    def __init__(self, model_name, n_splits, base_model):
        self.n_splits = n_splits
        self.base_model = base_model
        self.model_name = model_name
        
    def predict(self, X, y, x_test, y_test):
        X = np.array(X)
        y = np.array(y)
        x_test = np.array(x_test)
        acc_arry = np.zeros([self.n_splits, 2])

        folds = list(KFold(n_splits=self.n_splits, shuffle=True, random_state=2018).split(X, y))
        clf = self.base_model
        
        for j, (train_idx, valid_idx) in enumerate(folds):
            X_train = X[train_idx]
            y_train = y[train_idx]
            X_valid = X[valid_idx]
            y_valid = y[valid_idx]
            clf.fit(X_train, y_train)
            valid_pred = clf.predict(X_valid)
            test_pred = clf.predict(x_test)
            valid_accuracy = accuracy_score(y_valid, valid_pred)
            test_accuracy = accuracy_score(y_test, test_pred)
            acc_arry[j,:] = valid_accuracy, test_accuracy

            print('Model: {}, fold: {}, valid_acc: {},  test_acc: {}'.format(self.model_name, j, valid_accuracy, test_accuracy))
            
        return acc_arry

## Random forest model

In [28]:
rfc = RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=6, verbose=0, warm_start=False)

In [29]:
n_splits = 5

accuracies = []

for i, model in enumerate(models_name):
    stack_dev = Create_ensemble(model, n_splits = n_splits, base_model = rfc)
    acc = stack_dev.predict(xtrains[i][0], ytrains[i], xtests[i][0],  ytests[i])
    accuracies.append(acc)
    print('\nFinised bulding model: {}\n'.format(model))

Model: Deviatoric, fold: 0, valid_acc: 0.9774074103908728,  test_acc: 0.9776141284734753
Model: Deviatoric, fold: 1, valid_acc: 0.9771415836293676,  test_acc: 0.9773735414411163
Model: Deviatoric, fold: 2, valid_acc: 0.9771791140158208,  test_acc: 0.9773434680620715
Model: Deviatoric, fold: 3, valid_acc: 0.9771254114955615,  test_acc: 0.9774913288423754
Model: Deviatoric, fold: 4, valid_acc: 0.9775604019096616,  test_acc: 0.9773873250731786

Finised bulding model: Deviatoric

Model: Deviatoric_nsw, fold: 0, valid_acc: 0.983472023462483,  test_acc: 0.983708064343252
Model: Deviatoric_nsw, fold: 1, valid_acc: 0.9836043851202728,  test_acc: 0.9833164312745801
Model: Deviatoric_nsw, fold: 2, valid_acc: 0.9842139203717282,  test_acc: 0.9841285545854048
Model: Deviatoric_nsw, fold: 3, valid_acc: 0.9848234556231835,  test_acc: 0.9842316159192659
Model: Deviatoric_nsw, fold: 4, valid_acc: 0.9839047358238885,  test_acc: 0.9830319819931237

Finised bulding model: Deviatoric_nsw

Model: Total, fo

### Build models with full datasets

In [45]:
def plot_important_features(model_name, df, model):
    values = sorted(zip(df.columns, model.feature_importances_), key=lambda x: x[1] * -1)
    sorted_features, scores = zip(*values)

    trace0 = go.Bar(
        x= sorted_features,
        y= scores*100,
    #     text=['27% market share', '24% market share', '19% market share'],
        marker=dict(
            color='rgb(158,202,225)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5,
            )
        ),
        opacity=0.6
    )

    data = [trace0]
    layout = go.Layout(
        title= 'Important features of model: '+ str(model_name),
    )

    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig, filename='text-hover-bar')

In [46]:
models = []
ypreds = []
for i, model_name in enumerate(models_name):
    model = rfc.fit(xtrains[i][0], ytrains[i])
    models.append(model)
    ypreds.append(models[i].predict(xtests[i][0]))
    plot_important_features(model_name, df_deviatoric, model)
    print('Finished building model: {}'.format(model_name))

Finished building model: Deviatoric


Finished building model: Deviatoric_nsw


Finished building model: Total


Finished building model: Total_nsw


## Classification report

In [47]:
for i, model_name in enumerate(models_name):
    print('Model name: {}\n'.format(model_name))
    print('Accuracy of the model: {}'.format(accuracy_score(ytests[i], ypreds[i])))
    print('Classification report \n {}'.format(classification_report(ytests[i], ypreds[i])))
    print('-----------------------------------\n')

Model name: Deviatoric

Accuracy of the model: 0.9774111331649225
Classification report 
              precision    recall  f1-score   support

          0       0.99      0.96      0.97    351678
          1       0.97      0.99      0.98    446370

avg / total       0.98      0.98      0.98    798048

-----------------------------------

Model name: Deviatoric_nsw

Accuracy of the model: 0.9832422271142002
Classification report 
              precision    recall  f1-score   support

          0       1.00      0.98      0.99    153469
          1       0.96      1.00      0.98     89105

avg / total       0.98      0.98      0.98    242574

-----------------------------------

Model name: Total

Accuracy of the model: 0.9787884319573487
Classification report 
              precision    recall  f1-score   support

          0       0.99      0.96      0.98    336591
          1       0.97      0.99      0.98    408427

avg / total       0.98      0.98      0.98    745018

------------