In [14]:
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

In [2]:
df_deviatoric = pd.read_csv("data/deviatoric.csv")
df_deviatoric_nsw = pd.read_csv("data/deviatoric.csv")
df_total = pd.read_csv("data/deviatoric.csv")
df_total_nsw = pd.read_csv("data/deviatoric.csv")

In [4]:
df_deviatoric.columns

Index(['velocity:0', 'velocity:1', 'force:0', 'force:1',
       'total displacement:0', 'total displacement:1', 'temperature',
       'temp_power', 'temp_pressure', 'temp_density', 'plastic strain',
       'plastic strain-rate', 'strain-rate II log10', 'strain-rate XX',
       'elastic_strain XX', 'strain-rate ZZ', 'elastic_strain ZZ',
       'strain-rate XZ', 'elastic_strain XZ', 'strain I', 'strain II',
       'strain XX', 'strain ZZ', 'strain XZ', 'stress I', 'stress II',
       'stress XX', 'stress ZZ', 'stress XZ', 'density', 'thermal_stress',
       'energy_total_vol_dev', 'energy_volumetric', 'energy_deviatoric',
       'energy_thermal', 'energy_elastic', 'coordinate:0', 'coordinate:1'],
      dtype='object')

In [6]:
threshold = 0.15
df_deviatoric['plastic strain'] = np.where(df_deviatoric['plastic strain'] > threshold, 1,0)
df_deviatoric_nsw['plastic strain'] = np.where(df_deviatoric_nsw['plastic strain'] > threshold, 1,0)
df_total['plastic strain'] = np.where(df_total['plastic strain'] > threshold, 1,0)
df_total_nsw['plastic strain'] = np.where(df_total_nsw['plastic strain'] > threshold, 1,0)

In [7]:
def drop_and_split_coloumns(df):
    dropped_cols = ['plastic strain', 'plastic strain-rate', 'strain-rate II log10', 'strain-rate XX',
           'elastic_strain XX', 'strain-rate ZZ', 'elastic_strain ZZ',
           'strain-rate XZ', 'elastic_strain XZ', 'strain I', 'strain II',
           'strain XX', 'strain ZZ', 'strain XZ', 'coordinate:0', 'coordinate:1', 'force:0', 'force:1', 
           'total displacement:0', 'total displacement:1', 'stress I', 'stress II', 'energy_volumetric', 
           'energy_deviatoric','energy_thermal', 'energy_elastic', 'thermal_stress',
           'energy_total_vol_dev', 'temp_power', 'temp_pressure', 'temp_density']

    Y = df['plastic strain']
    df = df.drop(dropped_cols, axis=1)
    X = df
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)
    
    return X_train, X_test, y_train, y_test, df

In [None]:
X_train_dev, X_test_dev, Y_train_dev, Y_test_dev, df_deviatoric = drop_and_split_coloumns(df_deviatoric)
X_train_dev_nsw, X_test_dev_nsw, Y_train_dev_nsw, Y_test_dev_nsw, df_deviatoric_nsw = drop_and_split_coloumns(df_deviatoric_nsw)
X_train_tot, X_test_tot, Y_train_tot, Y_test_tot, df_total = drop_and_split_coloumns(df_total)
X_train_tot_nsw, X_test_tot_nsw, Y_train_tot_nsw, Y_test_tot_nsw, df_total_nsw = drop_and_split_coloumns(df_total_nsw)

In [8]:
df_deviatoric.columns

Index(['velocity:0', 'velocity:1', 'temperature', 'stress XX', 'stress ZZ',
       'stress XZ', 'density'],
      dtype='object')

In [10]:
def scale_feature(features):    
    mms = StandardScaler()
    scaled_features = mms.fit_transform(features)
    return scaled_features

In [None]:
X_train_dev = scale_feature(X_train_dev)
X_train_dev_nsw = scale_feature(X_train_dev_nsw)
X_train_tot = scale_feature(X_train_tot)
X_train_tot_nsw = scale_feature(X_train_tot_nsw)

# test features
X_test_dev = scale_feature(X_test_dev)
X_test_dev_nsw = scale_feature(X_test_dev_nsw)
X_test_tot = scale_feature(X_test_tot)
X_test_tot_nsw = scale_feature(X_test_tot_nsw)

In [11]:
class Create_ensemble(object):
    def __init__(self, model_name n_splits, base_models):
        self.n_splits = n_splits
        self.base_models = base_models
        self.model_name = model_name
        
    def predict(self, X, y, x_test, y_test):
        X = np.array(X)
        y = np.array(y)
        x_test = np.array(x_test)
        acc_arry = np.zeros([self.n_splits, 2])

        folds = list(KFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))
        clf = self.model_name
        
        for j, (train_idx, valid_idx) in enumerate(folds):
            X_train = X[train_idx]
            y_train = y[train_idx]
            X_valid = X[valid_idx]
            y_valid = y[valid_idx]
            clf.fit(X_train, y_train)
            valid_pred = clf.predict(X_valid)
            test_pred = clf.predict(x_test)
            valid_accuracy = accuracy_score(y_valid, valid_pred)
            test_accuracy = accuracy_score(y_test, test_pred)
            acc_arry[j,:] = valid_accuracy, test_accuracy

            print('Model: {}, fold: {}, valid_acc: {},  test_acc: {}'.format(i, j, valid_accuracy, test_accuracy))
            
        return acc_arry

In [15]:
rfc = RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=6, verbose=0, warm_start=False)

In [16]:
models_name = ['Deviatoric', 'Deviatoric_nsw', 'Total', 'Total_nsw']
predictions = np.array()

for model in models:
    stack_dev = Create_ensemble(model, n_splits = 5, base_models = [rfc])
    train_pred_dev, test_pred_dev, model_dev = stack_dev.predict(X_train_dev, Y_train_dev, X_test_dev, Y_test_dev)

Model: 0, fold: 0, train_acc: 0.9769402616372737,  test_acc: 0.9774725329804723
Model: 0, fold: 1, train_acc: 0.9771282194488002,  test_acc: 0.9773735414411163
Model: 0, fold: 2, train_acc: 0.9774073497269227,  test_acc: 0.9774549901760295
Model: 0, fold: 3, train_acc: 0.9774019794748967,  test_acc: 0.9775063655318978
Model: 0, fold: 4, train_acc: 0.9771656883857559,  test_acc: 0.9774098801074622
Model: 0, fold: 5, train_acc: 0.9773912389708449,  test_acc: 0.977408627050002
Model: 0, fold: 6, train_acc: 0.9772730934262744,  test_acc: 0.977513883876659
Model: 0, fold: 7, train_acc: 0.9768810650283818,  test_acc: 0.9775264144512611
Model: 0, fold: 8, train_acc: 0.9773053149384301,  test_acc: 0.9773547455792133
Model: 0, fold: 9, train_acc: 0.9780141882058525,  test_acc: 0.9775001002445968

Training RMSLE for model 0 : 0.977290839648743


In [17]:
Y_pred = model.predict(X_test)

In [18]:
print('The accuracy of the model is {}'.format(accuracy_score(y_test, Y_pred)))
print('# Classification report \n {}'.format(classification_report(y_test, Y_pred)))

The accuracy of the model is 0.9775001002445968
# Classification report 
              precision    recall  f1-score   support

          0       0.99      0.96      0.97    351678
          1       0.97      0.99      0.98    446370

avg / total       0.98      0.98      0.98    798048



In [19]:
def plot_important_features(df, model):
    values = sorted(zip(df.columns, model.feature_importances_), key=lambda x: x[1] * -1)
    sorted_features, scores = zip(*values)

    trace0 = go.Bar(
        x= sorted_features,
        y= scores*100,
    #     text=['27% market share', '24% market share', '19% market share'],
        marker=dict(
            color='rgb(158,202,225)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5,
            )
        ),
        opacity=0.6
    )

    data = [trace0]
    layout = go.Layout(
        title='Important features',
    )

    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig, filename='text-hover-bar')

In [20]:
plot_important_features(df, model)