In [14]:
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

In [2]:
df = pd.read_csv("data/deviatoric_concatenate.csv")

In [3]:
df.describe()

Unnamed: 0,velocity:0,velocity:1,force:0,force:1,total displacement:0,total displacement:1,temperature,temp_power,temp_pressure,temp_density,...,stress XZ,density,thermal_stress,energy_total_vol_dev,energy_volumetric,energy_deviatoric,energy_thermal,energy_elastic,coordinate:0,coordinate:1
count,2660160.0,2660160.0,2660160.0,2660160.0,2660160.0,2660160.0,2660160.0,2660160.0,2660160.0,2660160.0,...,2660160.0,2660160.0,2660160.0,2660160.0,2660160.0,2660160.0,2660160.0,2660160.0,2660160.0,2660160.0
mean,6.329685e-11,3.727487e-11,-13672160.0,-7402520.0,1453.043,3305.589,6.158631,46090.19,-39.74861,3.91331e-06,...,1629498.0,2405.16,-9380253.0,272299500.0,-5729299.0,289487400.0,1365979000000.0,66630740000.0,52387.03,-2527.535
std,1.550915e-10,8.895128e-11,2029575000.0,2357488000.0,9714.274,3906.848,8.942963,565785.2,29204.73,0.0002309945,...,17221220.0,336.8318,13644520.0,399908200.0,9904921.0,422014800.0,7914640000000.0,790941200000.0,21757.35,3923.942
min,-1.216e-09,-9.8967e-10,-142280000000.0,-109510000000.0,-20226.0,-1443.7,-1.7224e-07,0.0,-14533000.0,-0.13188,...,-150860000.0,1231.5,-76506000.0,0.0,-45031000.0,0.0,0.0,-827580000000.0,-19950.0,-10100.0
25%,-8.732425e-11,-5.109225e-12,-233660.0,-196970.0,-5646.9,23.27675,0.3263,0.0,-65.87,-1.2871e-07,...,-5402000.0,2291.9,-13474000.0,778290.0,-4502800.0,972697.5,7997400000.0,-584460000.0,37918.0,-5496.6
50%,1.2345e-10,4.7165e-12,38000.0,26657.0,1715.3,1877.3,1.3905,0.0,0.134715,4.3244e-15,...,537110.0,2472.9,-2167850.0,53331000.0,-813690.0,58533000.0,99854000000.0,319830000.0,50942.0,-2150.75
75%,1.7655e-10,7.3299e-11,366740.0,274550.0,9275.5,5382.1,8.9412,0.0,88.853,1.5096e-07,...,7358400.0,2712.9,-562810.0,440730000.0,-16290.0,465800000.0,728122500000.0,6932400000.0,64243.0,670.26
max,1.2452e-09,7.5883e-10,204180000000.0,135420000000.0,20013.0,15378.0,52.578,124030000.0,15188000.0,0.10495,...,152230000.0,2726.4,1209600.0,1972000000.0,868290.0,2023400000.0,743420000000000.0,68617000000000.0,119950.0,5642.0


In [4]:
df.columns

Index(['velocity:0', 'velocity:1', 'force:0', 'force:1',
       'total displacement:0', 'total displacement:1', 'temperature',
       'temp_power', 'temp_pressure', 'temp_density', 'plastic strain',
       'plastic strain-rate', 'strain-rate II log10', 'strain-rate XX',
       'elastic_strain XX', 'strain-rate ZZ', 'elastic_strain ZZ',
       'strain-rate XZ', 'elastic_strain XZ', 'strain I', 'strain II',
       'strain XX', 'strain ZZ', 'strain XZ', 'stress I', 'stress II',
       'stress XX', 'stress ZZ', 'stress XZ', 'density', 'thermal_stress',
       'energy_total_vol_dev', 'energy_volumetric', 'energy_deviatoric',
       'energy_thermal', 'energy_elastic', 'coordinate:0', 'coordinate:1'],
      dtype='object')

In [5]:
df.describe()['plastic strain']

count    2.660160e+06
mean     3.095186e+00
std      4.112005e+00
min      0.000000e+00
25%      4.768800e-03
50%      3.100500e-01
75%      5.900300e+00
max      2.176800e+01
Name: plastic strain, dtype: float64

In [6]:
threshold = 0.15
df['plastic strain'] = np.where(df['plastic strain'] > threshold, 1,0)

In [7]:
dropped_cols = ['plastic strain', 'plastic strain-rate', 'strain-rate II log10', 'strain-rate XX',
       'elastic_strain XX', 'strain-rate ZZ', 'elastic_strain ZZ',
       'strain-rate XZ', 'elastic_strain XZ', 'strain I', 'strain II',
       'strain XX', 'strain ZZ', 'strain XZ', 'coordinate:0', 'coordinate:1', 'force:0', 'force:1', 
       'total displacement:0', 'total displacement:1', 'stress I', 'stress II', 'energy_volumetric', 
       'energy_deviatoric','energy_thermal', 'energy_elastic', 'thermal_stress',
       'energy_total_vol_dev', 'temp_power', 'temp_pressure', 'temp_density']

Y = df['plastic strain']
df = df.drop(dropped_cols, axis=1)
X = df

In [8]:
df.columns

Index(['velocity:0', 'velocity:1', 'temperature', 'stress XX', 'stress ZZ',
       'stress XZ', 'density'],
      dtype='object')

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

In [10]:
mms = StandardScaler()
X_train = mms.fit_transform(X_train)
X_test = mms.fit_transform(X_test)

In [11]:
class Create_ensemble(object):
    def __init__(self, n_splits, base_models):
        self.n_splits = n_splits
        self.base_models = base_models
        
    def predict(self, X, y, x_test, y_test):
        X = np.array(X)
        y = np.array(y)
        x_test = np.array(x_test)

        folds = list(KFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((x_test.shape[0], len(self.base_models)))
        
        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((x_test.shape[0], self.n_splits))

            for j, (train_idx, valid_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_valid = X[valid_idx]
                y_valid = y[valid_idx]
                clf.fit(X_train, y_train)
                valid_pred = clf.predict(X_valid)
                S_train[valid_idx, i] = valid_pred
                test_pred = clf.predict(x_test)
                S_test_i[:, j] = test_pred
                
                print('Model: {}, fold: {}, train_acc: {},  test_acc: {}'.format(i, j, accuracy_score(y_valid, valid_pred), accuracy_score(y_test, test_pred)))
                
            print( "\nTraining RMSLE for model {} : {}".format(i, accuracy_score(y, S_train[:,i])))
            S_test[:, i] = S_test_i.mean(axis=1)
            
        return S_train, S_test, clf

In [15]:
rfc = RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=6, verbose=0, warm_start=False)

In [16]:
stack = Create_ensemble(n_splits = 10, base_models = [rfc])

X = X_train
Y = y_train

train_pred, test_pred, model = stack.predict(X, Y, X_test, y_test)

Model: 0, fold: 0, train_acc: 0.9769402616372737,  test_acc: 0.9774725329804723
Model: 0, fold: 1, train_acc: 0.9771282194488002,  test_acc: 0.9773735414411163
Model: 0, fold: 2, train_acc: 0.9774073497269227,  test_acc: 0.9774549901760295
Model: 0, fold: 3, train_acc: 0.9774019794748967,  test_acc: 0.9775063655318978
Model: 0, fold: 4, train_acc: 0.9771656883857559,  test_acc: 0.9774098801074622
Model: 0, fold: 5, train_acc: 0.9773912389708449,  test_acc: 0.977408627050002
Model: 0, fold: 6, train_acc: 0.9772730934262744,  test_acc: 0.977513883876659
Model: 0, fold: 7, train_acc: 0.9768810650283818,  test_acc: 0.9775264144512611
Model: 0, fold: 8, train_acc: 0.9773053149384301,  test_acc: 0.9773547455792133
Model: 0, fold: 9, train_acc: 0.9780141882058525,  test_acc: 0.9775001002445968

Training RMSLE for model 0 : 0.977290839648743


In [17]:
Y_pred = model.predict(X_test)

In [18]:
print('The accuracy of the model is {}'.format(accuracy_score(y_test, Y_pred)))
print('# Classification report \n {}'.format(classification_report(y_test, Y_pred)))

The accuracy of the model is 0.9775001002445968
# Classification report 
              precision    recall  f1-score   support

          0       0.99      0.96      0.97    351678
          1       0.97      0.99      0.98    446370

avg / total       0.98      0.98      0.98    798048



In [19]:
def plot_important_features(df, model):
    values = sorted(zip(df.columns, model.feature_importances_), key=lambda x: x[1] * -1)
    sorted_features, scores = zip(*values)

    trace0 = go.Bar(
        x= sorted_features,
        y= scores*100,
    #     text=['27% market share', '24% market share', '19% market share'],
        marker=dict(
            color='rgb(158,202,225)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5,
            )
        ),
        opacity=0.6
    )

    data = [trace0]
    layout = go.Layout(
        title='Important features',
    )

    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig, filename='text-hover-bar')

In [20]:
plot_important_features(df, model)