# XAI Neural Net Creation

## setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
currDir = os.path.dirname(os.path.realpath("__file__"))
rootDir = os.path.abspath(os.path.join(currDir, '..'))
sys.path.insert(1, rootDir)

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import RMSprop,Adam
from tensorflow.keras.utils import to_categorical

from src.data.make_dataset import make_dataset_1, make_dataset_2
from src.utils.file_utils import save_df, save_model

## generate artificial dataset

In [4]:
df1 = make_dataset_1()
df2 = make_dataset_2()
save_df(df1, "dataset1.csv")
save_df(df2, "dataset2.csv")

successfully generated dataset | num_rows: 2600000
successfully generated dataset | num_rows: 504000
df successfully saved | filename: dataset1.csv, dir: C:\Users\archg\school\senoir\xai-senior-design\data
df successfully saved | filename: dataset2.csv, dir: C:\Users\archg\school\senoir\xai-senior-design\data


In [5]:
df2.describe()
df1.describe()

Unnamed: 0,mode,ei,to,td,tf,vers
count,2600000.0,2600000.0,2600000.0,2600000.0,2600000.0,2600000.0
mean,11.6,1.854442,267.7224,71.555,12.16666,4.5
std,8.002501,3.186181,1555.065,47.02838,11.25711,2.872282
min,5.0,0.347982,1.0,0.1,1.0,0.0
25%,5.0,0.933,3.0,34.3,3.968246,2.0
50%,6.5,1.354955,5.0,68.5,9.0,4.5
75%,21.0,1.354955,55.0,102.7,17.0,7.0
max,26.0,23.17735,14400.0,260.0,60.0,9.0


In [24]:
from src.utils.file_utils import load_df
df1, df2 = load_df("dataset1.csv"), load_df("dataset2.csv")

## create neural nets

In [6]:
# def build_model():    
#     model = Sequential()
#     model.add(Dense(6, input_dim=5, activation="relu"))#hidden layer
#     model.add(Dense(10, activation="relu"))#hidden layer
#     model.add(Dense(1, activation='sigmoid'))#output layer

#     optimizer = RMSprop(0.001)
#     model.compile(loss='mse', optimizer=optimizer, metrics=['mse', 'mae', 'mape'])
#     return model

def build_model(num_features, num_classes):
    model = Sequential()
    model.add(Dense(256, input_shape=(num_features,), activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', # Cross-entropy
                optimizer='adam', metrics=['accuracy'])
    return model

def build_model2(num_features, num_classes):
    model = Sequential()
    model.add(Dense(5, input_shape=(num_features,), activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', # Cross-entropy
                optimizer='adam', metrics=['accuracy'])
    return model

## train

In [8]:
def prepare_df(df, y_column):
    temp_df = df.copy()
    temp_df.sample(frac=1)
    
    y = to_categorical(
        temp_df[y_column].values)
    temp_df.drop(y_column, axis=1, inplace=True)
    features = list(temp_df.columns)
    
    x = temp_df.values
    scaler = StandardScaler()
    scaler.fit(x)
    x = scaler.transform(x)
    
    u = scaler.mean_
    s = scaler.scale_
    
    return x,y,features,u,s

x1, y1, features1,u1,s1 = prepare_df(df1, "vers")
x_train1, x_test1, y_train1, y_test1 = train_test_split(x1, y1, test_size=0.20)
norm_train1, norm_test1 = pd.DataFrame(x_train1, columns=features1), pd.DataFrame(x_test1, columns=features1)


#Only save the train-test if they need to be updated. Be sure to also train the NN and save

save_df(norm_train1, "norm_train1.csv")
save_df(norm_test1, "norm_test1.csv")

x2, y2, features2,u2,s2 = prepare_df(df2, "vers")
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2, test_size=0.20)
norm_train2, norm_test2 = pd.DataFrame(x_train2, columns=features2), pd.DataFrame(x_test2, columns=features2)
save_df(norm_train2, "norm_train2.csv")
save_df(norm_test2, "norm_test2.csv")

df successfully saved | filename: norm_train1.csv, dir: C:\Users\archg\school\senoir\xai-senior-design\data
df successfully saved | filename: norm_test1.csv, dir: C:\Users\archg\school\senoir\xai-senior-design\data
df successfully saved | filename: norm_train2.csv, dir: C:\Users\archg\school\senoir\xai-senior-design\data
df successfully saved | filename: norm_test2.csv, dir: C:\Users\archg\school\senoir\xai-senior-design\data


In [9]:
print(features1)
print("means1:")
print(u1)
print("std1:")
print(s1)

print(features2)
print("means2:")
print(u2)
print("std2:")
print(s2)

['mode', 'ei', 'to', 'td', 'tf']
means1:
[ 11.6          1.85444168 267.72244306  71.555       12.16665978]
std1:
[   8.00249961    3.18618066 1555.06491616   47.02836883   11.25710397]
['mode', 'speed', 'fe', 'tt']
means2:
[  2.5         65.2602466    0.95246209 729.52536085]
std2:
[1.70782513e+00 1.92803439e+02 3.08688256e-01 2.32262065e+03]


In [10]:
model1 = build_model(5,10)
history1 = model1.fit(x_train1, y_train1, epochs=20, validation_split=0.2, batch_size=128, verbose=2)

Train on 1664000 samples, validate on 416000 samples
Epoch 1/20
1664000/1664000 - 28s - loss: 0.6232 - accuracy: 0.7485 - val_loss: 0.5101 - val_accuracy: 0.7915
Epoch 2/20
1664000/1664000 - 27s - loss: 0.5135 - accuracy: 0.7812 - val_loss: 0.4790 - val_accuracy: 0.7739
Epoch 3/20
1664000/1664000 - 26s - loss: 0.4708 - accuracy: 0.7875 - val_loss: 0.4627 - val_accuracy: 0.7929
Epoch 4/20
1664000/1664000 - 27s - loss: 0.4527 - accuracy: 0.7918 - val_loss: 0.4369 - val_accuracy: 0.7968
Epoch 5/20
1664000/1664000 - 26s - loss: 0.4416 - accuracy: 0.7943 - val_loss: 0.4223 - val_accuracy: 0.8041
Epoch 6/20
1664000/1664000 - 26s - loss: 0.4329 - accuracy: 0.7960 - val_loss: 0.4820 - val_accuracy: 0.7583
Epoch 7/20
1664000/1664000 - 26s - loss: 0.4256 - accuracy: 0.7978 - val_loss: 0.4173 - val_accuracy: 0.7958
Epoch 8/20
1664000/1664000 - 26s - loss: 0.4127 - accuracy: 0.8001 - val_loss: 0.4005 - val_accuracy: 0.8026
Epoch 9/20
1664000/1664000 - 26s - loss: 0.4085 - accuracy: 0.8006 - val_lo

In [11]:
print(model1.evaluate(x_test1, y_test1))

[0.35193492987201763, 0.81815386]


In [12]:
a = model1.predict_classes(x_test1[:100])
b = y_test1[:100]

for idx, val in enumerate(a):
    if b[idx][val] == 0:
        print(x_test1[idx], a[idx],b[idx])

[ 0.29990629 -0.33495956 -0.13679329  0.66438622  0.6958575 ] 0 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[-0.82474231 -0.15676661 -0.17151853  0.09877017 -0.28130324] 9 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[-0.82474231 -0.15676661 -0.17023241  0.59846856 -0.72546721] 3 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[-0.82474231 -0.15676661 -0.17151853  1.05563942  0.6958575 ] 9 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[-0.82474231 -0.15676661 -0.17023241 -0.08409818 -0.54780162] 0 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[-0.82474231 -0.15676661 -0.16958935 -0.21168074  0.34052632] 0 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[-0.82474231 -0.15676661 -0.17072366  0.24974287 -0.9031328 ] 0 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[-0.82474231 -0.2166909  -0.17087547 -1.00481903 -0.54780162] 8 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[-0.82474231 -0.15676661 -0.17151853 -1.46836902  0.25169353] 9 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0.29990629 -0.38753542 -0.13679329 -1.28975343 -0.8143    ] 7 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[ 1.17463298 -0.34097006 -0.12393209 -1.

In [13]:
model1_2 = build_model2(5,10)
history1_2 = model1_2.fit(x_train1, y_train1, epochs=10, validation_split=0.2, batch_size=128, verbose=2)

Train on 1664000 samples, validate on 416000 samples
Epoch 1/10
1664000/1664000 - 15s - loss: 1.4354 - accuracy: 0.4673 - val_loss: 1.1788 - val_accuracy: 0.5643
Epoch 2/10
1664000/1664000 - 14s - loss: 1.0991 - accuracy: 0.6031 - val_loss: 1.0573 - val_accuracy: 0.6286
Epoch 3/10
1664000/1664000 - 14s - loss: 1.0345 - accuracy: 0.6335 - val_loss: 1.0144 - val_accuracy: 0.6443
Epoch 4/10
1664000/1664000 - 14s - loss: 0.9969 - accuracy: 0.6462 - val_loss: 0.9842 - val_accuracy: 0.6501
Epoch 5/10
1664000/1664000 - 14s - loss: 0.9660 - accuracy: 0.6513 - val_loss: 0.9557 - val_accuracy: 0.6552
Epoch 6/10
1664000/1664000 - 14s - loss: 0.9427 - accuracy: 0.6554 - val_loss: 0.9376 - val_accuracy: 0.6523
Epoch 7/10
1664000/1664000 - 14s - loss: 0.9265 - accuracy: 0.6586 - val_loss: 0.9192 - val_accuracy: 0.6683
Epoch 8/10
1664000/1664000 - 14s - loss: 0.9155 - accuracy: 0.6626 - val_loss: 0.9131 - val_accuracy: 0.6582
Epoch 9/10
1664000/1664000 - 15s - loss: 0.9063 - accuracy: 0.6673 - val_lo

In [14]:
model2 = build_model(4,7)
history2 = model2.fit(x_train2, y_train2, epochs=15, validation_split=0.2, batch_size=128, verbose=2)

Train on 322560 samples, validate on 80640 samples
Epoch 1/15
322560/322560 - 5s - loss: 0.2283 - accuracy: 0.9166 - val_loss: 0.1406 - val_accuracy: 0.9609
Epoch 2/15
322560/322560 - 5s - loss: 0.1545 - accuracy: 0.9473 - val_loss: 0.1396 - val_accuracy: 0.9550
Epoch 3/15
322560/322560 - 5s - loss: 0.1470 - accuracy: 0.9496 - val_loss: 0.1336 - val_accuracy: 0.9559
Epoch 4/15
322560/322560 - 6s - loss: 0.1419 - accuracy: 0.9531 - val_loss: 0.1147 - val_accuracy: 0.9670
Epoch 5/15
322560/322560 - 5s - loss: 0.1407 - accuracy: 0.9531 - val_loss: 0.1155 - val_accuracy: 0.9677
Epoch 6/15
322560/322560 - 5s - loss: 0.1409 - accuracy: 0.9534 - val_loss: 0.1202 - val_accuracy: 0.9645
Epoch 7/15
322560/322560 - 5s - loss: 0.1370 - accuracy: 0.9548 - val_loss: 0.1245 - val_accuracy: 0.9624
Epoch 8/15
322560/322560 - 5s - loss: 0.1323 - accuracy: 0.9570 - val_loss: 0.1320 - val_accuracy: 0.9601
Epoch 9/15
322560/322560 - 5s - loss: 0.1323 - accuracy: 0.9573 - val_loss: 0.1197 - val_accuracy: 0.

In [15]:
model2_2 = build_model2(4,7)
history2_2 = model2_2.fit(x_train2, y_train2, epochs=15, validation_split=0.2, batch_size=128, verbose=2)

Train on 322560 samples, validate on 80640 samples
Epoch 1/15
322560/322560 - 3s - loss: 1.2289 - accuracy: 0.5297 - val_loss: 0.8262 - val_accuracy: 0.6320
Epoch 2/15
322560/322560 - 3s - loss: 0.6525 - accuracy: 0.7405 - val_loss: 0.5253 - val_accuracy: 0.8259
Epoch 3/15
322560/322560 - 3s - loss: 0.4474 - accuracy: 0.8823 - val_loss: 0.3788 - val_accuracy: 0.9137
Epoch 4/15
322560/322560 - 3s - loss: 0.3311 - accuracy: 0.9189 - val_loss: 0.2885 - val_accuracy: 0.9445
Epoch 5/15
322560/322560 - 3s - loss: 0.2608 - accuracy: 0.9423 - val_loss: 0.2367 - val_accuracy: 0.9439
Epoch 6/15
322560/322560 - 3s - loss: 0.2218 - accuracy: 0.9452 - val_loss: 0.2073 - val_accuracy: 0.9492
Epoch 7/15
322560/322560 - 3s - loss: 0.1993 - accuracy: 0.9480 - val_loss: 0.1920 - val_accuracy: 0.9602
Epoch 8/15
322560/322560 - 3s - loss: 0.1847 - accuracy: 0.9498 - val_loss: 0.1808 - val_accuracy: 0.9608
Epoch 9/15
322560/322560 - 3s - loss: 0.1750 - accuracy: 0.9510 - val_loss: 0.1706 - val_accuracy: 0.

## test

In [16]:
print(model1.evaluate(x_test1, y_test1))
print(model1_2.evaluate(x_test1, y_test1))
# [0.09895923781607832, 0.85723215]

[0.35193492987201763, 0.81815386]
[0.895455308165917, 0.67021346]


In [17]:
print(model2.evaluate(x_test2, y_test2))
print(model2_2.evaluate(x_test2, y_test2))

[0.10592036699639663, 0.9703274]
[0.14590415938328655, 0.95996034]


In [18]:
#Only save the models if they need to be updated. Be sure to also save the new test-train data

#save_model(model1, "model1.h5")
#save_model(model2, "model2.h5")
#save_model(model1_2, "model1_2.h5")
#save_model(model2_2, "model2_2.h5")

model successfully saved | file_location: C:\Users\archg\school\senoir\xai-senior-design\models\model1.h5
model successfully saved | file_location: C:\Users\archg\school\senoir\xai-senior-design\models\model2.h5
model successfully saved | file_location: C:\Users\archg\school\senoir\xai-senior-design\models\model1_2.h5
model successfully saved | file_location: C:\Users\archg\school\senoir\xai-senior-design\models\model2_2.h5
