# XAI Neural Net Creation

## setup

In [36]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
import os
import sys
currDir = os.path.dirname(os.path.realpath("__file__"))
rootDir = os.path.abspath(os.path.join(currDir, '..'))
sys.path.insert(1, rootDir)

In [38]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import RMSprop,Adam
from tensorflow.keras.utils import to_categorical

from src.data.make_dataset import make_dataset_1, make_dataset_2
from src.utils.file_utils import save_df, save_model

## generate artificial dataset

In [39]:
df1 = make_dataset_1()
df2 = make_dataset_2()
save_df(df1, "dataset1.csv")
save_df(df2, "dataset2.csv")

successfully generated dataset | num_rows: 2600000
successfully generated dataset | num_rows: 504000
df successfully saved | filename: dataset1.csv, dir: C:\Users\archg\school\senoir\xai-senior-design\data
df successfully saved | filename: dataset2.csv, dir: C:\Users\archg\school\senoir\xai-senior-design\data


In [40]:
df1.describe()

Unnamed: 0,ei,to,td,tf,vers
count,2600000.0,2600000.0,2600000.0,2600000.0,2600000.0
mean,1.854442,267.7224,71.555,12.16666,4.5
std,3.186181,1555.065,47.02838,11.25711,2.872282
min,0.347982,1.0,0.1,1.0,0.0
25%,0.933,3.0,34.3,3.968246,2.0
50%,1.354955,5.0,68.5,9.0,4.5
75%,1.354955,55.0,102.7,17.0,7.0
max,23.17735,14400.0,260.0,60.0,9.0


In [41]:
df2.describe()

Unnamed: 0,speed,fe,tt,vers
count,504000.0,504000.0,504000.0,504000.0
mean,65.260247,0.952462,729.525361,3.0
std,192.80363,0.308689,2322.622952,2.000002
min,2.764055,0.347982,0.0,0.0
25%,7.64,0.7872,17.1675,1.0
50%,14.335,0.887243,58.465,3.0
75%,34.13,1.21,99.9925,5.0
max,1164.8569,1.8898,14397.6001,6.0


In [42]:
df1.head()

Unnamed: 0,mode,ei,to,td,tf,vers
0,Taxi/Shuttle/Private Transit,1.21,3.0,0.1,1.0,0
1,Taxi/Shuttle/Private Transit,1.21,3.0,0.2,1.0,0
2,Taxi/Shuttle/Private Transit,1.21,3.0,0.3,1.0,0
3,Taxi/Shuttle/Private Transit,1.21,3.0,0.4,1.0,0
4,Taxi/Shuttle/Private Transit,1.21,3.0,0.5,1.0,0


In [43]:
df2.head()

Unnamed: 0,mode,speed,fe,tt,vers
0,Bus,7.64,0.7872,0.0,0
1,Bus,7.64,0.7872,0.01,0
2,Bus,7.64,0.7872,0.02,0
3,Bus,7.64,0.7872,0.03,0
4,Bus,7.64,0.7872,0.04,0


In [44]:
from src.utils.file_utils import load_df
df1, df2 = load_df("dataset1.csv"), load_df("dataset2.csv")

## create neural nets

In [45]:
# def build_model():    
#     model = Sequential()
#     model.add(Dense(6, input_dim=5, activation="relu"))#hidden layer
#     model.add(Dense(10, activation="relu"))#hidden layer
#     model.add(Dense(1, activation='sigmoid'))#output layer

#     optimizer = RMSprop(0.001)
#     model.compile(loss='mse', optimizer=optimizer, metrics=['mse', 'mae', 'mape'])
#     return model

def build_model(num_features, num_classes):
    model = Sequential()
    model.add(Dense(256, input_shape=(num_features,), activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', # Cross-entropy
                optimizer='adam', metrics=['accuracy'])
    return model

def build_model2(num_features, num_classes):
    model = Sequential()
    model.add(Dense(5, input_shape=(num_features,), activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', # Cross-entropy
                optimizer='adam', metrics=['accuracy'])
    return model

## train

In [82]:
def prepare_df(df, y_column, categorical_feats):
    temp_df = df.copy()
    temp_df.sample(frac=1)
    
    y = to_categorical(
        temp_df[y_column].values)
    temp_df.drop(y_column, axis=1, inplace=True)
    features = list(temp_df.columns)
    
    numeric_col = temp_df[temp_df.columns.difference(categorical_feats)]
    scaler = StandardScaler()
    temp_df[numeric_col.columns] = scaler.fit_transform(numeric_col)
    u = scaler.mean_
    s = scaler.scale_
    
    categorical_names = {}
    for feat in categorical_feats:
        le = LabelEncoder()
        le.fit(temp_df[feat])
        temp_df[feat] = le.transform(temp_df[feat])
        categorical_names[feat] = le.classes_
    x = temp_df.values
    return x,y,features,categorical_names,u,s

categorical_feat1 = ['mode']
x1, y1, features1,cat_feat_names1,u1,s1 = prepare_df(df1, "vers", categorical_feat1)
x_train1, x_test1, y_train1, y_test1 = train_test_split(x1, y1, test_size=0.20)
norm_train1, norm_test1 = pd.DataFrame(x_train1, columns=features1), pd.DataFrame(x_test1, columns=features1)


#Only save the train-test if they need to be updated. Be sure to also train the NN and save

save_df(norm_train1, "norm_train1.csv")
save_df(norm_test1, "norm_test1.csv")

categorical_feat2 = ['mode']
x2, y2, features2,cat_feat_names2,u2,s2 = prepare_df(df2, "vers", categorical_feat2)
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2, test_size=0.20)
norm_train2, norm_test2 = pd.DataFrame(x_train2, columns=features2), pd.DataFrame(x_test2, columns=features2)
save_df(norm_train2, "norm_train2.csv")
save_df(norm_test2, "norm_test2.csv")

df successfully saved | filename: norm_train1.csv, dir: C:\Users\archg\school\senoir\xai-senior-design\data
df successfully saved | filename: norm_test1.csv, dir: C:\Users\archg\school\senoir\xai-senior-design\data
df successfully saved | filename: norm_train2.csv, dir: C:\Users\archg\school\senoir\xai-senior-design\data
df successfully saved | filename: norm_test2.csv, dir: C:\Users\archg\school\senoir\xai-senior-design\data


Unnamed: 0,mode,ei,to
2575025,0,12,4
2234711,0,12,6
1744234,0,14,3
1115583,3,8,22
1023922,4,16,8


In [90]:
print(features1)
print("means1:")
print(u1)
print("std1:")
print(s1)

print(features2)
print("means2:")
print(u2)
print("std2:")
print(s2)

['mode', 'ei', 'to', 'td', 'tf']
means1:
[  1.85444168  71.555       12.16665978 267.72244306]
std1:
[   3.18618066   47.02836883   11.25710397 1555.06491616]
['mode', 'speed', 'fe', 'tt']
means2:
[  0.95246209  65.2602466  729.52536085]
std2:
[3.08688256e-01 1.92803439e+02 2.32262065e+03]


In [92]:
model1 = build_model(5,10)
history1 = model1.fit(x_train1, y_train1, epochs=20, validation_split=0.2, batch_size=128, verbose=2)

Train on 1664000 samples, validate on 416000 samples
Epoch 1/20
1664000/1664000 - 28s - loss: 0.6197 - accuracy: 0.7496 - val_loss: 0.4986 - val_accuracy: 0.7964
Epoch 2/20
1664000/1664000 - 30s - loss: 0.4991 - accuracy: 0.7837 - val_loss: 0.4715 - val_accuracy: 0.7821
Epoch 3/20
1664000/1664000 - 28s - loss: 0.4600 - accuracy: 0.7919 - val_loss: 0.4236 - val_accuracy: 0.8092
Epoch 4/20
1664000/1664000 - 28s - loss: 0.4447 - accuracy: 0.7951 - val_loss: 0.4137 - val_accuracy: 0.8049
Epoch 5/20
1664000/1664000 - 28s - loss: 0.4278 - accuracy: 0.7995 - val_loss: 0.3923 - val_accuracy: 0.8103
Epoch 6/20
1664000/1664000 - 28s - loss: 0.4200 - accuracy: 0.8008 - val_loss: 0.4386 - val_accuracy: 0.7866
Epoch 7/20
1664000/1664000 - 27s - loss: 0.4098 - accuracy: 0.8028 - val_loss: 0.3888 - val_accuracy: 0.8104
Epoch 8/20
1664000/1664000 - 27s - loss: 0.4034 - accuracy: 0.8040 - val_loss: 0.3852 - val_accuracy: 0.8074
Epoch 9/20
1664000/1664000 - 28s - loss: 0.4007 - accuracy: 0.8049 - val_lo

In [93]:
print(model1.evaluate(x_test1, y_test1))

[0.4288094108260595, 0.7990577]


In [None]:
a = model1.predict_classes(x_test1[:100])
b = y_test1[:100]

for idx, val in enumerate(a):
    if b[idx][val] == 0:
        print(x_test1[idx], a[idx],b[idx])

In [94]:
model1_2 = build_model2(5,10)
history1_2 = model1_2.fit(x_train1, y_train1, epochs=10, validation_split=0.2, batch_size=128, verbose=2)

Train on 1664000 samples, validate on 416000 samples
Epoch 1/10
1664000/1664000 - 15s - loss: 1.3625 - accuracy: 0.4813 - val_loss: 1.1325 - val_accuracy: 0.5516
Epoch 2/10
1664000/1664000 - 14s - loss: 1.0856 - accuracy: 0.5844 - val_loss: 1.0350 - val_accuracy: 0.6205
Epoch 3/10
1664000/1664000 - 15s - loss: 0.9988 - accuracy: 0.6337 - val_loss: 0.9659 - val_accuracy: 0.6367
Epoch 4/10
1664000/1664000 - 15s - loss: 0.9469 - accuracy: 0.6363 - val_loss: 0.9264 - val_accuracy: 0.6456
Epoch 5/10
1664000/1664000 - 15s - loss: 0.9175 - accuracy: 0.6386 - val_loss: 0.9047 - val_accuracy: 0.6437
Epoch 6/10
1664000/1664000 - 15s - loss: 0.9022 - accuracy: 0.6421 - val_loss: 0.8953 - val_accuracy: 0.6341
Epoch 7/10
1664000/1664000 - 15s - loss: 0.8917 - accuracy: 0.6450 - val_loss: 0.8887 - val_accuracy: 0.6494
Epoch 8/10
1664000/1664000 - 15s - loss: 0.8834 - accuracy: 0.6465 - val_loss: 0.8810 - val_accuracy: 0.6390
Epoch 9/10
1664000/1664000 - 15s - loss: 0.8770 - accuracy: 0.6480 - val_lo

In [95]:
model2 = build_model(4,7)
history2 = model2.fit(x_train2, y_train2, epochs=15, validation_split=0.2, batch_size=128, verbose=2)

Train on 322560 samples, validate on 80640 samples
Epoch 1/15
322560/322560 - 6s - loss: 0.3026 - accuracy: 0.8870 - val_loss: 0.1945 - val_accuracy: 0.9385
Epoch 2/15
322560/322560 - 5s - loss: 0.1613 - accuracy: 0.9453 - val_loss: 0.1291 - val_accuracy: 0.9604
Epoch 3/15
322560/322560 - 5s - loss: 0.1605 - accuracy: 0.9448 - val_loss: 0.1247 - val_accuracy: 0.9654
Epoch 4/15
322560/322560 - 5s - loss: 0.1525 - accuracy: 0.9480 - val_loss: 0.2357 - val_accuracy: 0.8979
Epoch 5/15
322560/322560 - 5s - loss: 0.1560 - accuracy: 0.9455 - val_loss: 0.1313 - val_accuracy: 0.9642
Epoch 6/15
322560/322560 - 5s - loss: 0.1479 - accuracy: 0.9504 - val_loss: 0.1605 - val_accuracy: 0.9404
Epoch 7/15
322560/322560 - 5s - loss: 0.1499 - accuracy: 0.9491 - val_loss: 0.1167 - val_accuracy: 0.9689
Epoch 8/15
322560/322560 - 6s - loss: 0.1497 - accuracy: 0.9487 - val_loss: 0.1406 - val_accuracy: 0.9523
Epoch 9/15
322560/322560 - 5s - loss: 0.1403 - accuracy: 0.9535 - val_loss: 0.1121 - val_accuracy: 0.

In [96]:
model2_2 = build_model2(4,7)
history2_2 = model2_2.fit(x_train2, y_train2, epochs=15, validation_split=0.2, batch_size=128, verbose=2)

Train on 322560 samples, validate on 80640 samples
Epoch 1/15
322560/322560 - 3s - loss: 1.2420 - accuracy: 0.5050 - val_loss: 0.8720 - val_accuracy: 0.5995
Epoch 2/15
322560/322560 - 3s - loss: 0.7975 - accuracy: 0.6362 - val_loss: 0.7393 - val_accuracy: 0.6536
Epoch 3/15
322560/322560 - 3s - loss: 0.7021 - accuracy: 0.6830 - val_loss: 0.6617 - val_accuracy: 0.7052
Epoch 4/15
322560/322560 - 3s - loss: 0.6412 - accuracy: 0.7148 - val_loss: 0.6181 - val_accuracy: 0.7163
Epoch 5/15
322560/322560 - 3s - loss: 0.6049 - accuracy: 0.7289 - val_loss: 0.5894 - val_accuracy: 0.7298
Epoch 6/15
322560/322560 - 3s - loss: 0.5819 - accuracy: 0.7411 - val_loss: 0.5689 - val_accuracy: 0.7748
Epoch 7/15
322560/322560 - 3s - loss: 0.5661 - accuracy: 0.7614 - val_loss: 0.5562 - val_accuracy: 0.7716
Epoch 8/15
322560/322560 - 3s - loss: 0.5546 - accuracy: 0.7707 - val_loss: 0.5368 - val_accuracy: 0.7726
Epoch 9/15
322560/322560 - 3s - loss: 0.5125 - accuracy: 0.7862 - val_loss: 0.4824 - val_accuracy: 0.

## test

In [97]:
print(model1.evaluate(x_test1, y_test1))
print(model1_2.evaluate(x_test1, y_test1))
# [0.09895923781607832, 0.85723215]

[0.4288094108260595, 0.7990577]
[0.8683844898480635, 0.65446925]


In [98]:
print(model2.evaluate(x_test2, y_test2))
print(model2_2.evaluate(x_test2, y_test2))

[0.11817697667132412, 0.96347225]
[0.28554318231367876, 0.92499006]


In [99]:
#Only save the models if they need to be updated. Be sure to also save the new test-train data

save_model(model1, "model1.h5")
save_model(model2, "model2.h5")
save_model(model1_2, "model1_2.h5")
save_model(model2_2, "model2_2.h5")

model successfully saved | file_location: C:\Users\archg\school\senoir\xai-senior-design\models\model1.h5
model successfully saved | file_location: C:\Users\archg\school\senoir\xai-senior-design\models\model2.h5
model successfully saved | file_location: C:\Users\archg\school\senoir\xai-senior-design\models\model1_2.h5
model successfully saved | file_location: C:\Users\archg\school\senoir\xai-senior-design\models\model2_2.h5


In [100]:
print(cat_feat_names1)
print(cat_feat_names2)

{'mode': array(['Private Vehicle', 'Taxi/Shuttle/Private Transit',
       'bus/amtrak bus/transit bus', 'rail/trolly/pastco', 'school bus',
       'subway/train/pastco'], dtype=object)}
{'mode': array(['Auto', 'Bus', 'Commuter Rail', 'Ferry', 'Subway', 'Transfer Bus'],
      dtype=object)}
