In [1]:
#https://christophm.github.io/interpretable-ml-book/feature-importance.html

"""
Measuring feature importance for neural networks is difficult.
Nearly all methods are approximations. With linear regression you get one
coefficient per variable, but for a neural network the relation is much more complex.
sklearn's MLPClassifier exposes its coefficients (the weights of the neurons), but these weights are
not very meaningful because, as we go deeper into a neural network, we apply many nonlinear transformations.
So the exact relationship between an input feature and the output is much more complex.

One simple heuristic approximation for this problem is to check our features one at a time
to see how much each of them changes the prediction accuracy.
The idea is described at: https://christophm.github.io/interpretable-ml-book/feature-importance.html

1) Train a model.
2) Calculate the accuracy on the test set.
3) Take the columns one by one, shuffle the values of that column, and recalculate the accuracy.
4) Compare the change in accuracy to see which features are important.

Because the shuffling is random, this method is not perfect, but it will give you an idea.


Also, to make things more interesting, I added a completely random column to check how the model reacts to that variable.
"""

""

''

In [2]:
import pandas as pd 
# Neural Network
import keras 
from keras.models import Sequential 
from keras.layers import Dense
import numpy as np
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import random
# Seed every random source this notebook draws from directly, so that a fresh
# "Restart & Run All" reproduces the same "random" feature and the same shuffles.
# NOTE(review): Keras weight initialisation is not seeded here, so the trained
# model (and its accuracy) can still differ between runs.
random.seed(12)     # drives random.shuffle in the column-shuffling step
np.random.seed(12)  # drives np.random.random, which fills the "random" feature

titanic_url = "https://raw.githubusercontent.com/mcelikkaya/medium_articles4/main/titanic.csv"

# Load the raw data and drop the two columns this notebook does not use.
df_orig = pd.read_csv(titanic_url).drop(labels=['Age', "Cabin"], axis=1)

# Do some simple preprocessing; for this problem I want to keep everything simple
def get_df_labelencode(df_original):
  """Return an encoded and scaled copy of ``df_original`` plus the fitted transformers.

  The 'Name', 'Ticket', 'SibSp' and 'Parch' columns are dropped, 'Sex' and
  'Embarked' are label-encoded, and 'Fare' is min-max scaled. The input frame
  is not modified, because ``drop`` returns a new frame.

  Args:
    df_original: DataFrame containing at least the columns named above.

  Returns:
    A ``(df, label_procecssor)`` tuple: the transformed frame, and a dict that
    maps each transformed column name to the LabelEncoder / MinMaxScaler
    instance that was fitted on it.
  """
  encode_labels = ['Sex', 'Embarked']
  drop_labels = ['Name', 'Ticket', 'SibSp', 'Parch']
  minmax_labels = ['Fare']

  # Work from the argument rather than the notebook-global df_orig, so the
  # function transforms whichever frame the caller passes in.
  df = df_original.drop(labels=drop_labels, axis=1)

  label_procecssor = {}
  for encode_label in encode_labels:
    # NOTE(review): missing values (e.g. in 'Embarked') are handed to the
    # encoder unchanged - confirm that this is the intended treatment.
    le = LabelEncoder()
    df[encode_label] = le.fit_transform(df[encode_label].values)
    label_procecssor[encode_label] = le
  for minmax_label in minmax_labels:
    mm = MinMaxScaler()
    # The scaler is fed a 2-D (n, 1) array; flatten its result back to one column.
    df[minmax_label] = mm.fit_transform(df[minmax_label].values.reshape(-1, 1)).ravel()
    label_procecssor[minmax_label] = mm
  return df, label_procecssor

def load_df():
  """Return the preprocessed frame built from the notebook-level ``df_orig``.

  The fitted transformers returned alongside the frame are discarded.
  """
  encoded_frame = get_df_labelencode(df_orig)[0]
  return encoded_frame

def get_train_test():
  """Build the feature matrix and target, then split them 80/20.

  A column named "random", filled with uniform noise from ``np.random.random``,
  is appended to the preprocessed frame before the split. 'Survived' is the
  target; 'Survived' and 'PassengerId' are excluded from the features.

  Returns:
    ``(X_train, X_test, y_train, y_test)`` as produced by ``train_test_split``
    with ``test_size=0.2`` and ``random_state=42``.
  """
  frame = load_df()
  frame["random"] = np.random.random(len(frame))

  target = frame['Survived'].values
  features = frame.drop(['Survived','PassengerId'], axis=1).values

  parts = train_test_split(features, target, test_size=0.2, random_state=42)
  X_train, X_test, y_train, y_test = parts
  return X_train, X_test, y_train, y_test

def get_model(inputdim):
  """Create, summarise and compile the binary classifier.

  The network has three ReLU hidden layers (80, 50 and 10 units) followed by a
  single sigmoid output unit. Its summary is printed as a side effect.

  Args:
    inputdim: number of input features fed to the first layer.

  Returns:
    The Sequential model, compiled with the adam optimizer, binary
    cross-entropy loss and the accuracy metric.
  """
  first_width, *remaining_widths = [80, 50, 10]

  net = Sequential()
  # Only the first hidden layer needs to be told the input size.
  net.add(Dense(first_width, activation='relu', input_dim=inputdim))
  for width in remaining_widths:
    net.add(Dense(width, activation='relu'))
  # One sigmoid unit for the binary output.
  net.add(Dense(1, activation='sigmoid'))

  net.summary()
  net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return net

In [3]:
# Preview the first five rows of the raw frame ('Age' and 'Cabin' were dropped when it was loaded).
df_orig.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.05,S


In [4]:
# List the columns that remain in the raw frame.
df_orig.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [5]:
# Count missing values per column.
df_orig.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [6]:
# Build the train/test split and report how many input features there are
X_train, X_test, y_train, y_test = get_train_test()
print(X_train.shape[1])
# Size the network's input from the feature count
model = get_model(inputdim=X_train.shape[1])
# Fit the network on the training split
model.fit(x=X_train, y=y_train, epochs=100, batch_size=64)

5
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                480       
                                                                 
 dense_1 (Dense)             (None, 50)                4050      
                                                                 
 dense_2 (Dense)             (None, 10)                510       
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
                                                                 
Total params: 5,051
Trainable params: 5,051
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/10

<keras.callbacks.History at 0x7f34e8abebb0>

In [7]:
# Baseline: accuracy on the untouched test set

# Round the sigmoid outputs to hard 0/1 labels before scoring
pred_binary = model.predict(X_test).round()
print(accuracy_score(y_true=y_test, y_pred=pred_binary))

0.7988826815642458


In [8]:
# Names for the columns of the feature matrix. The order has to match the
# column order produced in get_train_test ('Survived' and 'PassengerId'
# removed, "random" appended last), because X_test itself carries no names.
feature_columns = ['Pclass', 'Sex', 'Fare', 'Embarked' ,"random"]

In [9]:
# Wrap the held-out feature matrix in a DataFrame so that single columns can be
# shuffled by name, then preview the first five rows.
df_test_orig = pd.DataFrame(X_test, columns=feature_columns) #
df_test_orig.head(5)

Unnamed: 0,Pclass,Sex,Fare,Embarked,random
0,3.0,1.0,0.029758,0.0,0.775466
1,2.0,1.0,0.020495,2.0,0.721471
2,3.0,1.0,0.015469,2.0,0.994483
3,2.0,0.0,0.064412,2.0,0.068342
4,3.0,0.0,0.021942,0.0,0.193667


In [10]:
# Copy the original data and shuffle the values in one column
def get_df_for_column(df_test_orig,col_name):
  """Return a copy of ``df_test_orig`` in which ``col_name`` is shuffled.

  Every other column, and the input frame itself, is left unchanged. The
  shuffle uses the standard-library ``random`` module, so it follows
  ``random.seed``.

  Args:
    df_test_orig: DataFrame to copy.
    col_name: name of the column whose values are permuted.

  Returns:
    A new DataFrame with the same columns as the input.
  """
  df_test = df_test_orig.copy()
  # random.shuffle works in place, so give it its own writable copy: the array
  # returned by `.values` can be a view of the frame's data and is read-only
  # when pandas Copy-on-Write is active.
  arr = df_test[col_name].values.copy()
  random.shuffle(arr)
  df_test[col_name] = arr
  return df_test

# Score the model on the test data after one column has been shuffled
def predict_for_colum(df_test_orig,col_name):
  """Print the test accuracy obtained when ``col_name`` is shuffled.

  Uses the notebook-level ``model`` and ``y_test``. Nothing is returned; the
  column name and the accuracy are printed on one line.

  Args:
    df_test_orig: DataFrame holding the unshuffled test features.
    col_name: name of the column to shuffle before predicting.
  """
  shuffled_inputs = get_df_for_column(df_test_orig, col_name).values
  shuffled_labels = np.round(model.predict(shuffled_inputs))
  score = accuracy_score(y_test, shuffled_labels)
  print(col_name, score)


In [11]:
# Shuffle one feature at a time and print the resulting test accuracy.
# Compare each value with the unshuffled test accuracy printed earlier.
for col in feature_columns:
  predict_for_colum(df_test_orig,col)


Pclass 0.7039106145251397
Sex 0.6201117318435754
Fare 0.7988826815642458
Embarked 0.770949720670391
random 0.7988826815642458
