In [2]:
import pandas as pd
import numpy as np

import lime
import lime.lime_tabular

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [3]:
class Network(nn.Module):
    def __init__(self, in_dim, out_dim=2):
        super().__init__()
        
        self.hidden1 = nn.Linear(in_dim, 32)
        self.hidden2 = nn.Linear(32, 16)
        self.output = nn.Linear(16, out_dim)
        
        self.activation = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        # Pass the input tensor through each of our operations
        x = self.hidden1(x)
        x = self.activation(x)
        x = self.hidden2(x)
        x = self.activation(x)
        x = self.output(x)
        #print(x.squeeze().shape)
        #x = self.softmax(x)
        
        return x

In [4]:
class FakeNewsDataset(Dataset):
    def __init__(self, df):
        self.data = df.drop(columns=['target'])
        self.targets = df['target'].astype(int)
    
    def __getitem__(self, i):
        x = torch.tensor(self.data.iloc[i]).float()
        y = torch.tensor(self.targets.iloc[i]).long()
        return x, y
    
    def __len__(self):
        return len(self.data)
    

In [22]:
train_df = pd.read_csv('embeddings/inference_train_df.csv')
trn_df = torch.from_numpy(train_df.drop(columns='target').to_numpy())
test_df = pd.read_csv('embeddings/inference_test_df.csv')
tst_df = torch.from_numpy(test_df.to_numpy())

In [23]:
input_dim = 205
model_path = "models/fake-news-classifier.pt" 

model = Network(input_dim, 2)
model.load_state_dict(torch.load(model_path))

#train_dataset = FakeNewsDataset(train_df)
#val_dataset = FakeNewsDataset(test_df)

<All keys matched successfully>

In [69]:
# The prediction function needs to work on 
# multiple feature vectors (the vectors randomly perturbed from the data_row)
def preprocess(x):
    return torch.tensor(x).float()
def predict(instance):
    model.eval()
    print(instance.shape)
    batch = torch.stack(tuple(preprocess(i) for i in instance), dim=0)
    logits = model(batch)
    #probs = F.softmax(logits, dim=1)
    return logits.detach().numpy()

In [70]:
explainer = lime.lime_tabular.LimeTabularExplainer(train_df,
                                                   mode='classification',
                                                   feature_names=train_df.drop(columns='target').columns, 
                                                   class_names=['real','fake'], 
                                                   discretize_continuous=False)

In [71]:
y = np.array(test_df.drop(columns=['target']).iloc[i])
predict(y)

(205,)


array([-5.6374664,  5.0877633], dtype=float32)

In [72]:
n_feat = len(test_df.drop(columns=['target']).columns)
n_samp = 1

In [73]:
n_feat

205

In [74]:
i = np.random.randint(0, len(test_df))
y = np.array(test_df.drop(columns=['target']).iloc[i])
exp = explainer.explain_instance(y, 
                                 predict,
                                 num_features=n_feat,
                                 top_labels=1)

ValueError: operands could not be broadcast together with shapes (5000,205) (206,) 

NOTES:
- use lightGBM model and use shap directly within lightGBM [intro](https://www.kaggle.com/hmendonca/lightgbm-predictions-explained-with-shap-0-796) 
- densenet in scikitlearn
- try a bunch of different classifiers (scikitlearn - you could even try that autoML library)