# Preprocessing

In [29]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
#data = pd.read_csv('combined_resample.csv')
data = pd.read_csv('mbti_1.csv')
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


## Preprocessing for CNN/ BERT

In [6]:
df_raw = data.copy()
df_raw.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [7]:
import texthero as hero
custom_pipeline = [hero.preprocessing.fillna,
                   hero.preprocessing.lowercase,
                   hero.preprocessing.remove_whitespace,
                   hero.preprocessing.remove_urls,
                   hero.preprocessing.remove_digits,
                   hero.preprocessing.remove_punctuation,
                   hero.preprocessing.remove_diacritics
                   ]

df_raw['clean_posts_nn'] = df_raw['posts'].pipe(hero.clean, custom_pipeline)

In [8]:
##### Please use column 'clean_posts_nn' as your input variable
df_raw.head()

Unnamed: 0,type,posts,clean_posts_nn
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,and intj moments sportscenter not top ten...
1,ENTP,'I'm finding the lack of me in these posts ver...,i m finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...,good one course to which i say i know t...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear intp i enjoyed our conversation the oth...
4,ENTJ,'You're fired.|||That's another silly misconce...,you re fired that s another silly misconcepti...


## Preprocessing for TF-IDF

In [16]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [24]:
def clear_text(data):
    data_length=[]
    lemmatizer=WordNetLemmatizer()
    stemmer = PorterStemmer()
    cachedStopWords = stopwords.words("english")
    cleaned_text=[]
    for sentence in data.clean_posts_nn:
        sentence=sentence.lower()
        
# #         removing links from text data
#         sentence=re.sub('https?://[^\s<>"]+|www\.[^\s<>"]+',' ',sentence)
    
# #         removing other symbols Keep only words
#         sentence=re.sub('[^0-9a-z]',' ',sentence)
# #          remove spaces > 1
#         sentence = re.sub(' +', ' ', sentence)
#          Remove stopwords and lematize
        sentence = " ".join([lemmatizer.lemmatize(w) for w in sentence.split(' ') if w not in cachedStopWords])
        
        data_length.append(len(sentence.split()))
        cleaned_text.append(sentence)
    return cleaned_text,data_length

In [18]:
df_pro = df_raw.copy()

In [22]:
df_pro.head()

Unnamed: 0,type,clean_posts_nn
0,INFJ,and intj moments sportscenter not top ten...
1,ENTP,i m finding the lack of me in these posts ver...
2,INTP,good one course to which i say i know t...
3,INTJ,dear intp i enjoyed our conversation the oth...
4,ENTJ,you re fired that s another silly misconcepti...


In [21]:
df_pro = df_pro.drop(columns='posts')

In [25]:
df_pro.clean_posts_nn,train_length=clear_text(df_pro)
df_pro.head()

Unnamed: 0,type,clean_posts_nn
0,INFJ,intj moment sportscenter top ten play p...
1,ENTP,finding lack post alarming sex boring positio...
2,INTP,good one course say know blessing curse...
3,INTJ,dear intp enjoyed conversation day esoteric...
4,ENTJ,fired another silly misconception approachin...


# Model

## BERT

In [None]:
types = pd.unique(df_raw.type)
labels ={}
for i in range(len(types)):
    labels[types[i]] = i
labels

In [8]:
import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['type']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [9]:
from sklearn.model_selection import train_test_split
X, y = df_raw['clean_posts_nn'], df_raw['type']
X_train, X_t, y_train, y_t = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_t, y_t, test_size=0.5, random_state=0, stratify=y_t)

In [10]:
df_train = pd.DataFrame({'text':X_train, 'type':y_train})
df_val = pd.DataFrame({'text':X_val, 'type':y_val})
df_test = pd.DataFrame({'text':X_test, 'type':y_test})

In [12]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 16)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [13]:
from torch.optim import Adam
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=8, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=8)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)
    scheduler = ReduceLROnPlateau(optimizer, 'min')
    
    val_loss_list=[]
    
    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):
                
                train_label = train_label.type(torch.LongTensor)
                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.type(torch.LongTensor)
                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
            val_loss_list.append(total_loss_val / len(val_data))
            if val_loss_list[-1] == np.min(val_loss_list):
                torch.save(model,'bert_best')
    scheduler.step(total_loss_val / len(val_data))

In [14]:
EPOCHS = 30
model = BertClassifier()
LR = 1e-5

#train(model, df_train.iloc[:500,:], df_val.iloc[:500,:], LR, EPOCHS)
train(model, df_train, df_val, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 868/868 [16:08<00:00,  1.12s/it]


Epochs: 1 | Train Loss:  0.270                 | Train Accuracy:  0.330                 | Val Loss:  0.206                 | Val Accuracy:  0.551


100%|██████████| 868/868 [15:57<00:00,  1.10s/it]


Epochs: 2 | Train Loss:  0.181                 | Train Accuracy:  0.600                 | Val Loss:  0.171                 | Val Accuracy:  0.624


100%|██████████| 868/868 [16:17<00:00,  1.13s/it]


Epochs: 3 | Train Loss:  0.132                 | Train Accuracy:  0.727                 | Val Loss:  0.161                 | Val Accuracy:  0.641


100%|██████████| 868/868 [16:57<00:00,  1.17s/it]


Epochs: 4 | Train Loss:  0.093                 | Train Accuracy:  0.815                 | Val Loss:  0.172                 | Val Accuracy:  0.632


100%|██████████| 868/868 [16:40<00:00,  1.15s/it]


Epochs: 5 | Train Loss:  0.062                 | Train Accuracy:  0.887                 | Val Loss:  0.182                 | Val Accuracy:  0.623


100%|██████████| 868/868 [15:59<00:00,  1.11s/it]


Epochs: 6 | Train Loss:  0.041                 | Train Accuracy:  0.929                 | Val Loss:  0.191                 | Val Accuracy:  0.634


100%|██████████| 868/868 [16:08<00:00,  1.12s/it]


Epochs: 7 | Train Loss:  0.026                 | Train Accuracy:  0.958                 | Val Loss:  0.206                 | Val Accuracy:  0.615


 64%|██████▍   | 554/868 [10:23<05:53,  1.13s/it]


KeyboardInterrupt: 

In [44]:
bert = torch.load('bert_best')

In [36]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
outputs=[]
if use_cuda:
    bert = bert.cuda()

with torch.no_grad():
    for i in range(len(df_test.text)):
        text = df_test['text'].iloc[i]
        test_input = tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt")
        mask = test_input['attention_mask'].to(device)
        input_id = test_input['input_ids'].squeeze(1).to(device)

        output = (bert(input_id, mask))
        outputs.append(output.argmax(dim=1).cpu())

In [42]:
types = {}
for i in labels:
    types[labels[i]] = i
y_pred = [types[int(i)] for i in outputs]


In [43]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        ENFJ       0.59      0.68      0.63        19
        ENFP       0.66      0.72      0.69        68
        ENTJ       0.53      0.39      0.45        23
        ENTP       0.59      0.58      0.58        69
        ESFJ       0.00      0.00      0.00         4
        ESFP       0.00      0.00      0.00         5
        ESTJ       0.00      0.00      0.00         4
        ESTP       0.40      0.22      0.29         9
        INFJ       0.60      0.67      0.63       147
        INFP       0.64      0.66      0.65       183
        INTJ       0.69      0.64      0.67       109
        INTP       0.63      0.66      0.64       131
        ISFJ       0.53      0.62      0.57        16
        ISFP       0.58      0.52      0.55        27
        ISTJ       0.71      0.60      0.65        20
        ISTP       0.72      0.62      0.67        34

    accuracy                           0.63       868
   macro avg       0.49   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## CNN

In [11]:
## Import Packages
## Keras is now fully intregrated into Tensorflow. Try to import keras from TF
from tensorflow.keras.models import Model
import numpy as np
from tensorflow.keras.layers import Input, Dense, Embedding, Activation, Flatten, Dropout
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dropout, Concatenate, SimpleRNN,Bidirectional

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.optimizers import Adam

In [12]:
y_train_cnn = np.array(y_train.to_list()).reshape(-1,1)
y_val_cnn = np.array(y_val.to_list()).reshape(-1,1)
y_test_cnn = np.array(y_test.to_list()).reshape(-1,1)
#X_train = X_train.to_list()

In [13]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
encoder.fit(y_train_cnn)
encoded_y_train = encoder.transform(y_train_cnn)
encoded_y_val = encoder.transform(y_val_cnn)
encoded_y_test = encoder.transform(y_test_cnn)
encoded_y_train[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [14]:
encoder.categories_

[array(['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP',
        'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP'],
       dtype='<U4')]

In [15]:
y_train_cnn[0]

array(['ISTP'], dtype='<U4')

In [16]:
vocab_size =  1000
tk = Tokenizer(num_words=vocab_size)
tk.fit_on_texts(X_train)

In [17]:
# Convert string to index
train_sequences = tk.texts_to_sequences(X_train)
val_sequences = tk.texts_to_sequences(X_val)
test_texts = tk.texts_to_sequences(X_test)

In [18]:
sequence_length = max([len(ele) for ele in train_sequences]) 
# Padding
train_data = pad_sequences(train_sequences, maxlen=sequence_length, padding='post')
val_data = pad_sequences(val_sequences, maxlen=sequence_length, padding='post')
test_data = pad_sequences(test_texts, maxlen=sequence_length, padding='post')

In [19]:
print("the input sequence's length is fixed to be {}".format(sequence_length))

the input sequence's length is fixed to be 1794


In [20]:
# Convert to numpy array
train_data = np.array(train_data, dtype='float32')
val_data = np.array(val_data, dtype='float32')
test_data = np.array(test_data, dtype='float32')
train_classes = np.array(encoded_y_train, dtype='int')
val_classes = np.array(encoded_y_val, dtype='int')
test_classes = np.array(encoded_y_test, dtype='int')

In [21]:
embedding_dim = 20
input_shape = (sequence_length,)
model_input = Input(shape=input_shape)
# Embedding Layer
# Here, we randomly initalize the input word embeddings
# The embeddings would be updated as the following layers' parameters
z_cnn = Embedding(vocab_size, embedding_dim, input_length=sequence_length, name="embedding")(model_input)
dropout = Dropout(0.2)(z_cnn)

In [22]:
# Convolutional Layer 
conv_blocks = []
#capture two-grams, 3-grams and 4 grams
filter_sizes = [2,3,4]
#for each filter, the number of filters
num_filters = 10
#loop over the different filter sizes
for sz in filter_sizes:
    # sz is the window size
    conv = Conv1D(filters=num_filters,
                  kernel_size=sz,
                  padding="valid",
                  activation="relu",
                  strides=1)(dropout)
    # Pooling Layer
    conv = GlobalMaxPooling1D()(conv)
    conv_blocks.append(conv)
# Fully-connected Layer
hiddenz = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

In [23]:
model_output = Dense(16, activation="softmax")(hiddenz)
cnn_model = Model(model_input, model_output)

lr_schedule = ExponentialDecay(
    initial_learning_rate=1e-2,
    decay_steps=12000,
    decay_rate=0.8)
optimizer = Adam(learning_rate=lr_schedule)
cnn_model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [24]:
print(cnn_model.summary())

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1794)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1794, 20)     20000       input_1[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 1794, 20)     0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 1793, 10)     410         dropout[0][0]                    
_______________________________________________________________________________________

In [25]:
# Training
cnn_model.fit(train_data, train_classes,
          validation_data=(val_data, val_classes),
          batch_size=64,
          epochs=10,
          verbose=2)

Epoch 1/10
109/109 - 10s - loss: 2.0169 - accuracy: 0.3597 - val_loss: 1.6691 - val_accuracy: 0.4844
Epoch 2/10
109/109 - 9s - loss: 1.4972 - accuracy: 0.5612 - val_loss: 1.4128 - val_accuracy: 0.5917
Epoch 3/10
109/109 - 10s - loss: 1.3040 - accuracy: 0.6164 - val_loss: 1.3077 - val_accuracy: 0.6228
Epoch 4/10
109/109 - 10s - loss: 1.1962 - accuracy: 0.6496 - val_loss: 1.2792 - val_accuracy: 0.6263
Epoch 5/10
109/109 - 10s - loss: 1.1199 - accuracy: 0.6736 - val_loss: 1.2490 - val_accuracy: 0.6424
Epoch 6/10
109/109 - 10s - loss: 1.0639 - accuracy: 0.6885 - val_loss: 1.2683 - val_accuracy: 0.6517
Epoch 7/10
109/109 - 10s - loss: 1.0150 - accuracy: 0.7043 - val_loss: 1.2225 - val_accuracy: 0.6424
Epoch 8/10
109/109 - 10s - loss: 0.9927 - accuracy: 0.7020 - val_loss: 1.2778 - val_accuracy: 0.6424
Epoch 9/10
109/109 - 10s - loss: 0.9625 - accuracy: 0.7169 - val_loss: 1.2574 - val_accuracy: 0.6667
Epoch 10/10
109/109 - 10s - loss: 0.9252 - accuracy: 0.7223 - val_loss: 1.2766 - val_accurac

<tensorflow.python.keras.callbacks.History at 0x2e6cccfe6d0>

In [26]:
y_pred = cnn_model.predict(test_data)

In [27]:
y_pred = [i[0] for i in encoder.inverse_transform(y_pred)]

In [30]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        ENFJ       0.89      0.42      0.57        19
        ENFP       0.81      0.65      0.72        68
        ENTJ       0.73      0.48      0.58        23
        ENTP       0.47      0.62      0.53        69
        ESFJ       0.00      0.00      0.00         4
        ESFP       0.00      0.00      0.00         5
        ESTJ       0.00      0.00      0.00         4
        ESTP       0.00      0.00      0.00         9
        INFJ       0.59      0.78      0.67       147
        INFP       0.77      0.72      0.74       183
        INTJ       0.75      0.67      0.71       109
        INTP       0.69      0.78      0.73       131
        ISFJ       0.54      0.44      0.48        16
        ISFP       0.50      0.56      0.53        27
        ISTJ       0.75      0.45      0.56        20
        ISTP       0.58      0.62      0.60        34

    accuracy                           0.67       868
   macro avg       0.51   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
import shap

# we use the first 100 training examples as our background dataset to integrate over
explainer = shap.DeepExplainer(cnn_model, train_data[:100])

# explain the first 10 predictions
# explaining each prediction requires 2 * background dataset size runs
shap_values = explainer.shap_values(test_data[0])

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 2 dimension(s)

In [46]:
test_data[:2].shape

(2, 1794)

In [None]:
# init the JS visualization code
shap.initjs()

# transform the indexes to words
import numpy as np
words = imdb.get_word_index()
num2word = {}
for w in words.keys():
    num2word[words[w]] = w
x_test_words = np.stack([np.array(list(map(lambda x: num2word.get(x, "NONE"), x_test[i]))) for i in range(10)])

# plot the explanation of the first prediction
# Note the model is "multi-output" because it is rank-2 but only has one column
shap.force_plot(explainer.expected_value[0], shap_values[0][0], x_test_words[0])