In [None]:
import re
import pandas as pd
import numpy as np

from tqdm import tqdm

from sklearn import utils
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.multioutput import MultiOutputClassifier

import torch
import time
import torch as T
# Fall back to the CPU when no CUDA device is available so that code moving
# tensors/models with `.to(device)` does not fail on CPU-only machines.
device = T.device("cuda" if T.cuda.is_available() else "cpu")

import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models import doc2vec
from sklearn.svm import LinearSVC

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
from keras.models import load_model
from tensorflow.keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalMaxPool1D, Dropout
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.layers import Flatten, LSTM
from keras.models import Model

### Importing the edited dataset from GitHub
The 'type' column has been split into binary indicator columns (`is_I`/`is_E`, `is_N`/`is_S`, `is_F`/`is_T`, `is_J`/`is_P`), one pair for each of the 4 aspects of the MBTI personality.

### Removing links and symbols from the 'posts' column

In [None]:
def clear_text(data):
    """
    Normalise the raw posts of ``data``.

    Every entry of ``data.posts`` is lower-cased, has links (``http(s)://...``
    and ``www....``) replaced by a space, and then has every character outside
    ``0-9a-z`` replaced by a space.

    :param data: object exposing an iterable ``posts`` attribute of strings
                 (e.g. a DataFrame with a ``posts`` column)
    :return: tuple ``(cleaned_text, data_length)``: the cleaned strings and the
             number of whitespace-separated tokens in each, in input order
    """
    # Raw strings: '\s' and '\.' inside a plain literal are invalid escape
    # sequences (DeprecationWarning, SyntaxWarning from Python 3.12 onwards).
    # Compiling once also hoists the pattern lookup out of the loop.
    link_pattern = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
    symbol_pattern = re.compile(r'[^0-9a-z]')

    data_length = []
    cleaned_text = []

    for sentence in data.posts:
        sentence = sentence.lower()
        # remove links from text (must run before symbols are stripped,
        # otherwise '://' and '.' would already be gone)
        sentence = link_pattern.sub(' ', sentence)
        # remove other symbols
        sentence = symbol_pattern.sub(' ', sentence)

        data_length.append(len(sentence.split()))
        cleaned_text.append(sentence)
    return cleaned_text, data_length

In [None]:
# Load the edited MBTI dataset, keep the type/indicator columns plus the
# spell-corrected text, and expose that text under the name `posts`.
df = (
    pd.read_csv("https://raw.githubusercontent.com/zappocalypse/jubilant-meme/main/mbti_3_compressed.csv")
    .loc[:, ["type", "spell_corrected", "is_I","is_E","is_N","is_S","is_F","is_T","is_J","is_P"]]
    .rename(columns={'spell_corrected': 'posts'})
)

# Clean the text, then keep one indicator per MBTI axis (I, N, T, J) and the
# cleaned posts; `post_length` holds the token count of every cleaned post.
df['cleaned_posts'], post_length = clear_text(df)
df = df.drop(columns=['is_E', 'is_S', 'is_F', 'is_P', 'type', 'posts'])

df.head()

Unnamed: 0,is_I,is_N,is_T,is_J,cleaned_posts
0,1,1,0,1,and into moments sportscaster not top ten play...
1,0,1,1,0,i m finding the lack of me in these posts very...
2,1,1,1,0,good one course to which i say i know that s m...
3,1,1,1,1,dear into i enjoyed our conversation the other...
4,0,1,1,1,you re fired that s another silly misconceptio...


### Doc2Vec


In [None]:
def label_sentences(corpus, label_type):
    """
    Wrap every document of ``corpus`` in a gensim ``TaggedDocument``.

    Doc2Vec needs a tag per document. Each document is split on whitespace and
    tagged with ``"<label_type>_<i>"``, where ``i`` is the document's position
    in ``corpus`` (not its original index).

    :param corpus: iterable of strings
    :param label_type: string prefix for the tags
    :return: list of TaggedDocument, in corpus order
    """
    return [
        doc2vec.TaggedDocument(text.split(), [label_type + '_' + str(position)])
        for position, text in enumerate(corpus)
    ]

# Hold out 30% of the posts; the first four columns of `df` are the targets.
X_train, X_test, y_train, y_test = train_test_split(
    df.cleaned_posts,
    df.iloc[:, :4],
    test_size=0.3,
    random_state=0,
)

# Tags are positional within each split ('Train_0', ..., 'Test_0', ...).
X_train_tagged = label_sentences(X_train, 'Train')
X_test_tagged = label_sentences(X_test, 'Test')

# Doc2Vec is fitted on both splits together.
all_data = [*X_train_tagged, *X_test_tagged]

In [None]:
# dm=0 selects gensim's distributed bag-of-words (PV-DBOW) training mode.
# alpha and min_alpha start out equal, so the learning rate does not decay
# within a single train() call.
model_dbow = doc2vec.Doc2Vec(
    dm=0,
    vector_size=300,
    negative=5,
    min_count=1,
    alpha=0.065,
    min_alpha=0.065,
)
model_dbow.build_vocab(list(tqdm(all_data)))

100%|██████████| 8675/8675 [00:00<00:00, 1293709.77it/s]


In [None]:
# Train in a single call and let gensim decay the learning rate linearly from
# `start_alpha` to `end_alpha` over all epochs.
#
# The previous version called train() 50 times with epochs=1 and subtracted
# 0.002 from alpha after each call. Starting from 0.065, alpha dropped below
# zero from the 34th epoch on, so the remaining epochs trained with a negative
# learning rate. Re-running the cell also continued from the decayed alpha.
# Passing both alphas explicitly avoids both problems.
model_dbow.train(
    utils.shuffle(all_data),
    total_examples=len(all_data),
    epochs=50,
    start_alpha=0.065,
    end_alpha=0.0001,  # gensim's default min_alpha
)

100%|██████████| 8675/8675 [00:00<00:00, 1621461.11it/s]
100%|██████████| 8675/8675 [00:00<00:00, 2730625.68it/s]
100%|██████████| 8675/8675 [00:00<00:00, 2474368.39it/s]
100%|██████████| 8675/8675 [00:00<00:00, 2801477.30it/s]
100%|██████████| 8675/8675 [00:00<00:00, 1422868.26it/s]
100%|██████████| 8675/8675 [00:00<00:00, 2747533.58it/s]
100%|██████████| 8675/8675 [00:00<00:00, 2896019.36it/s]
100%|██████████| 8675/8675 [00:00<00:00, 2819058.43it/s]
100%|██████████| 8675/8675 [00:00<00:00, 3091383.79it/s]
100%|██████████| 8675/8675 [00:00<00:00, 1118145.95it/s]
100%|██████████| 8675/8675 [00:00<00:00, 1211157.29it/s]
100%|██████████| 8675/8675 [00:00<00:00, 3037955.01it/s]
100%|██████████| 8675/8675 [00:00<00:00, 3235712.51it/s]
100%|██████████| 8675/8675 [00:00<00:00, 2800399.23it/s]
100%|██████████| 8675/8675 [00:00<00:00, 2814696.93it/s]
100%|██████████| 8675/8675 [00:00<00:00, 2940724.74it/s]
100%|██████████| 8675/8675 [00:00<00:00, 2528181.43it/s]
100%|██████████| 8675/8675 [00:

In [None]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Tag prefix of the wanted vectors (the ``label_type``
                         used when tagging, e.g. 'Train' or 'Test')
    :return: numpy array of shape (corpus_size, vectors_size); row ``i`` is the
             vector stored under the tag ``"<vectors_type>_<i>"``
    """
    # gensim >= 4.0 exposes the document vectors as `dv`; `docvecs` is the
    # older (deprecated) attribute name, kept as a fallback.
    doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs

    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(corpus_size):
        vectors[i] = doc_vectors[vectors_type + '_' + str(i)]
    return vectors

# Pull the learned 300-dimensional document vectors back out, one matrix per split.
train_vectors_dbow = get_vectors(
    model_dbow,
    corpus_size=len(X_train_tagged),
    vectors_size=300,
    vectors_type='Train',
)
test_vectors_dbow = get_vectors(
    model_dbow,
    corpus_size=len(X_test_tagged),
    vectors_size=300,
    vectors_type='Test',
)

# Multilabel logistic regression

In [None]:
# One independent logistic regression per label column; every prediction row
# is [is_I, is_N, is_T, is_J], so INTJ corresponds to [1 1 1 1]
# i.e I:1 E:0 || N:1 S:0 || T: 1 F:0 || J:1 P:0
clf = MultiOutputClassifier(
    estimator=LogisticRegression(n_jobs=1, C=1e5, max_iter=10000)
)
clf.fit(train_vectors_dbow, y_train)

y_pred = clf.predict(test_vectors_dbow)
print(y_pred[:10])

# For multilabel targets accuracy_score is the exact-match (subset) accuracy:
# a row only counts when all four labels are right.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1 1 0 0]
 [1 1 0 1]
 [1 1 0 1]
 [1 1 1 0]
 [1 1 0 0]
 [1 1 0 0]
 [1 1 0 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 0 0]]
accuracy 0.4721475220898963
              precision    recall  f1-score   support

           0       0.85      0.92      0.89      2017
           1       0.91      0.97      0.94      2264
           2       0.81      0.80      0.81      1195
           3       0.70      0.62      0.66      1054

   micro avg       0.85      0.87      0.86      6530
   macro avg       0.82      0.83      0.82      6530
weighted avg       0.84      0.87      0.85      6530
 samples avg       0.85      0.87      0.83      6530



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Trying the multilabel methods from the article below
Implemented with TF Keras:

https://medium.com/swlh/multi-label-text-classification-with-scikit-learn-and-tensorflow-257f9ee30536

In [None]:
num_classes = y_train.shape[1]
maxlen = 200

# The vocabulary is fitted on every cleaned post (train and test alike).
# `max_words` counts all distinct words seen, not just the 5000 that
# `num_words` keeps.
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(df['cleaned_posts'])
max_words = len(tokenizer.word_index) + 1

# NOTE(review): X_train / X_test are overwritten here (text -> padded integer
# sequences), so this cell cannot be re-run without re-running the split first.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), padding='post', maxlen=maxlen)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), padding='post', maxlen=maxlen)

In [None]:
# calculate class weights
# For each of the first four columns of `df`, apply the formula taken from
# sklearn's class-weight computation, n_samples / (2 * bincount(column)),
# and keep only the entry for value 0 of that column, keyed by the column's
# position.
# NOTE(review): confirm that "weight of the 0 value, keyed by label position"
# is what the consumer of `weights` expects.
weights = {}
for position, column in enumerate(df.iloc[:, :4]):
  per_value_weight = df.shape[0] / (2 * np.bincount(df[column]))
  weights[position] = per_value_weight[0]

print(weights)

{0: 2.1698349174587293, 1: 3.6236424394319133, 2: 0.924051981252663, 3: 0.8276092348788399}


In [None]:
def recall_m(y_true, y_pred):
    """Recall: rounded true positives over rounded actual positives.

    ``K.epsilon()`` is added to the denominator so that a batch without
    positives yields 0 instead of a division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision: rounded true positives over rounded predicted positives.

    ``K.epsilon()`` is added to the denominator so that a batch without
    positive predictions yields 0 instead of a division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` in the denominator keeps the result finite when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_ratio = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_ratio

In [None]:
# Basic feed forward nn
def basic_model(n_inputs, n_outputs):
  """Build and compile a small embedding -> max-pool -> dense classifier.

  Reads the module-level `maxlen` for the Embedding layer's input length.

  :param n_inputs: vocabulary size handed to the Embedding layer
  :param n_outputs: number of sigmoid output units (one per label)
  :return: compiled Sequential model (binary cross-entropy loss, AUC metric)
  """
  # The callback list that used to be created here was never attached to the
  # model or returned, so it had no effect. Callbacks are passed to
  # `model.fit(...)` and have to be created by the caller.
  model = Sequential()
  model.add(Embedding(n_inputs, 20, input_length=maxlen))
  #model.add(Dropout(0.2))
  model.add(GlobalMaxPool1D())
  # Sigmoid rather than softmax: each output is an independent yes/no label.
  model.add(Dense(n_outputs, activation='sigmoid'))

  model.compile(optimizer="adam", loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])
  return model


In [None]:
# `callbacks` was never defined in this scope, so fit() raised a NameError.
# Build fresh callbacks for this run: lower the learning rate on a plateau
# and keep the best model on disk.
callbacks = [ReduceLROnPlateau(), ModelCheckpoint(filepath='model-simple.h5', save_best_only=True)]

model = basic_model(max_words, 4)
# NOTE(review): `weights` is keyed by label position; confirm that Keras'
# `class_weight` interprets it that way for a 4-column multi-label target.
history = model.fit(X_train, y_train.to_numpy(),
                    epochs=30,
                    class_weight=weights,
                    batch_size=32,
                    validation_split=0.3,
                    callbacks=callbacks)

NameError: ignored

In [None]:
# basic model without weights initialization
# `callbacks` was never defined in this scope, so fit() raised a NameError.
# Fresh callback instances are created so that no plateau/best-score state
# carries over from another run.
# NOTE(review): the checkpoint path 'model-simple.h5' is the one named in
# basic_model's original callback list; a run using the same path will
# overwrite this file.
callbacks = [ReduceLROnPlateau(), ModelCheckpoint(filepath='model-simple.h5', save_best_only=True)]

model = basic_model(max_words, 4)
history = model.fit(X_train, y_train.to_numpy(),
                    epochs=30,
                    batch_size=32,
                    validation_split=0.3,
                    callbacks=callbacks)

## Using PyTorch

