In [371]:
# Notebook-only dependencies. gensim is pinned to 4.0.0; fse and xgboost are
# installed at whatever version pip resolves at run time.
# NOTE(review): consider pinning fse and xgboost as well so reruns are reproducible.
!pip install fse
!pip install gensim==4.0.0
!pip install xgboost

In [372]:
# import dependencies

import pandas as pd
import numpy as np

import tensorflow as tf

# Report which TensorFlow build this kernel is running.
print(tf.__version__)

# List every GPU TensorFlow can see and enable memory growth on each of them.
print("--> Checking for physical Tensorflow devices")
for gpu_device in tf.config.list_physical_devices('GPU'):
    print(f": {gpu_device.name}")
    tf.config.experimental.set_memory_growth(gpu_device, True)

import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
from tensorflow import keras
from keras import models
from tensorflow.keras import layers
from keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow_addons.optimizers import AdamW
from tensorflow.keras import optimizers as opt

from tensorflow.keras.applications import DenseNet121

import gzip
from gensim.models import KeyedVectors
import gensim.downloader as api
from fse import Vectors, Average, IndexedList

In [373]:
# data loading functions

def load_aggregated_data_pandas():
    """Read the aggregated MBTI dataset from its pickle file.

    Returns:
        The object stored in ``mbti_aggregated.pkl`` (presumably a pandas
        DataFrame -- confirm against the dataset).
    """
    # NOTE(review): read_pickle unpickles arbitrary objects; only load trusted files.
    return pd.read_pickle('../input/mbti-datasets/mbti_aggregated.pkl')

def load_filtered_data_pandas():
    """Read the filtered MBTI dataset from its pickle file.

    Returns:
        The object stored in ``mbti_filtered.pkl`` (presumably a pandas
        DataFrame -- confirm against the dataset).
    """
    # NOTE(review): read_pickle unpickles arbitrary objects; only load trusted files.
    return pd.read_pickle('../input/mbti-datasets/mbti_filtered.pkl')

def load_preprocessed_data_pandas():
    """Read the preprocessed MBTI dataset from its pickle file.

    Returns:
        The object stored in ``mbti_preprocess.pkl`` (presumably a pandas
        DataFrame -- confirm against the dataset).
    """
    # NOTE(review): read_pickle unpickles arbitrary objects; only load trusted files.
    return pd.read_pickle('../input/mbti-datasets/mbti_preprocess.pkl')

def load_word_vectors(limit = 200000):
    """Load word2vec vectors from the GoogleNews binary file.

    Args:
        limit: forwarded unchanged to ``KeyedVectors.load_word2vec_format``
            as its ``limit`` argument.

    Returns:
        The ``KeyedVectors`` object produced by gensim.
    """
    vectors_path = '../input/mbti-datasets/GoogleNews-vectors-negative300.bin'
    return KeyedVectors.load_word2vec_format(vectors_path, binary = True, limit = limit)

In [374]:
# Working frame for the rest of the notebook: the filtered MBTI dataset.
X = load_filtered_data_pandas()

In [375]:
# Pre-trained word vectors that the sentence-embedding model is built on.
word_vectors = load_word_vectors()

# fse Average model over those word vectors.
model = Average(word_vectors)

# Train on the token list of every row in X, wrapped in fse's IndexedList.
sentences = IndexedList(X['tokens'].values.tolist())
model.train(sentences)

In [376]:
# One column per embedding dimension, named dim0, dim1, ...
embedding_dim = model.sv.vectors.shape[1]
embedding_columns = [f"dim{idx}" for idx in range(embedding_dim)]
model_embeddings = pd.DataFrame(model.sv.vectors, columns = embedding_columns)

# Persist the sentence vectors so they can be reloaded without retraining.
model_embeddings.to_pickle("./word_vectors.pkl", protocol = 4)

In [377]:
# Read the sentence vectors back from the pickle written in the previous step
# and display the frame's shape as a quick sanity check.
model_pkl = pd.read_pickle("./word_vectors.pkl")
model_pkl.shape

In [378]:
# model_pkl was built from a bare array, so it carries a fresh 0..n-1 index,
# whereas X keeps whatever index it was pickled with. concat(axis = 1) aligns on
# index labels, so a non-default index on X would silently misalign rows or add
# all-NaN rows. Align by position instead (presumably row k of the embeddings
# belongs to row k of X, since the model was trained on X['tokens'] in order).
assert len(model_pkl) == len(X), "expected exactly one sentence vector per row of X"
X = pd.concat([X, model_pkl.set_index(X.index)], axis = 1)

In [379]:
# X.drop(['id', 'author', 'body', 'subreddit', 'link_id', 'parent_id', 'body_preprocess', 'tokens'], axis = 1, inplace = True)
# The token lists are no longer needed as a column of X.
X = X.drop(columns = ['tokens'])

In [380]:
# Rows without a flair have no label: set them aside in X_test_nan and keep
# only labelled rows in X. Explicit copies are taken because both frames are
# modified in place further down, which on a filtered slice triggers
# SettingWithCopyWarning.
X_test_nan = X[X['author_flair_text'].isna()].copy()
X = X[X['author_flair_text'].notna()].copy()

In [381]:
# The flair text column is the prediction target.
y = X['author_flair_text']

In [382]:
# Break every flair string into its individual characters, one column per
# position. NOTE: this only works if each flair is exactly four characters.
y_list = list(map(list, y))
y_frame = pd.DataFrame(data = y_list, columns = ['e/i', 'n/s', 'f/t', 'j/p'])

In [383]:
# Integer-encode each of the four letter columns independently.
# NOTE(review): the same LabelEncoder instance is re-fit for every column, so
# afterwards it presumably only remembers the classes of the last column
# ('j/p'). Keep one encoder per column if inverse_transform is ever needed.
label_encoder = LabelEncoder()
y_enc = y_frame.apply(label_encoder.fit_transform)

In [384]:
# onehot_encoder = OneHotEncoder()
# y = onehot_encoder.fit_transform(y)

In [385]:
# Remove the label (and the 'splitting' column) from the feature frames.
# Rebinding instead of inplace=True avoids SettingWithCopyWarning, because both
# frames were produced by boolean filtering.
X = X.drop(columns = ['author_flair_text', 'splitting'])
# NOTE(review): 'splitting' is dropped from X but not from X_test_nan, so the
# two frames end up with different columns -- confirm this is intended before
# feeding X_test_nan to anything fitted on X.
X_test_nan = X_test_nan.drop(columns = ['author_flair_text'])

In [386]:
# Hold out 30% of the labelled rows for evaluation (shuffled, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size = 0.3,
    random_state = 42,
    shuffle = True,
)

In [387]:
# Standardise the features. The scaler is fitted on the training split only and
# the same transformation is then applied to the held-out split; without this
# the model would be validated and evaluated on unscaled inputs.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [388]:
# Sanity check: display the shapes of the training features and labels.
X_train.shape, y_train.shape

In [389]:
# xgbmodel = XGBClassifier(n_estimators = 20, learning_rate = 0.05, verbosity = 3, booster = 'gblinear', eval_metric = 'mlogloss', max_cat_to_onehot = 17)

In [390]:
# Materialise every split as a NumPy array before handing it to Keras.
X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(y_train), np.array(y_test)

In [391]:
# xgbmodel.fit(X_train, y_train.ravel())

In [392]:
# y_pred = xgbmodel.predict_proba(y_test)

In [393]:
# TODO: remove magic number
# y_train = to_categorical(y_train, 16)
# y_test = to_categorical(y_test, 16)

In [394]:
# custom params
epochs = 200                      # number of passes over the training data
batch_size = 256                  # samples per gradient update
initial_learning_rate = 1e-6      # learning rate before the decay boundary
# hidden_units = [32, 32]
# Non-trainable counter used as the step argument of the schedule below.
step = tf.Variable(0, trainable = False)
# Decay boundary at 75% of training, expressed in EPOCHS.
# NOTE(review): PiecewiseConstantDecay compares boundaries against a step
# counter, so this value has to be converted to steps before it means "epoch 150".
boundaries = [int(epochs * 0.75)]
# Learning rate before / after the boundary (10x reduction).
values = [initial_learning_rate, initial_learning_rate * 0.1]

In [395]:
# initialize learning rates and optimizers
# `boundaries` is expressed in epochs, but PiecewiseConstantDecay compares its
# boundaries against the optimizer's step (batch) counter, so convert first.
steps_per_epoch = math.ceil(X_train.shape[0] / batch_size)
step_boundaries = [boundary * steps_per_epoch for boundary in boundaries]
learning_rate_fn = opt.schedules.PiecewiseConstantDecay(step_boundaries, values)
# Pass the schedule object itself to the optimizer. Calling it once up front
# (learning_rate_fn(step) with step == 0) yields a constant tensor, so the
# decay would never take effect.
# NOTE(review): weight_decay is left constant; the AdamW documentation suggests
# decaying it together with the learning rate.
opt_adam = AdamW(learning_rate = learning_rate_fn, epsilon = 1e-8, weight_decay = 5e-4)

In [396]:
# initialize adam-based network
# "Wide" model: 16 input features -> three Dense/Dropout/BatchNorm stages ->
# 16-way class distribution.
bow_inputs = layers.Input(shape = (16,))

merged_layer_1 = layers.Dense(16, activation = 'relu')(bow_inputs)
dropout_1_1 = layers.Dropout(0.5)(merged_layer_1)
batch_norm_1 = layers.BatchNormalization()(dropout_1_1)

merged_layer_2 = layers.Dense(256, activation = 'relu')(batch_norm_1)
dropout_1_2 = layers.Dropout(0.5)(merged_layer_2)
batch_norm_2 = layers.BatchNormalization()(dropout_1_2)

merged_layer_3 = layers.Dense(1024, activation = 'relu')(batch_norm_2)
dropout_1_3 = layers.Dropout(0.5)(merged_layer_3)
batch_norm_3 = layers.BatchNormalization()(dropout_1_3)

# This model is compiled with categorical_crossentropy, which expects a
# probability distribution over the 16 classes; softmax provides that, whereas
# a ReLU output is unnormalised and can be all zeros.
predictions = layers.Dense(16, activation = 'softmax')(batch_norm_3)
wide_model = keras.Model(inputs=bow_inputs, outputs=predictions)

In [397]:
# compile adam-based network
# Multi-class loss on the 16-unit output; MAE and accuracy are tracked as metrics.
wide_model.compile(
    optimizer = opt_adam,
    loss = 'categorical_crossentropy',
    metrics = ['mae', 'accuracy'],
)

In [398]:
# Define our deep model with the Functional API
#
# Plain feed-forward stack over the standardised feature vector, ending in a
# single sigmoid unit for one binary MBTI axis.
#
# Embedding layers are deliberately not used between the Dense layers: an
# Embedding is an integer-index lookup, so feeding it float activations casts
# them to ints, blocks gradients to every layer before it, and can produce
# indices outside the table. The inputs here are continuous features, so Dense
# layers are the appropriate building block.
n_features = X_train.shape[1]
deep_inputs = layers.Input(shape = (n_features,))

deep_dense_1 = layers.Dense(256, activation = 'relu')(deep_inputs)
dropout_2_1 = layers.Dropout(0.3)(deep_dense_1)

merged_layer_2_2 = layers.Dense(1024, activation = 'relu')(dropout_2_1)
dropout_2_2 = layers.Dropout(0.3)(merged_layer_2_2)
batch_norm_2_2 = layers.BatchNormalization()(dropout_2_2)

merged_layer_2_4 = layers.Dense(512, activation = 'relu')(batch_norm_2_2)
dropout_2_4 = layers.Dropout(0.3)(merged_layer_2_4)
batch_norm_2_4 = layers.BatchNormalization()(dropout_2_4)

deep_dense_2 = layers.Dense(256, activation = 'relu')(batch_norm_2_4)
dropout_2_5 = layers.Dropout(0.2)(deep_dense_2)

# Probability of the positive class for the selected binary axis.
embed_out = layers.Dense(1, activation = 'sigmoid')(dropout_2_5)
deep_model = keras.Model(inputs = deep_inputs, outputs = embed_out)

In [399]:
# compile adam-based network
# deep_model ends in a single sigmoid unit trained against 0/1 labels, so the
# matching loss is binary cross-entropy. categorical_crossentropy on a one-unit
# output is degenerate and gives no useful training signal.
deep_model.compile(loss = 'binary_crossentropy',
                optimizer = opt_adam,
                metrics = ['mae', 'accuracy'])

In [400]:
# merged_out_concat = layers.concatenate([wide_model.output, deep_model.output], axis = 1)
# merged_out = layers.Dense(16, activation = 'relu')(merged_out_concat)

In [401]:
# combined_model = keras.Model([wide_model.input, deep_model.input], merged_out)

In [402]:
# combined_model.compile(loss='categorical_crossentropy',
#                        optimizer=opt_adam,
#                        metrics=['mae', 'accuracy'])

In [403]:
# parameter to control which binary class to predict from the 4 available
# (column order of the encoded labels: 0 = e/i, 1 = n/s, 2 = f/t, 3 = j/p)
# NOTE: this cell is not re-runnable -- after one execution the label arrays
# are 1-D and the column slice below fails.
i = 2
y_train, y_test = y_train[:, i], y_test[:, i]

In [404]:
# Print the layer-by-layer summary of the deep model before training.
deep_model.summary()

In [405]:
# train adam-based network
# Training configuration; the held-out split doubles as validation data.
fit_options = dict(
    batch_size = batch_size,
    epochs = epochs,
    validation_data = (X_test, y_test),
    shuffle = True,
    verbose = True,
)
with tf.device('/GPU:0'):
    history = deep_model.fit(X_train, y_train, **fit_options)

In [406]:
# Model outputs for the held-out split (one sigmoid value per row).
y_pred = deep_model.predict(X_test)

In [407]:
# deep_model ends in a single sigmoid unit, so y_pred has one column and
# argmax(axis = 1) would always return class 0. Threshold the predicted
# probability at 0.5 to obtain 0/1 labels instead.
y_pred_labels = (y_pred.ravel() > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred_labels)
accuracy

In [408]:
# y_pred holds one sigmoid probability per row; argmax over that single column
# is always 0, so threshold at 0.5 to get the predicted 0/1 labels.
print(confusion_matrix(y_test, (y_pred.ravel() > 0.5).astype(int)))