In [1]:
#
# * File:    Twitter_Persona_GloVe.py
# *
# * Author1:  Pavan Kumar K N (pavankumar.karkekopp@ucalgary.ca)
# * Date:     11th Aug 2019
# * Summary of File:
# * Explore mbti_1.csv file acquired from https://www.kaggle.com/datasnaek/mbti-type
# * Apply state-of-the-art reported publicly
# * Build classifier model that is better using machine learning techniques

#Just making sure the right environment is running this script
import sys
sys.executable

'C:\\ProgramData\\Anaconda3\\python.exe'

In [None]:
#Read Data
import numpy as np
import pandas as pd
import sklearn
import re
import pickle

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk import word_tokenize

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder

from numpy import loadtxt
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from imblearn.over_sampling import SMOTE

from sklearn.decomposition import PCA
import pylab as pl
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Model
from keras import models
from keras import layers
from keras import regularizers

# Load the Kaggle MBTI dataset once up front.
# NOTE(review): `posts` and `labels` are not referenced again in the visible
# notebook, and the same CSV is read a second time into `mbti_data` below.
mbti_1 = pd.read_csv('data/mbti_1.csv') 
posts = mbti_1.posts
labels = mbti_1.type

# NOTE(review): BASE_DIR, EMBEDDING_DIM and VALIDATION_SPLIT are not used in the
# visible notebook - confirm whether they can be dropped.
BASE_DIR = ''
# Folder that holds the glove.twitter.27B.<dim>d.txt embedding files
GLOVE_DIR = "data/glove.twitter.27B"
# Length every token-id sequence is padded/truncated to (pad_sequences maxlen)
MAX_SEQUENCE_LENGTH = 2000
# Tokenizer vocabulary cap; also the number of rows of the embedding matrix
MAX_NB_WORDS = 2000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
# Number of epochs used by deep_model and for the x-axis in eval_metric
NB_START_EPOCHS = 20
# Mini-batch size used by deep_model and test_model
BATCH_SIZE = 512

# Dimensionality of the GloVe vectors; selects the file name and matrix width
GLOVE_DIM = 100


In [None]:
#Read the dataset
mbti_data = pd.read_csv("data/mbti_1.csv")

In [None]:
#Print first 5 entries in the dataset
mbti_data.head()

In [None]:
# Raw `posts` column as an array: one string per dataset row
raw_posts = mbti_data.posts.values

In [None]:
#Split posts by the delimiter, which is ||| as we can see in row 0 and 4 above
filtered_posts = [p.split("|||") for p in raw_posts]

In [None]:
#Print the first five tweets from row 0
for i in range(5):
    print("Tweet #{}:   {}".format(i+1,filtered_posts[0][i]))

In [None]:
def encode_types(row):
    """Turn the four-letter MBTI code in ``row['type']`` into four 0/1 flags.

    Parameters:
        row : mapping / DataFrame row with a 'type' entry such as 'INTJ'
    Output:
        pandas.Series with entries 'IE', 'NS', 'TF' and 'JP'; each is 1 when the
        letter at that position is I, N, T or J respectively and 0 otherwise.
        An unexpected letter prints a warning and is encoded as 0.
    """
    mbti_type = row['type']

    # (position in the code, letter scored 1, letter scored 0, output key, axis label)
    axes = (
        (0, 'I', 'E', 'IE', 'I-E'),
        (1, 'N', 'S', 'NS', 'N-S'),
        (2, 'T', 'F', 'TF', 'T-F'),
        (3, 'J', 'P', 'JP', 'J-P'),
    )

    encoded = {}
    for position, one_letter, zero_letter, key, axis_label in axes:
        letter = mbti_type[position]
        if letter != one_letter and letter != zero_letter:
            print('Could not identify label for ' + axis_label)
        encoded[key] = 1 if letter == one_letter else 0

    return pd.Series(encoded)

In [None]:
# Append the four binary axis columns (IE, NS, TF, JP) to the original frame
mbti_data_encoded = mbti_data.join(mbti_data.apply(encode_types, axis=1))

In [None]:
mbti_data_encoded.head()

In [None]:
# Lookup tables for converting between MBTI letters and 0/1 flags using plain
# Python lists instead of pandas.Series
personality_binary = {
    'I': 1, 'E': 0,
    'N': 1, 'S': 0,
    'T': 1, 'F': 0,
    'J': 1, 'P': 0,
}
binary_personality = [
    {1: 'I', 0: 'E'},
    {1: 'N', 0: 'S'},
    {1: 'T', 0: 'F'},
    {1: 'J', 0: 'P'},
]


def translate_personality(personality):
    """Map an MBTI string (e.g. 'INFJ') to its list of 0/1 flags.

    Raises KeyError for a letter that is not in ``personality_binary``.
    """
    flags = []
    for letter in personality:
        flags.append(personality_binary[letter])
    return flags


def translate_binary(personality):
    """Map a sequence of 0/1 flags back to the MBTI string it encodes.

    The flag at position k is looked up in ``binary_personality[k]``.
    """
    return "".join(
        binary_personality[position][flag]
        for position, flag in enumerate(personality)
    )



In [None]:
# NOTE(review): `d` is not referenced again in the visible notebook
d = mbti_data_encoded.head(4)
# One list of four 0/1 flags per dataset row, built from the `type` column
list_personality_bin = np.array([translate_personality(p) for p in mbti_data_encoded.type])
print("Binarize MBTI list: \n%s" % list_personality_bin)

### Data Preprocessing
* Remove URLs
* Keep only alphabetic words and make them lowercase
* Lemmatize each word
* Remove MBTI profile strings (e.g. "INFJ"). Too many appear in the posts!

In [None]:
mbti_data_encoded

In [None]:
# The sixteen MBTI type names, lower-cased; pre_process_data strips these
# strings from the (lower-cased) post text
unique_type_list = [
    mbti_type.lower()
    for mbti_type in (
        'INFJ', 'ENTP', 'INTP', 'INTJ',
        'ENTJ', 'ENFJ', 'INFP', 'ENFP',
        'ISFP', 'ISTP', 'ISFJ', 'ISTJ',
        'ESTP', 'ESFP', 'ESTJ', 'ESFJ',
    )
]

# Text normalisers; pre_process_data uses `lemmatiser`
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

def pre_process_data(data, remove_stop_words = True, remove_mbti_profiles = True):
    """Clean the posts of every row and binarise its MBTI label.

    For each row the URLs are blanked out, the text is split on the "|||"
    delimiter, and every post is reduced to lower-case alphabetic words that
    are lemmatised. Posts that end up empty are dropped and the remaining
    posts of the row are joined with single spaces.

    Parameters:
        data : DataFrame with a `posts` column (posts separated by "|||")
               and a `type` column (four-letter MBTI code)
        remove_stop_words : drop English stop words before lemmatising
        remove_mbti_profiles : delete every occurrence of the strings in
               `unique_type_list` from the cleaned text
    Output:
        (list_posts, list_personality) as numpy arrays: one cleaned string and
        one list of four 0/1 flags per input row
    """
    list_personality = []
    list_posts = []
    len_data = len(data)

    # Build the stop-word lookup once per call instead of once per token;
    # a set also makes each membership test constant-time.
    stop_words = set(stopwords.words("english")) if remove_stop_words else None

    for i, (_, row) in enumerate(data.iterrows(), start=1):
        if (i % 500 == 0 or i == 1 or i == len_data):
            print("%s of %s rows" % (i, len_data))

        # Blank out URLs before splitting the row into individual posts
        filtered_post = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", " ", row['posts'])

        cleaned_posts = []
        for pos in filtered_post.split("|||"):
            # Keep letters only, collapse runs of spaces, lower-case
            pos = re.sub("[^a-zA-Z]", " ", pos)
            pos = re.sub(" +", " ", pos).lower()

            words = pos.split(' ')
            if remove_stop_words:
                words = [w for w in words if w not in stop_words]
            pos = " ".join([lemmatiser.lemmatize(w) for w in words])

            #Removing occurrences of MBTI profile strings in tweets
            if remove_mbti_profiles:
                for t in unique_type_list:
                    pos = pos.replace(t, "")

            # Skip posts that were reduced to nothing or to a single space
            if pos and pos != " ":
                cleaned_posts.append(pos)

        list_personality.append(translate_personality(row['type']))
        list_posts.append(" ".join(cleaned_posts))

    return np.array(list_posts), np.array(list_personality)


In [None]:
# Stop words are kept here (remove_stop_words=False); remove_mbti_profiles is left at its default of True
list_posts, list_personality  = pre_process_data(mbti_data_encoded, remove_stop_words=False)

In [None]:
list_posts[0], list_personality[0]

#### Embedding Matrix with GloVe

In [None]:
# Load the GloVe vectors into a {word: float32 vector} dictionary
glove_file = 'glove.twitter.27B.' + str(GLOVE_DIM) + 'd.txt'
emb_dict = {}
# `with` closes the file even when a line fails to parse
with open(os.path.join(GLOVE_DIR, glove_file), encoding="utf-8") as glove:
    for line in glove:
        values = line.split()
        if not values:
            # tolerate blank lines instead of failing on values[0]
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        emb_dict[word] = vector

In [None]:
# Spot-check a few words against the loaded GloVe dictionary
airline_words = ['airplane', 'airline', 'flight', 'luggage']
for w in airline_words:
    if w not in emb_dict:
        continue
    print('Found the word {} in the dictionary:{}'.format(w,emb_dict[w]))

In [None]:
# Hold out 20% of the rows as the test set; random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(list_posts, list_personality, test_size=0.2, random_state=37)

In [None]:
X_train[0]

In [None]:
# Word-level tokenizer limited to the MAX_NB_WORDS most frequent words
tk =  Tokenizer(num_words=MAX_NB_WORDS,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               split=" ")
# Fit on the training posts only, so that the vocabulary and word frequencies
# are not influenced by the held-out test posts
tk.fit_on_texts(X_train)

In [None]:
# Convert every post into a list of integer word ids using the fitted tokenizer
X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

In [None]:
# Number of space-separated tokens in each training document
seq_len = [len(tweet.split(" ")) for tweet in X_train]

In [None]:
pd.Series(seq_len).describe()

In [None]:
MAX_SEQUENCE_LENGTH = 2000 #based on above

In [None]:
# Bring every id sequence to exactly MAX_SEQUENCE_LENGTH entries (pad short ones, truncate long ones)
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Row i of the matrix is the GloVe vector of the word with tokenizer id i
emb_matrix = np.zeros((MAX_NB_WORDS, GLOVE_DIM))

for w, i in tk.word_index.items():
    # word_index holds every word the tokenizer saw; stop at the first id
    # that falls outside the MAX_NB_WORDS limit
    if i >= MAX_NB_WORDS:
        break
    vect = emb_dict.get(w)
    # Words without a GloVe vector keep their all-zero row
    if vect is None:
        continue
    emb_matrix[i] = vect

In [None]:
X_train_seq_trunc[16]

In [None]:
#Splitting validation data: hold out 10% of the padded training sequences
X_train_emb, X_valid_emb, y_train_emb, y_valid_emb = train_test_split(X_train_seq_trunc, y_train, test_size=0.1, random_state=37)

# Sanity check: features and targets must have the same number of rows
assert X_valid_emb.shape[0] == y_valid_emb.shape[0]
assert X_train_emb.shape[0] == y_train_emb.shape[0]

print('Shape of validation set:',X_valid_emb.shape)

# Modelling

### Helper Functions

In [None]:
def deep_model(model, X_train, y_train, X_valid, y_valid,
               optimizer='rmsprop', loss='categorical_crossentropy',
               metrics=None, epochs=None, batch_size=None):
    '''
    Compile and train a model, validating on a held-out set after each epoch.

    Called with only the first five arguments it behaves as before: rmsprop,
    categorical cross-entropy, accuracy metric, and the epoch count / batch
    size taken from the notebook constants NB_START_EPOCHS and BATCH_SIZE.

    NOTE(review): the notebook passes 4-column 0/1 targets (one flag per MBTI
    axis, several of which can be 1 at once) to models ending in a 4-unit
    softmax. Categorical cross-entropy assumes exactly one active class, so a
    sigmoid output with loss='binary_crossentropy' may be what is intended -
    confirm before relying on the reported accuracy.

    Parameters:
        model : model with the chosen architecture
        X_train : training features
        y_train : training target
        X_valid : validation features
        y_valid : validation target
        optimizer : optimizer passed to model.compile
        loss : loss passed to model.compile
        metrics : list of metrics passed to model.compile (default ['accuracy'])
        epochs : number of epochs (default NB_START_EPOCHS)
        batch_size : mini-batch size (default BATCH_SIZE)
    Output:
        model training history
    '''
    # Resolve the defaults at call time so later changes to the notebook
    # constants are picked up.
    if metrics is None:
        metrics = ['accuracy']
    if epochs is None:
        epochs = NB_START_EPOCHS
    if batch_size is None:
        batch_size = BATCH_SIZE

    model.compile(optimizer=optimizer
                  , loss=loss
                  , metrics=list(metrics))

    history = model.fit(X_train
                       , y_train
                       , epochs=epochs
                       , batch_size=batch_size
                       , validation_data=(X_valid, y_valid)
                       , verbose=1)
    return history


def eval_metric(history, metric_name):
    '''
    Plot a training metric and its validation counterpart per epoch.

    Parameters:
        history : model training history (object with a `.history` dict)
        metric_name : 'loss', or the accuracy key ('acc' / 'accuracy')
    Output:
        line chart with epochs on the x-axis and the metric on the y-axis
    Raises:
        KeyError if neither the metric nor its known alias was recorded
    '''
    recorded = history.history

    # Depending on the Keras version, metrics=['accuracy'] is recorded under
    # 'acc' or under 'accuracy'; accept either spelling.
    key = metric_name
    if key not in recorded:
        alias = {'acc': 'accuracy', 'accuracy': 'acc'}.get(metric_name)
        if alias in recorded:
            key = alias

    metric = recorded[key]
    val_metric = recorded['val_' + key]

    # Use the number of epochs actually recorded rather than the global
    # NB_START_EPOCHS, so histories of any length can be plotted.
    e = range(1, len(metric) + 1)

    plt.plot(e, metric, 'bo', label='Train ' + metric_name)
    plt.plot(e, val_metric, 'b', label='Validation ' + metric_name)
    plt.legend()
    plt.show()

def test_model(model, X_train, y_train, X_test, y_test, epoch_stop):
    '''
    Function to test the model on new data after training it
    on the full training data with the optimal number of epochs.
    
    Parameters:
        model : trained model
        X_train : training features
        y_train : training target
        X_test : test features
        y_test : test target
        epochs : optimal number of epochs
    Output:
        test accuracy and test loss
    '''
    model.fit(X_train
              , y_train
              , epochs=epoch_stop
              , batch_size=BATCH_SIZE
              , verbose=0)
    results = model.evaluate(X_test, y_test)
    
    return results

### Vanilla Embedding Layer

In [None]:
# Baseline: an 8-dimensional embedding for each of the MAX_NB_WORDS ids,
# flattened into a 4-unit softmax output
emb_model = models.Sequential([
    layers.Embedding(MAX_NB_WORDS, 8, input_length=MAX_SEQUENCE_LENGTH),
    layers.Flatten(),
    layers.Dense(4, activation='softmax'),
])
emb_model.summary()

In [None]:
emb_history = deep_model(emb_model, X_train_emb, y_train_emb, X_valid_emb, y_valid_emb)

In [None]:
eval_metric(emb_history, 'acc')

In [None]:
eval_metric(emb_history, 'loss')

In [None]:
# Continue fitting on the full training split, then score on the test split
emb_results = test_model(emb_model, X_train_seq_trunc, y_train, X_test_seq_trunc, y_test, 20)
# '\n' (not '/n') so that a blank line is printed before the result
print('\n')
print('Test accuracy of word embeddings model: {0:.2f}%'.format(emb_results[1]*100))

### GloVe Embedding with Softmax

In [None]:
# Rebuild the GloVe embedding matrix: row i holds the vector of the word with tokenizer id i.
# NOTE(review): this repeats the construction done in an earlier cell of the notebook.
emb_matrix = np.zeros((MAX_NB_WORDS, GLOVE_DIM))

for w, i in tk.word_index.items():
    # The word_index contains a token for all words of the training data so we need to limit that
    if i < MAX_NB_WORDS:
        vect = emb_dict.get(w)
        # Check if the word from the training data occurs in the GloVe word embeddings
        # Otherwise the vector is kept with only zeros
        if vect is not None:
            emb_matrix[i] = vect
    else:
        # stop at the first id outside the limit
        break

In [None]:
# Same architecture as the baseline, but with a GLOVE_DIM-wide embedding layer
glove_model = models.Sequential([
    layers.Embedding(MAX_NB_WORDS, GLOVE_DIM, input_length=MAX_SEQUENCE_LENGTH),
    layers.Flatten(),
    layers.Dense(4, activation='softmax'),
])
glove_model.summary()

In [None]:
# Load the GloVe matrix into the embedding layer and mark it non-trainable
# (set before deep_model compiles the model)
glove_model.layers[0].set_weights([emb_matrix])
glove_model.layers[0].trainable = False

In [None]:
glove_history = deep_model(glove_model, X_train_emb, y_train_emb, X_valid_emb, y_valid_emb)

In [None]:
eval_metric(glove_history, 'loss')

In [None]:
eval_metric(glove_history, 'acc')

In [None]:
# Continue fitting on the full training split, then score on the test split
glove_results = test_model(glove_model, X_train_seq_trunc, y_train, X_test_seq_trunc, y_test, 20)
# '\n' (not '/n') so that a blank line is printed before the result
print('\n')
print('Test accuracy of word glove model: {0:.2f}%'.format(glove_results[1]*100))

## Classifiers

In [None]:
# Human-readable label for each of the four target columns
type_indicators = [
    "IE: Introversion (I) / Extroversion (E)",
    "NS: Intuition (N) – Sensing (S)",
    "FT: Feeling (F) - Thinking (T)",
    "JP: Judging (J) – Perceiving (P)",
]

# Show the test-set labels of every axis
for l, indicator in enumerate(type_indicators):
    print(indicator)
    print(y_test[:,l])

In [None]:
# Train and score one binary XGBoost classifier per MBTI axis
for l in range(len(type_indicators)):
    print("\n\n{} ...".format(type_indicators[l]))

    y_train_class = y_train[:,l]
    y_test_class = y_test[:,l]

    print(X_train_seq_trunc.shape, y_train_class.shape)
    seed = 7
    # Oversample the minority class of this axis.
    # NOTE(review): SMOTE interpolates between feature vectors, and the features
    # here are padded word-id sequences - confirm that this is intended.
    sm = SMOTE(random_state=2)
    # fit_resample replaces fit_sample, which was removed from imbalanced-learn
    X_train_res , y_train_res = sm.fit_resample(X_train_seq_trunc, y_train_class.ravel())

    model = XGBClassifier(learning_rate=0.01,
                             n_estimators=5000,
                             max_depth=4,
                             min_child_weight=6,
                             colsample_bytree=0.8,
                             objective='binary:logistic',
                             nthread=8,
                             scale_pos_weight=1,
                             seed=seed)

    model.fit(X_train_res, y_train_res)

    # make predictions for test data
    y_pred = model.predict(X_test_seq_trunc)
    predictions = [round(value) for value in y_pred]
    # evaluate predictions
    accuracy = sklearn.metrics.accuracy_score(y_test_class, predictions)
    f1_score_measure = sklearn.metrics.f1_score(y_test_class, predictions)
    # mean_squared_error returns the MSE; take the root so the printed value is a true RMSE
    rmse = np.sqrt(sklearn.metrics.mean_squared_error(y_test_class, predictions))
    mae = sklearn.metrics.mean_absolute_error(y_test_class, predictions)
    print(" Accuracy: {:.2f}% \t F1-Score: {:.3f} RMSE: {:.3f} MAE: {:.3f}".format(accuracy * 100.0, f1_score_measure, rmse, mae))
    

In [None]:
from tabulate import tabulate
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit
encoding = "utf-8"

In [None]:
# The vectorizers below use an identity analyzer and the embedding vectorizers
# iterate over each document, so every document must be a list of word tokens.
# Left as a plain string, a document would be processed character by character.
X = np.empty(len(list_posts), dtype=object)
for idx, post in enumerate(list_posts):
    # element-wise assignment keeps X one-dimensional with one token list per row
    X[idx] = post.split()
y = np.array(list_personality)
print ("total examples %s" % len(y))

In [None]:
# Full GloVe vocabulary as {word: float32 vector}; each line is split once
with open(os.path.join(GLOVE_DIR, glove_file), "rb") as lines:
    wvec = {
        parts[0].decode(encoding): np.array(parts[1:], dtype=np.float32)
        for parts in (line.split() for line in lines)
    }

In [None]:
import struct 

# GloVe vectors restricted to the words that actually occur in the corpus
glove_dict = {}
# Collect every distinct token of every document. A document may be a raw
# string (split on whitespace) or an already tokenised sequence of words.
all_words = set(
    w
    for words in X
    for w in (words.split() if isinstance(words, str) else words)
)
with open(os.path.join(GLOVE_DIR, glove_file), "rb") as infile:
    for line in infile:
        parts = line.split()
        word = parts[0].decode(encoding)
        if (word in all_words):
            nums=np.array(parts[1:], dtype=np.float32)
            glove_dict[word] = nums

In [None]:
# Classic baselines: multinomial and Bernoulli naive Bayes, each on raw counts
# and on tf-idf features. The identity analyzer makes the vectorizer use the
# elements of each document as its features.
mult_nb = Pipeline([
    ("count_vectorizer", CountVectorizer(analyzer=lambda x: x)),
    ("multinomial nb", MultinomialNB()),
])
bern_nb = Pipeline([
    ("count_vectorizer", CountVectorizer(analyzer=lambda x: x)),
    ("bernoulli nb", BernoulliNB()),
])
mult_nb_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)),
    ("multinomial nb", MultinomialNB()),
])
bern_nb_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)),
    ("bernoulli nb", BernoulliNB()),
])

# Linear SVM on the same two feature sets - reported as (close to) state of the art in
# http://www.cs.cornell.edu/people/tj/publications/joachims_98a.pdf
svc = Pipeline([
    ("count_vectorizer", CountVectorizer(analyzer=lambda x: x)),
    ("linear svc", SVC(kernel="linear")),
])
svc_tfidf = Pipeline([
    ("tfidf_vectorizer", TfidfVectorizer(analyzer=lambda x: x)),
    ("linear svc", SVC(kernel="linear")),
])

In [None]:
class MeanEmbeddingVectorizer(object):
    """Represent each document by the mean of the vectors of its known words.

    Parameters:
        word2vec : mapping from word to its embedding vector
    """
    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec)>0:
            # Read the dimensionality from the mapping that was passed in,
            # not from a notebook-level global.
            self.dim=len(next(iter(word2vec.values())))
        else:
            self.dim=0
            
    def fit(self, X, y=None):
        """Nothing to learn; present so the class can be used as a pipeline step."""
        return self 

    def transform(self, X):
        """Return one row per document in X (each document an iterable of words).

        Words missing from `word2vec` are ignored; a document without any
        known word becomes a zero vector of length `dim`.
        """
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] 
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

    
# and a tf-idf version of the same
class TfidfEmbeddingVectorizer(object):
    """Represent each document by the mean of its idf-weighted word vectors.

    Parameters:
        word2vec : mapping from word to its embedding vector
    """
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # word -> idf weight; filled in by fit()
        self.word2weight = None
        if len(word2vec)>0:
            # Read the dimensionality from the mapping that was passed in,
            # not from a notebook-level global.
            self.dim=len(next(iter(word2vec.values())))
        else:
            self.dim=0
        
    def fit(self, X, y=None):
        """Learn an idf weight for every word that occurs in the documents X."""
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf, 
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    
        return self
    
    def transform(self, X):
        """Return one row per document in X (each document an iterable of words).

        Each known word contributes its vector multiplied by its weight; a
        document without any known word becomes a zero vector of length `dim`.
        fit() must have been called first so that `word2weight` is set.
        """
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [None]:
# Extra-trees classifiers (200 trees) on document vectors built from glove_dict:
# plain mean of the word vectors, and the idf-weighted mean
etree_glove_small = Pipeline([("glove vectorizer", MeanEmbeddingVectorizer(glove_dict)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])
etree_glove_small_tfidf = Pipeline([("glove vectorizer", TfidfEmbeddingVectorizer(glove_dict)), 
                        ("extra trees", ExtraTreesClassifier(n_estimators=200))])

In [None]:
# Every candidate pipeline, paired with the name shown in the score table
all_models = [
    ("mult_nb", mult_nb),
    ("mult_nb_tfidf", mult_nb_tfidf),
    ("bern_nb", bern_nb),
    ("bern_nb_tfidf", bern_nb_tfidf),
    ("svc", svc),
    ("svc_tfidf", svc_tfidf),
    ("glove_small", etree_glove_small),
    ("glove_small_tfidf", etree_glove_small_tfidf),
]


# Human-readable label for each of the four target columns
type_indicators = [
    "IE: Introversion (I) / Extroversion (E)",
    "NS: Intuition (N) – Sensing (S)",
    "FT: Feeling (F) - Thinking (T)",
    "JP: Judging (J) – Perceiving (P)",
]

# For each axis: 5-fold cross-validate every pipeline and print the mean
# scores, best first
for l, indicator in enumerate(type_indicators):
    print(indicator)
    y = list_personality[:,l]
    unsorted_scores = [
        (name, cross_val_score(model, X, y, cv=5).mean())
        for name, model in all_models
    ]
    scores = sorted(unsorted_scores, key=lambda x: -x[1])
    print (tabulate(scores, floatfmt=".4f", headers=("model", 'score')))