<a href="https://colab.research.google.com/github/paruliansaragi/NLP-Microservices/blob/master/PersonalityFlask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip show tensorflow

Name: tensorflow
Version: 1.12.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: opensource@google.com
License: Apache 2.0
Location: /usr/local/lib/python3.6/dist-packages
Requires: grpcio, absl-py, wheel, six, protobuf, astor, termcolor, tensorboard, keras-preprocessing, numpy, keras-applications, gast
Required-by: stable-baselines, magenta, fancyimpute


In [0]:
!pip install tensorflow==1.12.0



In [0]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [0]:
embed_size = 50 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use

In [0]:
from sklearn.model_selection import train_test_split

df = pd.read_csv('mbti_1.csv')

In [0]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df.type)

LabelEncoder()

In [0]:
labels = le.transform(df.type)

In [0]:
df['Label'] = labels

In [0]:
trn, test = train_test_split(df, test_size=0.1)

In [0]:
list_sentences_train = trn["posts"].fillna("_na_").values
y = trn["Label"].values
list_sentences_test = test["posts"].fillna("_na_").values

In [0]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [0]:
y_train = keras.utils.to_categorical(trn.Label, len(df.Label.unique()))

In [0]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip  

--2019-01-15 17:27:01--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-01-15 17:27:01--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-01-15 17:27:54 (15.7 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
path = './'
EMBEDDING_FILE=f'{path}glove.6B.50d.txt'

In [0]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

In [0]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
#emb_mean,emb_std

(0.020940498, 0.6441043)

In [0]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [0]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(16, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#Future idea: try drop connections as a better method of regularization than dropout.

In [0]:
model.fit(X_t, y_train, batch_size=32, epochs=5, validation_split=0.1);

Train on 7026 samples, validate on 781 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [0]:
y_test = model.predict([X_te], batch_size=1024, verbose=1); y_test



array([[0.02264689, 0.04949522, 0.02345018, ..., 0.02069343, 0.0127435 ,
        0.02279945],
       [0.02030412, 0.07793358, 0.0273341 , ..., 0.0313096 , 0.02425092,
        0.03092637],
       [0.02189407, 0.06478137, 0.02929332, ..., 0.02232615, 0.03018474,
        0.03889241],
       ...,
       [0.02691584, 0.09123602, 0.03991226, ..., 0.04305905, 0.02982636,
        0.04347808],
       [0.02677807, 0.06336863, 0.02947936, ..., 0.03010734, 0.02793651,
        0.0355185 ],
       [0.02138883, 0.07849089, 0.02865112, ..., 0.02852133, 0.0248575 ,
        0.03313944]], dtype=float32)

In [0]:
example = ['Masters, in what I will not reveal, lets keep']

In [0]:
test.head()

Unnamed: 0,type,posts,Label
8075,INFP,"'Masters, in what I will not reveal, lets keep...",9
5656,ENTJ,'I find it a lot easier to believe in God than...,2
4330,ISFP,'1: ISFP 2: Girl 3: youngest 4: I'm a visual-k...,13
4058,INTP,'One thing I find funny about rereading this t...,11
3263,INFP,'I'm with you. There are a few animals have a ...,9


In [0]:
#tokenizer.fit_on_texts(list(list_sentences_train))
#list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_example = tokenizer.texts_to_sequences(example)
#X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_example = pad_sequences(list_tokenized_example, maxlen=maxlen)

In [0]:
preds = model.predict([X_example])

In [0]:
preds.shape

(1, 16)

In [0]:
np.argmax(preds), preds

(9, array([[0.04392968, 0.15670307, 0.04526794, 0.07806492, 0.01431173,
         0.0159147 , 0.00726947, 0.01906331, 0.26070344, 0.35163456,
         0.10886131, 0.12288445, 0.0311536 , 0.04750443, 0.05205884,
         0.049218  ]], dtype=float32))

In [0]:
list_tokenized_example

[[3959, 11, 33, 1, 87, 21, 3758, 1705, 276]]

In [0]:
df.type.value_counts()

INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: type, dtype: int64

In [0]:
df.Label.value_counts()

9     1832
8     1470
11    1304
10    1091
3      685
1      675
15     337
13     271
2      231
14     205
0      190
12     166
7       89
5       48
4       42
6       39
Name: Label, dtype: int64

In [0]:
model.save('personality-detect.h5')

In [0]:
from keras.models import load_model
model = load_model('personality-detect.h5')

In [0]:
preds = model.predict([X_example])

In [0]:
np.argmax(preds), preds

(9, array([[0.04392968, 0.15670307, 0.04526794, 0.07806492, 0.01431173,
         0.0159147 , 0.00726947, 0.01906331, 0.26070344, 0.35163456,
         0.10886131, 0.12288445, 0.0311536 , 0.04750443, 0.05205884,
         0.049218  ]], dtype=float32))

In [0]:
model.save_weights('pd-weights.h5')

In [0]:
model.load_weights('pd-weights.h5') 

In [0]:
preds = model.predict([X_example]); np.argmax(preds), preds

(9, array([[0.04392968, 0.15670307, 0.04526794, 0.07806492, 0.01431173,
         0.0159147 , 0.00726947, 0.01906331, 0.26070344, 0.35163456,
         0.10886131, 0.12288445, 0.0311536 , 0.04750443, 0.05205884,
         0.049218  ]], dtype=float32))

In [0]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [0]:
df.type.values

array(['INFJ', 'ENTP', 'INTP', ..., 'INTP', 'INFP', 'INFP'], dtype=object)

In [0]:
def f(x):
            return {
                9: 'INFP',
                '8':'INFJ',
                '11':'INTP',
                '10':'INTJ',
                '3':'ENTP',
                '1':'ENFP',
                '15':'ISTP',
                '13':'ISFP',
                '2':'ENTJ',
                '14':'ISTJ',
                '0':'ENFJ',
                '12':'ISFJ',
                '7':'ESTP',
                '5':'ESFP',
                '4':'ESFJ',
                '6':'ESTJ'
            }[x]

In [0]:
f(np.argmax(preds))

'INFP'