This notebook contains baseline model for personality prediction  using MBTI dataset. 

The Myers Briggs Type Indicator (or MBTI for short) is a personality type system that divides everyone into 16 distinct personality types across 4 axis:
* Introversion (I) – Extroversion (E)
* Intuition (N) – Sensing (S)
* Thinking (T) – Feeling (F)
* Judging (J) – Perceiving (P)

In the dataset, there are 8600 rows of data. Each row contains a person's MBTI personality class and the last 50 things that he/she posted in PersonalityCafe Forum. 
Instead of 16 classes, binary classifiers trained for 4 classes as : I-E, N-S, T-F and J-P

In [23]:
# import libraries 
import json 
import numpy as np
seed = 123
np.random.seed(seed)
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import balanced_accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import learning_curve
import seaborn as sns 
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras import layers
import zipfile
from keras.preprocessing import text, sequence
from keras.layers import Dense, LSTM, Bidirectional, GRU, SimpleRNN
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from os.path import join
import pandas as pd

In [39]:
# Define constants 
data_folder = "/"
raw_data_file = "mbti_1.csv"
punc_data_file = join(data_folder,"preprocessed_data_punc_4class.json")
links_data_file = join(data_folder,"preprocessed_data_links_4class.json")
letter_data_file = join(data_folder,"preprocessed_data_letters_4class.json")
type_data_file = join(data_folder,"preprocessed_data_type_4class.json")
prep_data_file = join(data_folder,"preprocessed_data_all_4class.json")
none_data_file = join(data_folder,"preprocessed_data_none_4class.json")

n_splits = 5 # number of splits for cross-validation
shuffle_flag = True 

scoring = {'acc': 'accuracy',
           'neg_log_loss': 'neg_log_loss',
           'f1_micro': 'f1_micro'}


In [25]:
# define functions 
def read_dataset(filepath):
    with open(filepath) as fp:
        return json.load(fp)
        
plt.style.use('ggplot')

def plot_history(history):
    acc = None 
    val_acc = None 
    loss = None 
    val_loss = None 
    if "accuracy" in history.history.keys():
        acc = history.history['accuracy']
    elif "acc" in history.history.keys():
        acc = history.history['acc']
    if "val_accuracy" in history.history.keys():  
        val_acc = history.history['val_accuracy']
    if 'loss' in history.history.keys():
        loss = history.history['loss']
    if 'val_loss' in history.history.keys():
        val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    title = ""
    if acc:
        plt.plot(x, acc, 'b', label='Training acc')
        title += "Training" 
    if val_acc: 
        plt.plot(x, val_acc, 'r', label='Validation acc')
        title += " and validation"
    title += " accuracy"
    plt.title(title)
    plt.legend()
    plt.subplot(1, 2, 2)
    title = ""
    if loss: 
        plt.plot(x, loss, 'b', label='Training loss')
        title += "Training"
    if val_loss: 
        plt.plot(x, val_loss, 'r', label='Validation loss')
        title += " and validation"
    title += " loss"
    plt.title(title)
    plt.legend()


## Prepare data 

In [35]:
def prepare_data_tfidf(data_file, max_features= 5000, json_flag=False):
    """ for a given datafile reads the content and returns 
      posts and labels 
      @data_file: string, the path of the file 
      @json_flag: bool, True if given file is in json format
      @max_features: int, number of features in tf-idf
      Returns 
      @X_train_cnt: tf-idf vectors of training posts  
      @X_tes_cnt: tf-idf vectors of test posts
      @y_train: int, train labels 
      @y_test: int, test labels
    """
    if json_flag:
        dataset = read_dataset(data_file)
        posts = list(dataset['posts'])
        labels = list(dataset['types'])
        X = np.array(posts)
        y = np.array(labels)
    else:
        dataset = pd.read_csv(data_file)
        posts = list(dataset['posts'].values)
        labels = list(dataset['types'].values)
        X = np.array(dataset['posts'].values)
        y = np.array(dataset[['I-E','S-N','T-F','J-P']].values)

    print("The number of data %d " %(len(posts)))
    print("Example from dataset")
    print(posts[0][0:50]) 
    print("For type %s" %(labels[0]))
    posts = [p.lower() for p in posts]
    #return posts, labels 

    # train-test split 

    X_train, X_test, y_trains, y_tests  = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=seed)
    
    print("Number of examples in train set %d " %(len(X_train)))
    print("Number of examples in test set %d " %(len(X_test)))

    # create a matrix of numbers to represent posts
    tfidf2 = CountVectorizer(ngram_range=(1, 1), 
                         stop_words='english',
                         lowercase = False, 
                         max_features = max_features)

    X_train_cnt = tfidf2.fit_transform(X_train)
    X_test_cnt = tfidf2.transform(X_test)
    
    
    return X_train_cnt, X_test_cnt, y_trains, y_tests

In [41]:
def prepare_data(data_file, 
                 max_sent_len=100, 
                 json_flag=False,
                 padding_place='post',
                 max_nb_words = 50000,
                 test_size=0.2):
    """ for a given datafile reads the content and returns 
      posts and labels 
      @data_file: string, the path of the file 
      @json_flag: bool, True if given file is in json format
      @max_features: int, number of features in tf-idf
      Returns 
      @X_train_cnt: tf-idf vectors of training posts  
      @X_tes_cnt: tf-idf vectors of test posts
      @y_train: int, train labels 
      @y_test: int, test labels
    """
    if json_flag:
        dataset = read_dataset(data_file)
        posts = list(dataset['posts'])
        labels = list(dataset['types'])
    else:
        dataset = pd.read_csv(data_file)
        posts = list(dataset['posts'].values)
        labels = list(dataset['types'].values)

    print("The number of data %d " %(len(posts)))
    print("Example from dataset")
    print(posts[0][0:50]) 
    print("For type %s" %(labels[0]))
    posts = [p.lower() for p in posts]
    #return posts, labels

    # get the vocabulary 
    vocabulary = []
    for p in posts:
        for word in p.split():
            vocabulary.append(word)
        if len(p.split())> max_sent_len:
            max_sent_len = len(p.split())  
    vocabulary = set(vocabulary)
    print("The vocabulary size is  %d " %(len(vocabulary)))
    print("The maximum post length is %d " %(max_sent_len))

    tokenizer = text.Tokenizer(num_words=max_nb_words,
                               lower=True,
                               split=" ")
    tokenizer.fit_on_texts(posts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    tokenized_X = tokenizer.texts_to_sequences(posts)
    print("Example from tokenized X ")
    print(tokenized_X[0][0:50])

    padded_X = sequence.pad_sequences(tokenized_X,
                                      maxlen=max_sent_len, 
                                      padding=padding_place)
    print('Shape of data tensor after tokenization and padding:', padded_X.shape)

    vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
    print("Vocabulary Size %d " %(vocab_size))

    # train-test split for deep learning methods

    #y_binarized_labels = to_categorical(labels)

    X_train_pad, X_test_pad, y_train_pads, y_test_pads = train_test_split(padded_X,
                                                                        labels, 
                                                                        test_size=test_size, 
                                                                        random_state=seed)
    print("Number of examples in train set %d " %(len(X_train_pad)))
    print("The shape of X training tensor (%d,%d) " %(X_train_pad.shape))
    print("The shape of y training tensor (%d) " %(len(y_train_pads)))

    print("Number of examples in test set %d " %(len(X_test_pad)))
    print("The shape of X test tensor (%d,%d)" %(X_test_pad.shape))
    return X_train_pad, X_test_pad, y_train_pads, y_test_pads , vocab_size, tokenizer

In [27]:
def overall_accuracy(y_preds_all,y_tests):
  # reverse the numbers as letters 
  # if types start I -> 1, if starts with E->0 
  # if types start S -> 1, if starts with N->0 
  # if types start T -> 1, if starts with F->0 
  # if types start J -> 1, if starts with P->0 
  predictions = []
  true_y = []
  model_names = ['IE','SN','TF', 'JP']

  # y_tests = [y_test_ie, y_test_sn, y_test_tf, y_test_jp]

  for i in range(4):
    y_preds_sub = y_preds_all[i]
    y_preds_letter = np.empty(len(y_preds_sub), dtype=object)
    
    y_preds_letter[y_preds_sub>=0.5] = model_names[i][0]
    y_preds_letter[y_preds_sub<=0.5] = model_names[i][1]

    y_true_letter = np.empty(len(y_tests[:,i]),dtype=object) 
    y_true_letter[y_tests[:,i]==1] = model_names[i][0]
    y_true_letter[y_tests[:,i]==0] = model_names[i][1]

    predictions.append(y_preds_letter)
    true_y.append(y_true_letter)

  # concate the class names 
  predictions = pd.DataFrame(predictions).transpose()
  predictions['type'] = predictions.agg(lambda x: f"{x[0]}{x[1]}{x[2]}{x[3]}", axis=1)
  true_y = pd.DataFrame(true_y).transpose()
  true_y['type'] = true_y.agg(lambda x: f"{x[0]}{x[1]}{x[2]}{x[3]}", axis=1)
  return sum(true_y['type'] == predictions['type'])/len(true_y)

In [31]:
X_train_cnt, X_test_cnt, y_trains, y_tests = prepare_data_tfidf(letter_data_file)

The number of data 8675 
Example from dataset
  http   www youtube com watch v qsxhcwe krw http 
For type INFJ
Number of examples in train set 6940 
Number of examples in test set 1735 


In [42]:
X_train,X_test,y_train,y_test,vocab_size,tokenizer =  prepare_data(letter_data_file,json_flag=False)

The number of data 8675 
Example from dataset
  http   www youtube com watch v qsxhcwe krw http 
For type INFJ
The vocabulary size is  121112 
The maximum post length is 956 
Found 121112 unique tokens.
Example from tokenized X 
[11, 30, 31, 12, 26, 27, 11, 395, 451, 12, 451, 9101, 151, 84, 56, 293, 11, 30, 31, 12, 26, 27, 11809, 227, 651, 11810, 552, 1651, 243, 11, 30, 31, 12, 26, 27, 5690, 35, 1318, 143, 35, 11, 30, 31, 12, 26, 27, 11, 30, 31, 12]
Shape of data tensor after tokenization and padding: (8675, 956)
Vocabulary Size 121113 
Number of examples in train set 6940 
The shape of X training tensor (6940,956) 
The shape of y training tensor (6940) 
Number of examples in test set 1735 
The shape of X test tensor (1735,956)


## Naive Bayes 

In [58]:
# Try for the first class as I-E
# I-> 1 and E-> 0 
model = MultinomialNB()
kfolds = StratifiedKFold(n_splits=n_splits, shuffle=shuffle_flag, random_state=seed)

results_nb = cross_validate(model, X_train_cnt,y_trains[:,0], cv=kfolds, 
                          scoring=scoring, n_jobs=-1)

In [59]:
print("CV Accuracy: {:0.4f} (+/- {:0.4f})".format(np.mean(results_nb['test_acc']),
                                                          np.std(results_nb['test_acc'])))

print("CV F1: {:0.4f} (+/- {:0.4f})".format(np.mean(results_nb['test_f1_micro']),
                                                          np.std(results_nb['test_f1_micro'])))

print("CV Logloss: {:0.4f} (+/- {:0.4f})".format(np.mean(-1*results_nb['test_neg_log_loss']),
                                                          np.std(-1*results_nb['test_neg_log_loss'])))

CV Accuracy: 0.7183 (+/- 0.0116)
CV F1: 0.7183 (+/- 0.0116)
CV Logloss: 2.3598 (+/- 0.1135)


In [60]:
# fit 4 models 
model_names = ['IE','SN','TF', 'JP']

for i in range(4):
    print("Model for classes %s " %(model_names[i]))
    model = MultinomialNB()
    model.fit(X_train_cnt,y_trains[:,i])
    y_preds = model.predict(X_test_cnt)
    accuracy = balanced_accuracy_score(y_tests[:,i],y_preds)
    print("Model :", model_names[i], " Accuracy for test data {:0.4f}".format(accuracy))


Model for classes IE 
Model : IE  Accuracy for test data 0.6666
Model for classes SN 
Model : SN  Accuracy for test data 0.6770
Model for classes TF 
Model : TF  Accuracy for test data 0.7867
Model for classes JP 
Model : JP  Accuracy for test data 0.6458


## Logistic Regression

In [61]:
# Test for IE class
# I-> 1 and E->0
model_lr = LogisticRegression(solver = 'sag',multi_class='multinomial',max_iter=400,class_weight="balanced", C=0.005)
kfolds = StratifiedKFold(n_splits=n_splits, shuffle=shuffle_flag, random_state=seed)

results_logrec = cross_validate(model_lr, X_train_cnt,y_trains[:,0], cv=kfolds, 
                          scoring=scoring, n_jobs=-1)

In [62]:
print("CV Accuracy: {:0.4f} (+/- {:0.4f})".format(np.mean(results_logrec['test_acc']),
                                                          np.std(results_logrec['test_acc'])))

print("CV F1: {:0.4f} (+/- {:0.4f})".format(np.mean(results_logrec['test_f1_micro']),
                                                          np.std(results_logrec['test_f1_micro'])))

print("CV Logloss: {:0.4f} (+/- {:0.4f})".format(np.mean(-1*results_logrec['test_neg_log_loss']),
                                                          np.std(-1*results_logrec['test_neg_log_loss'])))

CV Accuracy: 0.7303 (+/- 0.0085)
CV F1: 0.7303 (+/- 0.0085)
CV Logloss: 0.5591 (+/- 0.0186)


In [63]:
# fit 4 models 
model_names = ['IE','SN','TF', 'JP']

for i in range(4):
    print("Model for classes %s " %(model_names[i]))
    model = LogisticRegression(solver = 'sag',multi_class='multinomial',max_iter=400,class_weight="balanced", C=0.005) 

    model.fit(X_train_cnt,y_trains[:,i])
    y_preds = model.predict(X_test_cnt)
    accuracy = balanced_accuracy_score(y_tests[:,i],y_preds)
    print("Model :", model_names[i], " Accuracy for test data {:0.4f}".format(accuracy))



Model for classes IE 
Model : IE  Accuracy for test data 0.6769
Model for classes SN 
Model : SN  Accuracy for test data 0.6556
Model for classes TF 
Model : TF  Accuracy for test data 0.7989
Model for classes JP 
Model : JP  Accuracy for test data 0.6558


## Find overall Accuracy

In [127]:
# fit 4 models 
model_names = ['IE','SN','TF', 'JP']
y_preds_all = []

model = LogisticRegression(solver = 'sag',multi_class='multinomial',max_iter=400,class_weight="balanced", C=0.005) 
for i in range(4):
    model.fit(X_train_cnt,y_trains[:,i])
    y_preds = model.predict(X_test_cnt)
    y_preds_all.append(y_preds)
    accuracy = balanced_accuracy_score(y_tests[:,i],y_preds)
    print("Model :", model_names[i], " Accuracy for test data {:0.4f}".format(accuracy))


Model : IE  Accuracy for test data 0.6769
Model : SN  Accuracy for test data 0.6556
Model : TF  Accuracy for test data 0.7989
Model : JP  Accuracy for test data 0.6558


In [128]:
overall_accuracy(y_preds_all,y_tests)

0.3273775216138329

## Deep Learning with Tfidf 

In [94]:
# define baseline model
def baseline_model(input_dim,output_dim,hidden_dim):
	# create model
	model = Sequential()
	model.add(Dense(hidden_dim, input_dim=input_dim, activation='relu'))
	model.add(Dense(output_dim, activation='sigmoid'))
	# Compile model
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

In [95]:
input_dim = X_train_cnt.shape[1]
output_dim = 1 
hidden_dim = 50 
model = baseline_model(input_dim,output_dim,hidden_dim)

history = model.fit(X_train_cnt, y_trains[:,0],
                    epochs=10,
                    verbose=False,
                    batch_size=10,
                    validation_split=0.1)
loss, accuracy = model.evaluate(X_train_cnt,  y_trains[:,0], verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_cnt,  y_tests[:,0], verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))


Training Accuracy: 0.9764
Testing Accuracy:  0.7620


In [130]:
# fit 4 models 
model_names = ['IE','SN','TF', 'JP']
y_preds_all = []
input_dim = X_train_cnt.shape[1]
output_dim = 1 
hidden_dim = 50 

for i in range(4):
    #print("Predicting for model %s " %(model_names[i]))
    model = baseline_model(input_dim,output_dim,hidden_dim)
    model.fit(X_train_cnt,y_trains[:,i],
                    epochs=10,
                    verbose=False,
                    batch_size=50)
    y_preds = model.predict(X_test_cnt)
    y_preds = y_preds.reshape((1735,))
    y_preds_all.append(y_preds)
    loss, accuracy = model.evaluate(X_test_cnt, y_tests[:,i], verbose=False)
    print("Model :", model_names[i], " Accuracy for test data {:0.4f}".format(accuracy))


Model : IE  Accuracy for test data 0.7758
Model : SN  Accuracy for test data 0.8571
Model : TF  Accuracy for test data 0.7873
Model : JP  Accuracy for test data 0.6605


In [131]:
overall_accuracy(y_preds_all,y_tests)

0.35504322766570606

# Using Word Embeddings 


In [14]:
# get the embeddings 
! wget http://nlp.stanford.edu/data/glove.6B.zip
folder = 'glove_folder'
with zipfile.ZipFile("glove.6B.zip","r") as zip_ref:
    zip_ref.extractall(folder)

--2020-06-23 09:11:35--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-06-23 09:11:35--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-06-23 09:11:35--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [15]:
# from https://realpython.com/python-keras-text-classification/ 
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [43]:
embedding_dim = 50
embedding_matrix = create_embedding_matrix(
     'glove_folder/glove.6B.50d.txt',
     tokenizer.word_index, embedding_dim)

In [148]:
def simple_embedding_model(input_dim, hidden_dim, input_length, output_dim):
  model = Sequential()
  model.add(layers.Embedding(input_dim=input_dim, 
                            output_dim=hidden_dim, 
                            input_length=input_length))
  model.add(layers.Flatten())
  model.add(layers.Dense(hidden_dim, activation='relu'))
  model.add(Dense(output_dim, activation='softmax'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  #model.summary()
  return model 

In [150]:
# fit 4 models 
embedding_dim = 50

model_names = ['IE','SN','TF', 'JP']
y_preds_all = []

for i in range(4):
    model = simple_embedding_model(input_dim=vocab_size,
                               hidden_dim=50,
                               input_length = X_train.shape[1],
                               output_dim = 1)
    model.fit(X_train,y_trains[:,i],
                    epochs=5,
                    verbose=False,
                    batch_size=50,
              validation_split=0.1)
    y_preds = model.predict(X_test)
    y_preds = y_preds.reshape((1735,))
    y_preds_all.append(y_preds)
    loss, accuracy = model.evaluate(X_test, y_tests[:,i], verbose=False)
    print("Model :", model_names[i], " Accuracy for test data {:0.4f}".format(accuracy))


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Model : IE  Accuracy for test data 0.7625
Model : SN  Accuracy for test data 0.1262
Model : TF  Accuracy for test data 0.4697
Model : JP  Accuracy for test data 0.3671


In [151]:
overall_accuracy(y_preds_all,y_tests)

0.022478386167146973

## LSTM + Glove Embedding 


In [44]:
def lstm_model_embedding(input_dim,
              hidden_dim,
              embedding_matrix, 
              input_length,
              output_dim, 
              dropout = 0.2,
              learning_rate=0.01):
  model = Sequential()
  model.add(Embedding(input_dim,
                      hidden_dim, 
                      weights=[embedding_matrix], 
                      input_length=input_length,
                      mask_zero=True, 
                      trainable=False))
  model.add(LSTM(hidden_dim, 
                dropout=dropout, 
                recurrent_dropout=dropout, 
                activation='sigmoid',
                kernel_initializer='zeros'))
  model.add(Dense(output_dim, activation='sigmoid'))
  optimizer = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
  model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
  return model

In [None]:
hidden_dim = 50 

model_names = ['IE','SN','TF', 'JP']
y_preds_all = []

for i in range(4):
    model = lstm_model_embedding(vocab_size,
                  hidden_dim,
                  embedding_matrix,
                  X_train.shape[1],
                  1)

    model.fit(X_train,y_trains[:,i],
                    epochs=10,
                    verbose=False,
                    batch_size=50,
              validation_split=0.1)
    y_preds = model.predict(X_test)
    y_preds = y_preds.reshape((1735,))
    y_preds_all.append(y_preds)
    loss, accuracy = model.evaluate(X_test, y_tests[:,i], verbose=False)
    print("Model :", model_names[i], " Accuracy for test data {:0.4f}".format(accuracy))


Model : IE  Accuracy for test data 0.7620
Model : SN  Accuracy for test data 0.8738


## Bert Model

In [4]:
# install ktrain
!pip3 install ktrain



In [5]:
import ktrain
from ktrain import text

In [8]:
def prepare_data_bert(data_file,col_name,json_flag=False):
    if json_flag:
        dataset = read_dataset(data_file)
        posts = list(dataset['posts'])
        labels = list(dataset['types'])
        print("Not implemented for dataframe")
        return 
    else:
        dataset = pd.read_csv(data_file)

    #return posts, labels
    (X_train, y_train), (X_test, y_test), preproc = ktrain.text.texts_from_df(dataset, 
                                                                   'posts', 
                                                                   label_columns=[col_name],
                                                                   maxlen=500, 
                                                                   preprocess_mode='bert')
    return (X_train, y_train), (X_test, y_test), preproc 

In [10]:
train,test,preproc = prepare_data_bert(letter_data_file,'I-E')

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


In [11]:
model = text.text_classifier('bert', train, preproc=preproc)
learner = ktrain.get_learner(model,train_data=train, batch_size=3)

Is Multi-Label? False
maxlen is 500
done.


In [13]:
learner.fit_onecycle(2e-5, 1)



begin training using onecycle policy with max lr of 2e-05...
Train on 7807 samples
  75/7807 [..............................] - ETA: 13:29:52 - loss: 0.7869 - accuracy: 0.7361

KeyboardInterrupt: ignored