# BERT 
https://colab.research.google.com/drive/1ZQvuAVwA3IjybezQOXnrXMGAnMyZRuPU#scrollTo=E_t4cM6KLc98

In [1]:
#
# * File:    Twitter_Persona_GloVe.py
# *
# * Author1:  Pavan Kumar K N (pavankumar.karkekopp@ucalgary.ca)
# * Date:     11th Aug 2019
# * Summary of File:
# * Explore mbti_1.csv file acquired from https://www.kaggle.com/datasnaek/mbti-type
# * Apply state-of-the-art reported publicly
# * Build classifier model that is better using machine learning techniques

#Just making sure the right environment is running this script
import sys
sys.executable

'C:\\ProgramData\\Anaconda3\\python.exe'

In [2]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
# % matplotlib inline

# # Load pre-trained model tokenizer (vocabulary)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [3]:
#Read Data
import numpy as np
import pandas as pd
import sklearn
import re
import pickle

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk import word_tokenize

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder

from numpy import loadtxt
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from imblearn.over_sampling import SMOTE

from sklearn.decomposition import PCA
import pylab as pl
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [4]:
def encode_types(row):
    t=row['type']

    I = 0
    N = 0
    T = 0
    J = 0
    
    if t[0] == 'I': I = 1
    elif t[0] == 'E': I = 0
    else: print('Could not identify label for I-E')
        
    if t[1] == 'N': N = 1
    elif t[1] == 'S': N = 0
    else: print('Could not identify label for N-S')
        
    if t[2] == 'T': T = 1
    elif t[2] == 'F': T = 0
    else: print('Could not identify label for T-F')
        
    if t[3] == 'J': J = 1
    elif t[3] == 'P': J = 0
    else: print('Could not identify label for J-P')
    return pd.Series( {'IE':I, 'NS':N , 'TF': T, 'JP': J }) 

In [5]:
#Function to binarize the types into simple lists instead of pandas.series
personality_binary = {'I':1, 'E':0, 'N':1,'S':0, 'T':1, 'F':0, 'J':1, 'P': 0}
binary_personality = [{1:'I', 0:'E'}, 
                      {1:'N', 0:'S'},
                      {1:'T', 0:'F'},
                      {1:'J', 0:'P'}]

def translate_personality(personality):
    # transform mbti to binary vector
    return [personality_binary[l] for l in personality]


def translate_binary(personality):
    # transform binary vector to mbti personality
    s = ""
    for i, l in enumerate(personality):
        s += binary_personality[i][l]
    return s



In [8]:
def parseMBTI(mbti_file_path): 
    
    
    #List of strings to remove from the corpus
    unique_type_list = ['INFJ', 
                        'ENTP', 
                        'INTP', 
                        'INTJ', 
                        'ENTJ', 
                        'ENFJ', 
                        'INFP', 
                        'ENFP',
                        'ISFP', 
                        'ISTP', 
                        'ISFJ', 
                        'ISTJ', 
                        'ESTP', 
                        'ESFP', 
                        'ESTJ', 
                        'ESFJ']
    list_personality = []
    list_posts = []
    

    
    # Initialize for Lemmatization
    stemmer = PorterStemmer()
    lemmatiser = WordNetLemmatizer()

    #List of unique types of personality
    unique_type_list = [x.lower() for x in unique_type_list]

    #Read file
    mbti_data = pd.read_csv(mbti_file_path)


    raw_posts = mbti_data.posts.values
    filtered_posts = [p.split("|||") for p in raw_posts]
    mbti_data_encoded = mbti_data.join(mbti_data.apply(lambda row: encode_types(row), axis=1))
    
    len_data = len(mbti_data_encoded)
    i=0
    
    
    for row in mbti_data_encoded.iterrows():
        i+=1
        tweets = []

        if (i % 500 == 0 or i == 1 or i == len_data):
            print("%s of %s rows" % (i, len_data))

        ##### Remove and clean comments
        posts = row[1].posts
        
        for tweet_string in posts.split("|||"):
            #Removing mentions
            tweet_string = tweet_string.replace('@username', '')
            #Removing unecessary spaces

            #Removing URL
            tweet_string = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", " ", tweet_string)
            tweet_string = tweet_string.strip()
            tweets.append(tweet_string)

        j=0
        for pos in tweets:
            if pos is not None:
                pos = re.sub("[^a-zA-Z]", " ", pos)
                pos = re.sub(" +", " ", pos).lower()
                pos = " ".join([lemmatiser.lemmatize(w) for w in pos.split(' ')])

                if pos!= " ":
                    tweets[j] = pos
                else:
                    tweets[j] = None

            j += 1

        tweets = list(filter(None, tweets))

        #'Add [SEP] tokens for BERT tokenizer'
        processed_tweets = '[SEP]'.join(tweets)
        list_posts.append(processed_tweets)
        list_personality.append(translate_personality(row[1].type))
    return np.array(list_posts), np.array(list_personality)




#         temp = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', posts)
#         temp = re.sub("[^a-zA-Z]", " ", temp)
#         temp = re.sub(' +', ' ', temp).strip().lower()
#         if remove_stop_words:
#             temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ') if w not in stopwords.words("english")])
#         else:
#             temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ')])
            
#         if remove_mbti_profiles:
#             for t in unique_type_list:
#                 temp = temp.replace(t,"")

#         type_labelized = translate_personality(row[1].type)
#         list_personality.append(type_labelized)
#         list_posts.append(temp)

#     list_posts = np.array(list_posts)
#     list_personality = np.array(list_personality)


In [9]:
list_posts, list_personality = parseMBTI("data/mbti_1.csv")

1 of 8675 rows
500 of 8675 rows
1000 of 8675 rows
1500 of 8675 rows
2000 of 8675 rows
2500 of 8675 rows
3000 of 8675 rows
3500 of 8675 rows
4000 of 8675 rows
4500 of 8675 rows
5000 of 8675 rows
5500 of 8675 rows
6000 of 8675 rows
6500 of 8675 rows
7000 of 8675 rows
7500 of 8675 rows
8000 of 8675 rows
8500 of 8675 rows
8675 of 8675 rows


In [10]:
list_posts[0], list_personality[0]

('enfp and intj moment sportscenter not top ten play prank[SEP]what ha been the most life changing experience in your life [SEP]on repeat for most of today [SEP]may the perc experience immerse you [SEP]the last thing my infj friend posted on his facebook before committing suicide the next day rest in peace [SEP]hello enfj sorry to hear of your distress it s only natural for a relationship to not be perfection all the time in every moment of existence try to figure the hard time a time of growth a [SEP]welcome and stuff [SEP]game set match [SEP]prozac wellbrutin at least thirty minute of moving your leg and i don t mean moving them while sitting in your same desk chair weed in moderation maybe try edible a a healthier alternative [SEP]basically come up with three item you ve determined that each type or whichever type you want to do would more than likely use given each type cognitive function and whatnot when left by [SEP]all thing in moderation sims is indeed a video game and a good o

In [25]:
# Load pre-trained model (weights)
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Put the model in "evaluation" mode, meaning feed-forward operation.
bert_model.eval()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [26]:
#Specify max_token_length according to above
max_token_length = 256
list_posts_vec = []
list_personality_vec = []
list_posts_len = len(list_posts)

count_progress = 0
for posts, personality in zip(list_posts, list_personality):
    
    count_progress += 1
    sentence_list = posts.split('[SEP]')
    running_token_len = 0
    sentence_count = 0
    tweet_stream_vec_list = []
    text_batch_list = []
    text = ""
    
    if (count_progress % 100 == 0 or count_progress == 1 or count_progress == list_posts_len):
            print("{} of {} rows".format(count_progress, list_posts_len))
            
    #Split the sequence of tweets into separate batches of max_token_length
    for sentence in sentence_list:
        
        sentence_len = len(sentence.strip().split(' '))
#         print(" \Sentence Length: {}\n  \nRunning Token Length: {}".format(sentence_len, running_token_len))
        #Case 1: Sentence is smaller than max token_length
        if(sentence_len <= max_token_length):
           
           #Concatenate into single post
            if(running_token_len + sentence_len < max_token_length):
                running_token_len += sentence_len
                text += sentence
            
            else:
                if text!= "":
                    text_batch_list.append(text)
                sentence_count += 1
                running_token_len = sentence_len
                text = sentence
           
        
        else:
            if text!= "":
                text_batch_list.append(text)
            text_len = len(sentence.strip().split(' '))
            
            while(text_len > max_token_length):
                text_batch_list.append( " ".join(sentence.strip().split(' ')[:max_token_length]))
                sentence = " ".join(sentence.strip().split(' ')[max_token_length:])
                text_len = len(sentence.strip().split(' '))
            
            running_token_len = sentence_len
            text = sentence
        
#     print("Total batches: {}\n{}".format(len(text_batch_list), [len(text.strip().split(' ')) for text in text_batch_list]))

    #Process the batches
    for text in text_batch_list:
        marked_text = "[CLS]" + text + "[SEP]"
        tokenized_text = tokenizer.tokenize(marked_text)
#         print(marked_text)
#         print((tokenized_text))
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [1] * len(tokenized_text)
        
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        
#         print(tokens_tensor.shape, segments_tensors.shape)
        # Predict hidden states features for each layer
        with torch.no_grad():
            encoded_layers, _ = bert_model(tokens_tensor, segments_tensors)
            
        token_embeddings = [] 

        batch_i = 0 #Since we have only one sentence
        # For each token in the sentence...
        for token_i in range(len(tokenized_text)):

            # Holds 12 layers of hidden states for each token 
            hidden_layers = [] 

            # For each of the 12 layers...
            for layer_i in range(len(encoded_layers)):

                # Lookup the vector for `token_i` in `layer_i`
                vec = encoded_layers[layer_i][batch_i][token_i]

                hidden_layers.append(vec)

        token_embeddings.append(hidden_layers)

        # Stores the token vectors, with shape [22 x 768]
        token_vecs_sum = []

        # For each token in the sentence...
        for token in token_embeddings:
            # Sum the vectors from the last four layers.
            sum_vec = torch.sum(torch.stack(token)[-4:], 0)

            # Use `sum_vec` to represent `token`.
            token_vecs_sum.append(sum_vec)
        
        sentence_embedding = torch.mean(encoded_layers[11], 1)
        tweet_stream_vec_list.append(sentence_embedding)
        
    #Concatenate the stream vector into one vector to represent the whole stream
    if(len(tweet_stream_vec_list) > 0):
        tweet_stream_vec = torch.mean(torch.stack(tweet_stream_vec_list), dim=0)
        list_posts_vec.append(tweet_stream_vec.numpy().ravel())
        list_personality_vec.append(personality)
    else:
        continue


1 of 8675 rows
100 of 8675 rows
200 of 8675 rows
300 of 8675 rows
400 of 8675 rows
500 of 8675 rows
600 of 8675 rows
700 of 8675 rows
800 of 8675 rows
900 of 8675 rows
1000 of 8675 rows
1100 of 8675 rows
1200 of 8675 rows
1300 of 8675 rows
1400 of 8675 rows
1500 of 8675 rows
1600 of 8675 rows
1700 of 8675 rows
1800 of 8675 rows
1900 of 8675 rows
2000 of 8675 rows
2100 of 8675 rows
2200 of 8675 rows
2300 of 8675 rows
2400 of 8675 rows
2500 of 8675 rows
2600 of 8675 rows
2700 of 8675 rows
2800 of 8675 rows
2900 of 8675 rows
3000 of 8675 rows
3100 of 8675 rows
3200 of 8675 rows
3300 of 8675 rows
3400 of 8675 rows
3500 of 8675 rows
3600 of 8675 rows
3700 of 8675 rows
3800 of 8675 rows
3900 of 8675 rows
4000 of 8675 rows
4100 of 8675 rows
4200 of 8675 rows
4300 of 8675 rows
4400 of 8675 rows
4500 of 8675 rows
4600 of 8675 rows
4700 of 8675 rows
4800 of 8675 rows
4900 of 8675 rows
5000 of 8675 rows
5100 of 8675 rows
5200 of 8675 rows
5300 of 8675 rows
5400 of 8675 rows
5500 of 8675 rows
5600

In [28]:
np.array(list_posts_vec).shape, np.array(list_personality_vec).shape

((8634, 768), (8634, 4))

In [29]:
X_train, X_test, y_train, y_test = train_test_split(np.array(list_posts_vec), np.array(list_personality_vec), test_size=0.2, random_state=42)

### Helper Functions

In [30]:
def rmse(y, y_pred):
        return np.sqrt(np.mean((y_pred - y)**2))

def deep_model(model, X_train, y_train, X_valid, y_valid):
    '''
    Function to train a multi-class model. The number of epochs and 
    batch_size are set by the constants at the top of the
    notebook. 
    
    Parameters:
        model : model with the chosen architecture
        X_train : training features
        y_train : training target
        X_valid : validation features
        Y_valid : validation target
    Output:
        model training history
    '''
    model.compile(optimizer='rmsprop'
                  , loss='categorical_crossentropy'
                  , metrics=['accuracy'])
    
    history = model.fit(X_train
                       , y_train
                       , epochs=NB_START_EPOCHS
                       , batch_size=BATCH_SIZE
                       , validation_data=(X_valid, y_valid)
                       , verbose=1)
    return history


def eval_metric(history, metric_name):
    '''
    Function to evaluate a trained model on a chosen metric. 
    Training and validation metric are plotted in a
    line chart for each epoch.
    
    Parameters:
        history : model training history
        metric_name : loss or accuracy
    Output:
        line chart with epochs of x-axis and metric on
        y-axis
    '''
    metric = history.history[metric_name]
    val_metric = history.history['val_' + metric_name]

    e = range(1, NB_START_EPOCHS + 1)

    plt.plot(e, metric, 'bo', label='Train ' + metric_name)
    plt.plot(e, val_metric, 'b', label='Validation ' + metric_name)
    plt.legend()
    plt.show()

def test_model(model, X_train, y_train, X_test, y_test, epoch_stop):
    '''
    Function to test the model on new data after training it
    on the full training data with the optimal number of epochs.
    
    Parameters:
        model : trained model
        X_train : training features
        y_train : training target
        X_test : test features
        y_test : test target
        epochs : optimal number of epochs
    Output:
        test accuracy and test loss
    '''
    model.fit(X_train
              , y_train
              , epochs=epoch_stop
              , batch_size=BATCH_SIZE
              , verbose=0)
    results = model.evaluate(X_test, y_test)
    
    return results

## Classifiers

In [31]:
type_indicators = [ "IE: Introversion (I) / Extroversion (E)", "NS: Intuition (N) – Sensing (S)", 
                   "FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"  ]

for l in range(len(type_indicators)):
    print(type_indicators[l])
    print(y_test[:,l])

IE: Introversion (I) / Extroversion (E)
[1 1 1 ... 1 1 1]
NS: Intuition (N) – Sensing (S)
[1 1 1 ... 1 0 1]
FT: Feeling (F) - Thinking (T)
[0 0 0 ... 1 1 0]
JP: Judging (J) – Perceiving (P)
[0 0 1 ... 0 0 0]


In [None]:
for l in range(len(type_indicators)):
    print("\n\n{} ...".format(type_indicators[l]))
    
    y_train_class = y_train[:,l]
    y_test_class = y_test[:,l]
    
    print(X_train.shape, y_train_class.shape)
    seed = 7    
    model = XGBClassifier(learning_rate=0.01,
                             n_estimators=100,
                             max_depth=6,
                             min_child_weight=6,
                             colsample_bytree=0.7,
                             objective='reg:logistic',
                             nthread=8,
                             scale_pos_weight=1,
                             seed=7)

    model.fit(X_train, y_train_class)
    
    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    prediction_proba = [value for value in y_pred]
#     # evaluate predictions
    accuracy = sklearn.metrics.accuracy_score(y_test_class, predictions)
    f1_score_measure = sklearn.metrics.f1_score(y_test_class, predictions)
    precision_measure = sklearn.metrics.precision_score(y_test_class, predictions)
    recall_measure = sklearn.metrics.recall_score(y_test_class, predictions)
    auc_roc = sklearn.metrics.roc_auc_score(y_test_class, predictions)
#     mae = sklearn.metrics.mean_absolute_error(y_test_class, prediction_proba)
    print(" Accuracy: {:.3f} Precision: {:.3f} Recall {:.3f} F1-score {:.3f} ROC-AUC {:.3f}".format(accuracy, 
                                                                                                    precision_measure, 
                                                                                                    recall_measure,
                                                                                                   auc_roc))
    

In [54]:
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor, AdaBoostRegressor, AdaBoostClassifier
def rmse(y, y_pred):
        return np.sqrt(np.mean((y_pred - y)**2))
rmse_scorer = sklearn.metrics.make_scorer(rmse, greater_is_better=False)


In [52]:
for l in range(len(type_indicators)):
    print("\n\n{} ...".format(type_indicators[l]))
    
    y_train_class = y_train[:,l]
    y_test_class = y_test[:,l]
    
    print(X_train.shape, y_train_class.shape)
    seed = 7    
    
    model = ExtraTreesClassifier(n_estimators=100)

    model.fit(X_train, y_train_class)
    
    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
#     # evaluate predictions
#     accuracy = sklearn.metrics.accuracy_score(y_test_class, predictions)
#     f1_score_measure = sklearn.metrics.f1_score(y_test_class, predictions)
    rmse_val = rmse(y_test_class, predictions)
    scoring = {'acc': 'accuracy',
               'prec': 'precision_macro',
               'rec': 'recall_macro',
               'f1': sklearn.metrics.make_scorer(sklearn.metrics.f1_score),
               'roc_auc': sklearn.metrics.make_scorer(sklearn.metrics.roc_auc_score)}
    rmse_cv_5 = cross_validate(model, np.array(list_posts_vec), np.array(list_personality_vec)[:,l], cv=5, scoring=scoring)
    rmse_cv_10 = cross_validate(model,np.array(list_posts_vec), np.array(list_personality_vec)[:,l], cv=10, scoring=scoring)
    print(" RMSE test: {} RMSE cv5: {} RMSE cv10: {}".format(rmse_val, rmse_cv_5, rmse_cv_10))



IE: Introversion (I) / Extroversion (E) ...
(6907, 768) (6907,)


  'precision', 'predicted', average, warn_for)


 RMSE test: 0.4788521306805733 RMSE cv5: {'fit_time': array([2.52526259, 2.5362339 , 2.58612847, 2.56715107, 2.50232363]), 'score_time': array([0.194628  , 0.18550563, 0.18859291, 0.18550563, 0.18550563]), 'test_acc': array([0.77141204, 0.76909722, 0.76882966, 0.76825029, 0.77056779]), 'train_acc': array([1., 1., 1., 1., 1.]), 'test_prec': array([0.73591385, 0.63501742, 0.61635008, 0.6080536 , 0.67760599]), 'train_prec': array([1., 1., 1., 1., 1.]), 'test_rec': array([0.50764326, 0.50263073, 0.50490215, 0.50628519, 0.50691144]), 'train_rec': array([1., 1., 1., 1., 1.]), 'test_f1': array([0.87036429, 0.86922321, 0.86879316, 0.86824769, 0.86982249]), 'train_f1': array([1., 1., 1., 1., 1.]), 'test_roc_auc': array([0.50764326, 0.50263073, 0.50490215, 0.50628519, 0.50691144]), 'train_roc_auc': array([1., 1., 1., 1., 1.])} RMSE cv10: {'fit_time': array([2.89527583, 2.91923833, 2.95810771, 2.92124009, 2.94414306,
       2.9192121 , 2.96907949, 2.89029241, 2.9714458 , 2.93331194]), 'score_time

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


 RMSE test: 0.37818326577603667 RMSE cv5: {'fit_time': array([2.69784021, 2.65193081, 2.67683196, 2.72273397, 2.67887926]), 'score_time': array([0.180516  , 0.17752552, 0.18151641, 0.18450737, 0.17952895]), 'test_acc': array([0.86111111, 0.86284722, 0.86210892, 0.86326767, 0.86210892]), 'train_acc': array([1., 1., 1., 1., 1.]), 'test_prec': array([0.43105446, 0.9313839 , 0.43130435, 0.9315942 , 0.43130435]), 'train_prec': array([1., 1., 1., 1., 1.]), 'test_rec': array([0.49932886, 0.50210084, 0.4996642 , 0.5021097 , 0.4996642 ]), 'train_rec': array([1., 1., 1., 1., 1.]), 'test_f1': array([0.92537313, 0.92632888, 0.92594897, 0.92657125, 0.92594897]), 'train_f1': array([1., 1., 1., 1., 1.]), 'test_roc_auc': array([0.49932886, 0.50210084, 0.4996642 , 0.5021097 , 0.4996642 ]), 'train_roc_auc': array([1., 1., 1., 1., 1.])} RMSE cv10: {'fit_time': array([3.29728341, 3.1528511 , 3.18051481, 3.28326988, 3.17948985,
       3.24984598, 3.27916217, 3.24469447, 3.23298216, 3.31627941]), 'score_tim

array([1, 0, 1, ..., 1, 1, 1])

In [56]:
for l in range(len(type_indicators)):
    print("\n\n{} ...".format(type_indicators[l]))
    
    y_train_class = y_train[:,l]
    y_test_class = y_test[:,l]
    
    print(X_train.shape, y_train_class.shape)
    seed = 7    
    
    model = AdaBoostClassifier(ExtraTreesClassifier(n_estimators=50), 
                              n_estimators=50, 
                              learning_rate =0.1, 
                              random_state=42)

    model.fit(X_train, y_train_class)
    
    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
#     # evaluate predictions
#     accuracy = sklearn.metrics.accuracy_score(y_test_class, predictions)
#     f1_score_measure = sklearn.metrics.f1_score(y_test_class, predictions)
    scoring = {'acc': 'accuracy',
               'prec': 'precision_macro',
               'rec': 'recall_macro',
               'f1': sklearn.metrics.make_scorer(sklearn.metrics.f1_score),
               'roc_auc': sklearn.metrics.make_scorer(sklearn.metrics.roc_auc_score)}
    rmse_cv_5 = cross_validate(model, np.array(list_posts_vec), np.array(list_personality_vec)[:,l], cv=5, scoring=scoring)
    rmse_cv_10 = cross_validate(model,np.array(list_posts_vec), np.array(list_personality_vec)[:,l], cv=10, scoring=scoring)
    print(" RMSE test: {} RMSE cv5: {} RMSE cv10: {}".format(rmse_val, rmse_cv_5, rmse_cv_10))



IE: Introversion (I) / Extroversion (E) ...
(6907, 768) (6907,)
 RMSE test: 0.5884430301808192 RMSE cv5: {'fit_time': array([1.38729429, 1.41123462, 1.39530492, 1.44916201, 1.38034463]), 'score_time': array([0.09973335, 0.10175776, 0.10073161, 0.10172868, 0.09873652]), 'test_acc': array([0.76851852, 0.76851852, 0.76535342, 0.76651217, 0.76651217]), 'train_acc': array([1., 1., 1., 1., 1.]), 'test_prec': array([0.62274029, 0.62534744, 0.57968408, 0.58981318, 0.58144196]), 'train_prec': array([1., 1., 1., 1., 1.]), 'test_rec': array([0.50751597, 0.50926979, 0.50792176, 0.507795  , 0.50603545]), 'train_rec': array([1., 1., 1., 1., 1.]), 'test_f1': array([0.86833443, 0.86816084, 0.86602713, 0.86686488, 0.86704058]), 'train_f1': array([1., 1., 1., 1., 1.]), 'test_roc_auc': array([0.50751597, 0.50926979, 0.50792176, 0.507795  , 0.50603545]), 'train_roc_auc': array([1., 1., 1., 1., 1.])} RMSE cv10: {'fit_time': array([1.63164616, 1.6047461 , 1.60674191, 1.59577036, 1.61272526,
       1.62865

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


 RMSE test: 0.5884430301808192 RMSE cv5: {'fit_time': array([1.48404479, 1.47309756, 1.48706031, 1.48406744, 1.46212959]), 'score_time': array([0.10471725, 0.10073113, 0.09474969, 0.10073423, 0.09886718]), 'test_acc': array([0.86111111, 0.86284722, 0.86384705, 0.86326767, 0.86095017]), 'train_acc': array([1., 1., 1., 1., 1.]), 'test_prec': array([0.5562645 , 0.73180499, 0.93184455, 0.76513832, 0.43122461]), 'train_prec': array([1., 1., 1., 1., 1.]), 'test_rec': array([0.50109413, 0.50563138, 0.50421941, 0.50388361, 0.49899261]), 'train_rec': array([1., 1., 1., 1., 1.]), 'test_f1': array([0.9253267 , 0.92623716, 0.92685963, 0.92652553, 0.9252802 ]), 'train_f1': array([1., 1., 1., 1., 1.]), 'test_roc_auc': array([0.50109413, 0.50563138, 0.50421941, 0.50388361, 0.49899261]), 'train_roc_auc': array([1., 1., 1., 1., 1.])} RMSE cv10: {'fit_time': array([1.71043587, 1.72442722, 1.70148993, 1.68852282, 1.75634456,
       1.72149014, 1.71143341, 1.70352864, 1.72240806, 1.70444608]), 'score_time

In [None]:
for l in range(len(type_indicators)):
    print("\n\n{} ...".format(type_indicators[l]))
    

In [None]:
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out


# Posts in tf-idf representation

# Let's train type indicator individually
for l in range(len(type_indicators)):
    print("\n\n{} ...".format(type_indicators[l]))
    
    # Let's train type indicator individually
    y_train_class = y_train[:,l]
    y_test_class = y_test[:,l]

    # split data into train and test sets
    model = svm.LinearSVR(C=10)

    clf = model.fit(X_train, y_train_class)
    
#     pca = PCA(n_components=2).fit(X_train_seq_trunc)
    
#     pca_2d = pca.transform(X_train_seq_trunc)
    
#     svmClassifier_2d =   svm.LinearSVC(C=10,
#                           class_weight='balanced').fit(   pca_2d, y_train_class)
    
#     for i in range(0, pca_2d.shape[0]):
#         if y_train_res[i] == 0:
#             c1 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='r', s=50,marker='+')
#         elif y_train_res[i] == 1:
#             c2 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='g',    s=50,marker='o')
    
#     pl.legend([c1, c2], [type_indicators[l][0], type_indicators[l][1]])
#     x_min, x_max = pca_2d[:, 0].min() - 1,   pca_2d[:,0].max() + 1
#     y_min, y_max = pca_2d[:, 1].min() - 1,   pca_2d[:, 1].max() + 1
#     xx, yy = np.meshgrid(np.arange(x_min, x_max, .01),   np.arange(y_min, y_max, .01))
#     Z = svmClassifier_2d.predict(np.c_[xx.ravel(),  yy.ravel()])
#     Z = Z.reshape(xx.shape)
#     pl.contour(xx, yy, Z)
#     pl.title('Support Vector Machine Decision Surface')
#     pl.axis('off')
#     pl.show()
    
    # make predictions  for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    # evaluate predictions
#     accuracy = sklearn.metrics.accuracy_score(y_test, predictions)
#     f1_score_measure = sklearn.metrics.f1_score(y_test, predictions)
    rmse_val = rmse(y_test_class, predictions)
    mae = sklearn.metrics.mean_absolute_error(y_test_class, predictions)
    print(" RMSE: {:.3f} MAE: {:.3f}".format(rmse_val, mae))
    