Цель данной работы - при помощи комментария, отзыва или любого другого текста, написанного пользователем, определять социотип по типологии Майерс-Бриггс. Это поможет лучше понимать пользователей и возможно поможет как дополнительный признак настроить рекомендательную систему.

Источник  https://habr.com/ru/company/surfingbird/blog/230103/

kaggle https://www.kaggle.com/datasnaek/mbti-type?select=mbti_1.csv

In [96]:
import pandas as pd

import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools


comments = pd.read_csv("mbti_1.csv")
print(comments.shape)
comments['posts'][0]

(8675, 2)


"'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389  84390  http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg  http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.jpg ...

Как мы видим в постах много ссылок, лишних знаков, данные не очень чистые

In [97]:
comments.head(3)

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...


In [98]:
#from gensim.test.utils import common_texts
#from gensim.corpora.dictionary import Dictionary

#предобработка текстов
import re
import numpy as np
from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize

from razdel import tokenize # https://github.com/natasha/razdel
#!pip install razdel

import pymorphy2  # pip install pymorphy2

In [99]:
stopword_eng = stopwords.words('english')
print(len(stopword_eng))

morph = pymorphy2.MorphAnalyzer()

179


Дополнительно скачала стоп слова из свободных источников

In [100]:
with open('stop_words_english.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_eng += additional_stopwords
len(stopword_eng)

1030

Также убрала сами названия типов из текста

In [101]:
list_of_types = ['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP', 'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP']
for type in list_of_types:
    stopword_eng.append(type.lower())

In [102]:
len(stopword_eng)

1046

In [103]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))
    text = re.sub("\|", ' ', str(text))
    text = re.sub("https?:\/\/\S+", '', str(text))
    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_eng] # [6]
    
    return words_lem_without_stopwords

Протестируем на 1 посте

In [104]:
text = clean_text(comments['posts'][0])
text2 = lemmatization(text)
print(text2)

['moments', 'sportscenter', 'plays', 'pranks', 'lifechanging', 'experience', 'life', 'repeat', 'today', 'perc', 'experience', 'immerse', 'friend', 'posted', 'facebook', 'committing', 'suicide', 'day', 'rest', 'peace', 'hear', 'distress', 'natural', 'relationship', 'perfection', 'time', 'moment', 'existence', 'figure', 'hard', 'times', 'times', 'growth', 'stuff', 'game', 'set', 'match', 'prozac', 'wellbrutin', 'minutes', 'moving', 'legs', 'moving', 'sitting', 'desk', 'chair', 'weed', 'moderation', 'edibles', 'healthier', 'alternative', 'basically', 'items', 'determined', 'type', 'types', 'types', 'cognitive', 'functions', 'whatnot', 'left', 'moderation', 'sims', 'video', 'game', 'good', 'note', 'good', 'subjective', 'completely', 'promoting', 'death', 'sim', 'dear', 'favorite', 'video', 'games', 'growing', 'current', 'favorite', 'video', 'games', 'cool', 'appears', 'late', 'sad', 'wait', 'thought', 'confidence', 'good', 'cherish', 'time', 'solitude', 'bc', 'revel', 'time', 'workin', 'en

In [105]:
%%time
#Запускаем очистку текста
comments['posts'] = comments['posts'].apply(lambda x: clean_text(x), 1)

Wall time: 26.8 s


In [106]:
%%time
#Запускаем лемматизацию текста
comments['posts'] = comments['posts'].apply(lambda x: lemmatization(x), 1)

Wall time: 3min 22s


Посмотрим результат

In [107]:
comments['posts']

0       [moments, sportscenter, plays, pranks, lifecha...
1       [finding, lack, posts, alarming, sex, boring, ...
2       [good, blessing, curse, absolutely, positive, ...
3       [dear, enjoyed, conversation, day, esoteric, g...
4       [fired, silly, misconception, approaching, log...
                              ...                        
8670    [ixfp, cats, fi, doms, reason, websites, neo, ...
8671    [soif, thread, exists, someplace, heck, delete...
8672    [questions, purple, pill, pick, winning, lotte...
8673    [conflicted, wanting, children, honestly, mate...
8674    [long, personalitycafe, changed, bit, good, tu...
Name: posts, Length: 8675, dtype: object

Создадим текст на основе очищенных слов (делаем join)

In [108]:
comments['posts'] = comments['posts'].apply(lambda x: ' '.join(x), 1)

In [109]:
comments['posts']

0       moments sportscenter plays pranks lifechanging...
1       finding lack posts alarming sex boring positio...
2       good blessing curse absolutely positive friend...
3       dear enjoyed conversation day esoteric gabbing...
4       fired silly misconception approaching logicall...
                              ...                        
8670    ixfp cats fi doms reason websites neo nazis pe...
8671    soif thread exists someplace heck delete ooops...
8672    questions purple pill pick winning lottery num...
8673    conflicted wanting children honestly maternal ...
8674    long personalitycafe changed bit good turn doc...
Name: posts, Length: 8675, dtype: object

In [110]:
comments['type'].value_counts() # данные, к сожалению, не сбалансированы

INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: type, dtype: int64

In [111]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(comments, comments['type'], random_state=0)

#### Многоклассовая классификация с LogisticRegression

In [112]:
#соберем наш простой pipeline, но нам понадобится написать класс для выбора нужного поля
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]

pipeline = Pipeline([('title_selector', FeatureSelector(column='posts')), 
                     ('title_tfidf', TfidfVectorizer()), 
                     ('clf', LogisticRegression())])

In [113]:
#обучим наш пайплайн
pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('title_selector', FeatureSelector(column='posts')),
                ('title_tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=N...
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                   

In [114]:
#наши прогнозы для тестовой выборки
preds = pipeline.predict(X_test)
preds[:10]

array(['INFP', 'INFP', 'INFP', 'INTP', 'INFP', 'INFP', 'INFP', 'INFJ',
       'INTJ', 'INFJ'], dtype=object)

In [115]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score

In [116]:
models_results = {
    'approach': [],
    'f1_score_micro': [],
    'f1_score_macro': [],
    'f1_score_weighted': [],
    'precision_score_micro': [],
    'precision_score_macro': [],
    'precision_score_weighted': []
}

In [117]:
models_results['approach'].append('Multi_LogReg')
models_results['f1_score_micro'].append(f1_score(y_test, preds, average='micro'))
models_results['f1_score_macro'].append(f1_score(y_test, preds, average='macro'))
models_results['f1_score_weighted'].append(f1_score(y_test, preds, average='weighted'))
models_results['precision_score_micro'].append(precision_score(y_test, preds, average='micro'))
models_results['precision_score_macro'].append(precision_score(y_test, preds, average='macro'))
models_results['precision_score_weighted'].append(precision_score(y_test, preds, average='weighted'))

print(models_results)

{'approach': ['Multi_LogReg'], 'f1_score_micro': [0.4734900875979714], 'f1_score_macro': [0.1888899973765767], 'f1_score_weighted': [0.4172112154271234], 'precision_score_micro': [0.4734900875979714], 'precision_score_macro': [0.3257913876389222], 'precision_score_weighted': [0.4715278973911036]}


  _warn_prf(average, modifier, msg_start, len(result))


#### Многоклассовая классификация с с GradientBoostingClassifier

In [118]:
from sklearn.ensemble import GradientBoostingClassifier

In [119]:
#соберем наш простой pipeline, но уже с другим классификатором
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]

pipeline = Pipeline([('title_selector', FeatureSelector(column='posts')), 
                     ('title_tfidf', TfidfVectorizer()), 
                     ('clf', GradientBoostingClassifier())])

In [120]:
#обучим наш пайплайн
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('title_selector', FeatureSelector(column='posts')),
                ('title_tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=N...
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_spli

In [121]:
#наши прогнозы для тестовой выборки
preds = pipeline.predict(X_test)
preds[:10]

array(['INTP', 'INFJ', 'ENFP', 'INTJ', 'ENFP', 'INFP', 'INFP', 'ISFJ',
       'INTJ', 'INFJ'], dtype=object)

In [122]:
models_results['approach'].append('Multi_Grad_Boost')
models_results['f1_score_micro'].append(f1_score(y_test, preds, average='micro'))
models_results['f1_score_macro'].append(f1_score(y_test, preds, average='macro'))
models_results['f1_score_weighted'].append(f1_score(y_test, preds, average='weighted'))
models_results['precision_score_micro'].append(precision_score(y_test, preds, average='micro'))
models_results['precision_score_macro'].append(precision_score(y_test, preds, average='macro'))
models_results['precision_score_weighted'].append(precision_score(y_test, preds, average='weighted'))

print(models_results)

{'approach': ['Multi_LogReg', 'Multi_Grad_Boost'], 'f1_score_micro': [0.4734900875979714, 0.4698017519594283], 'f1_score_macro': [0.1888899973765767, 0.3095925559477476], 'f1_score_weighted': [0.4172112154271234, 0.46030986850417094], 'precision_score_micro': [0.4734900875979714, 0.4698017519594283], 'precision_score_macro': [0.3257913876389222, 0.34547026387509105], 'precision_score_weighted': [0.4715278973911036, 0.4699867494759581]}


У GradientBoostingClassifier получились лучшие показатели

#### Многоклассовая классификация с RandomForestClassifier

In [123]:
from sklearn.ensemble import RandomForestClassifier

In [125]:
#теперь соберем pipeline с RandomForestClassifier
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]

pipeline = Pipeline([('title_selector', FeatureSelector(column='posts')), 
                     ('title_tfidf', TfidfVectorizer()), 
                     ('clf', RandomForestClassifier(n_estimators=1000))])

In [126]:
#обучим наш пайплайн и запишем прогнозы
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)

In [127]:
models_results['approach'].append('Multi_RandFor')
models_results['f1_score_micro'].append(f1_score(y_test, preds, average='micro'))
models_results['f1_score_macro'].append(f1_score(y_test, preds, average='macro'))
models_results['f1_score_weighted'].append(f1_score(y_test, preds, average='weighted'))
models_results['precision_score_micro'].append(precision_score(y_test, preds, average='micro'))
models_results['precision_score_macro'].append(precision_score(y_test, preds, average='macro'))
models_results['precision_score_weighted'].append(precision_score(y_test, preds, average='weighted'))

print(models_results)

{'approach': ['Multi_LogReg', 'Multi_Grad_Boost', 'Multi_RandFor'], 'f1_score_micro': [0.4734900875979714, 0.4698017519594283, 0.33148916551406177], 'f1_score_macro': [0.1888899973765767, 0.3095925559477476, 0.09421681326234235], 'f1_score_weighted': [0.4172112154271234, 0.46030986850417094, 0.25297778825380945], 'precision_score_micro': [0.4734900875979714, 0.4698017519594283, 0.33148916551406177], 'precision_score_macro': [0.3257913876389222, 0.34547026387509105, 0.22372854536744724], 'precision_score_weighted': [0.4715278973911036, 0.4699867494759581, 0.43603957808592186]}


  _warn_prf(average, modifier, msg_start, len(result))


### Попробуем пойти другим путем. 16 бинарных подзадач с LogisticRegression

Вместо решения задачи многоклассовой классификации попробуем разделить на 16 бинарных задач и выберем тот тип по итогу, вероятность которого больше всех.

In [138]:
types = pd.get_dummies(comments['type'])
types.head()

Unnamed: 0,ENFJ,ENFP,ENTJ,ENTP,ESFJ,ESFP,ESTJ,ESTP,INFJ,INFP,INTJ,INTP,ISFJ,ISFP,ISTJ,ISTP
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [139]:
list_types = list(types.columns)
print(list_types)

['ENFJ', 'ENFP', 'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP', 'INFJ', 'INFP', 'INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP']


In [140]:
comments_binary = pd.concat([comments['posts'], types], axis=1)
comments_binary.head()

Unnamed: 0,posts,ENFJ,ENFP,ENTJ,ENTP,ESFJ,ESFP,ESTJ,ESTP,INFJ,INFP,INTJ,INTP,ISFJ,ISFP,ISTJ,ISTP
0,moments sportscenter plays pranks lifechanging...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,finding lack posts alarming sex boring positio...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,good blessing curse absolutely positive friend...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,dear enjoyed conversation day esoteric gabbing...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,fired silly misconception approaching logicall...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [141]:
X_train, X_test, y_train, y_test = train_test_split(comments_binary, comments_binary[list_types], random_state=0)

In [142]:
# в цикле обучим pipeline с LogisticRegression, добавляя новую колонку в  df_prob с вероятностью отнесения данного пользователя к определенному типу
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]

df_prob = pd.DataFrame()

for i in list_types:
    
    pipeline = Pipeline([('title_selector', FeatureSelector(column='posts')), 
                         ('title_tfidf', TfidfVectorizer()), 
                         ('clf', LogisticRegression())])
    pipeline.fit(X_train, y_train[i])
    df_prob[i] = pipeline.predict_proba(X_test)[:,1]

y_test нам нужно вернуть в исходное состояние (до get_dummie) для сравнения с прогнозом. Для этого воспользуемся функцией decode.

In [143]:
def decode(row):
    for c in y_test.columns:
        if row[c] == 1:
            return c

In [144]:
y_test_multi = y_test.apply(decode, axis=1)
y_test_multi

4587    ISFP
2786    INFJ
2813    ENFP
3705    INTP
5957    ISFP
        ... 
7256    ISFP
2645    ENTP
4773    ISTP
7242    INFP
6523    INFJ
Length: 2169, dtype: object

In [145]:
df_prob.head()

Unnamed: 0,ENFJ,ENFP,ENTJ,ENTP,ESFJ,ESFP,ESTJ,ESTP,INFJ,INFP,INTJ,INTP,ISFJ,ISFP,ISTJ,ISTP
0,0.024636,0.087665,0.022872,0.047351,0.003814,0.006513,0.004064,0.008422,0.160376,0.269963,0.061839,0.097951,0.022631,0.03175,0.031102,0.032893
1,0.03326,0.11428,0.015315,0.042538,0.00404,0.00487,0.004281,0.007677,0.1999,0.328585,0.07535,0.048405,0.021498,0.027037,0.019714,0.023968
2,0.02559,0.251114,0.022937,0.026617,0.003782,0.005482,0.004016,0.009444,0.219686,0.282922,0.106672,0.021301,0.018505,0.028482,0.023056,0.027855
3,0.017657,0.029069,0.023216,0.065,0.00412,0.00503,0.004365,0.009766,0.07907,0.100114,0.190354,0.344625,0.016501,0.019668,0.027364,0.052098
4,0.026842,0.176722,0.022939,0.080773,0.004361,0.006038,0.004663,0.010947,0.165552,0.194345,0.050182,0.073032,0.023291,0.042548,0.016626,0.035307


Теперь нам нужно выделить название колонки с максимальной вероятностью

In [146]:
df_prob['predict'] = df_prob.columns[df_prob.values.argsort(1)[:, -1]]
df_prob['predict']

0       INFP
1       INFP
2       INFP
3       INTP
4       INFP
        ... 
2164    INFP
2165    ENTP
2166    INFP
2167    INFP
2168    INFJ
Name: predict, Length: 2169, dtype: object

In [147]:
models_results['approach'].append('Binary_LogReg')
models_results['f1_score_micro'].append(f1_score(y_test_multi, df_prob['predict'], average='micro'))
models_results['f1_score_macro'].append(f1_score(y_test_multi, df_prob['predict'], average='macro'))
models_results['f1_score_weighted'].append(f1_score(y_test_multi, df_prob['predict'], average='weighted'))
models_results['precision_score_micro'].append(precision_score(y_test_multi, df_prob['predict'], average='micro'))
models_results['precision_score_macro'].append(precision_score(y_test_multi, df_prob['predict'], average='macro'))
models_results['precision_score_weighted'].append(precision_score(y_test_multi, df_prob['predict'], average='weighted'))

print(models_results)

{'approach': ['Multi_LogReg', 'Multi_Grad_Boost', 'Multi_RandFor', 'Binary_LogReg', 'Binary_LogReg'], 'f1_score_micro': [0.4734900875979714, 0.4698017519594283, 0.33148916551406177, 0.4633471645919779], 'f1_score_macro': [0.1888899973765767, 0.3095925559477476, 0.09421681326234235, 0.1814749363220307], 'f1_score_weighted': [0.4172112154271234, 0.46030986850417094, 0.25297778825380945, 0.4057896608579111], 'precision_score_micro': [0.4734900875979714, 0.4698017519594283, 0.33148916551406177, 0.4633471645919779], 'precision_score_macro': [0.3257913876389222, 0.34547026387509105, 0.22372854536744724, 0.3289755182357287], 'precision_score_weighted': [0.4715278973911036, 0.4699867494759581, 0.43603957808592186, 0.4731197404661596]}


  _warn_prf(average, modifier, msg_start, len(result))


#### Теперь попробуем с GradientBoostingClassifier

In [148]:
X_train, X_test, y_train, y_test = train_test_split(comments_binary, comments_binary[list_types], random_state=0)

In [149]:
#Теперь также в цикле обучаем GradientBoostingClassifier
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]

df_prob = pd.DataFrame()

for i in list_types:
    
    pipeline = Pipeline([('title_selector', FeatureSelector(column='posts')), 
                         ('title_tfidf', TfidfVectorizer()), 
                         ('clf', GradientBoostingClassifier())])
    pipeline.fit(X_train, y_train[i])
    df_prob[i] = pipeline.predict_proba(X_test)[:,1]

In [150]:
y_test_multi = y_test.apply(decode, axis=1)
df_prob.head()

Unnamed: 0,ENFJ,ENFP,ENTJ,ENTP,ESFJ,ESFP,ESTJ,ESTP,INFJ,INFP,INTJ,INTP,ISFJ,ISFP,ISTJ,ISTP
0,0.00372,0.048378,0.007618,0.028297,4e-06,1.1e-05,7.294238e-07,0.00026,0.097797,0.218575,0.055623,0.273383,0.002577,0.009175,0.005609,0.015284
1,0.00372,0.033593,0.007618,0.028297,4e-06,1.1e-05,7.294238e-07,0.00026,0.425311,0.103165,0.044921,0.038749,0.002577,0.009175,0.005609,0.015284
2,0.00372,0.676895,0.007618,0.028297,4e-06,1.1e-05,7.294238e-07,0.00026,0.121794,0.149638,0.332135,0.050254,0.002577,0.009175,0.005609,0.015284
3,0.00372,0.032095,0.007618,0.049814,4e-06,1.1e-05,7.294238e-07,0.00026,0.066874,0.086341,0.265972,0.556502,0.002577,0.009175,0.005609,0.07148
4,0.00372,0.301662,0.007618,0.059911,4e-06,1.1e-05,7.294238e-07,0.00026,0.097322,0.07195,0.043868,0.061104,0.002577,0.009175,0.005609,0.015284


In [151]:
df_prob['predict'] = df_prob.columns[df_prob.values.argsort(1)[:, -1]]
df_prob['predict']

0       INTP
1       INFJ
2       ENFP
3       INTP
4       ENFP
        ... 
2164    ESTJ
2165    ENTP
2166    INFP
2167    INFP
2168    INFJ
Name: predict, Length: 2169, dtype: object

In [152]:
models_results['approach'].append('Binary_GradBoost')
models_results['f1_score_micro'].append(f1_score(y_test_multi, df_prob['predict'], average='micro'))
models_results['f1_score_macro'].append(f1_score(y_test_multi, df_prob['predict'], average='macro'))
models_results['f1_score_weighted'].append(f1_score(y_test_multi, df_prob['predict'], average='weighted'))
models_results['precision_score_micro'].append(precision_score(y_test_multi, df_prob['predict'], average='micro'))
models_results['precision_score_macro'].append(precision_score(y_test_multi, df_prob['predict'], average='macro'))
models_results['precision_score_weighted'].append(precision_score(y_test_multi, df_prob['predict'], average='weighted'))

In [154]:
models_results

{'approach': ['Multi_LogReg',
  'Multi_Grad_Boost',
  'Multi_RandFor',
  'Binary_LogReg',
  'Binary_LogReg',
  'Binary_GradBoost'],
 'f1_score_micro': [0.4734900875979714,
  0.4698017519594283,
  0.33148916551406177,
  0.4633471645919779,
  0.45550945136007376],
 'f1_score_macro': [0.1888899973765767,
  0.3095925559477476,
  0.09421681326234235,
  0.1814749363220307,
  0.2875672552887848],
 'f1_score_weighted': [0.4172112154271234,
  0.46030986850417094,
  0.25297778825380945,
  0.4057896608579111,
  0.4448779214022036],
 'precision_score_micro': [0.4734900875979714,
  0.4698017519594283,
  0.33148916551406177,
  0.4633471645919779,
  0.45550945136007376],
 'precision_score_macro': [0.3257913876389222,
  0.34547026387509105,
  0.22372854536744724,
  0.3289755182357287,
  0.3158317851912673],
 'precision_score_weighted': [0.4715278973911036,
  0.4699867494759581,
  0.43603957808592186,
  0.4731197404661596,
  0.4561331838159128]}

In [156]:
pd.DataFrame(data=models_results)

Unnamed: 0,approach,f1_score_micro,f1_score_macro,f1_score_weighted,precision_score_micro,precision_score_macro,precision_score_weighted
0,Multi_LogReg,0.47349,0.18889,0.417211,0.47349,0.325791,0.471528
1,Multi_Grad_Boost,0.469802,0.309593,0.46031,0.469802,0.34547,0.469987
2,Multi_RandFor,0.331489,0.094217,0.252978,0.331489,0.223729,0.43604
3,Binary_LogReg,0.463347,0.181475,0.40579,0.463347,0.328976,0.47312
4,Binary_GradBoost,0.455509,0.287567,0.444878,0.455509,0.315832,0.456133
