На этом шаге создадим pipeline с моделью и выгрузим его

In [1]:
import pandas as pd

import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools
from sklearn.metrics import f1_score, roc_auc_score, precision_score
import dill

comments = pd.read_csv("mbti_1.csv")
print(comments.shape)
comments['posts'][0]

(8675, 2)


"'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389  84390  http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg  http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.jpg ...

In [3]:
comments.head(3)

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...


In [4]:
import re
import numpy as np
from nltk.corpus import stopwords
from razdel import tokenize 
import pymorphy2  

In [5]:
morph = pymorphy2.MorphAnalyzer()

stopword_eng = []

with open('final_stop_words.txt') as f:
    stopword_eng += [w.strip() for w in f.readlines() if w]
    
len(stopword_eng)

2091

In [6]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))
    text = re.sub("\|", ' ', str(text))
    text = re.sub("https?:\/\/\S+", '', str(text))
    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_eng] # [6]
    
    return words_lem_without_stopwords

In [7]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class TextPreparater(BaseEstimator, TransformerMixin):
    def __init__(self, key, value):
        self.key = key
        self.value = value
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X[self.key] = X[self.key].apply(lambda x: clean_text(x), 1)
        X[self.key] = X[self.key].apply(lambda x: lemmatization(x), 1)
        X[self.key] = X[self.key].apply(lambda x: ' '.join(x), 1)
        return X

In [8]:
feature = 'posts'
target = 'type'

In [9]:
feats = Pipeline([
                ('preparator', TextPreparater('posts', '')),
                ('selector', ColumnSelector(key='posts')),
                ('tfidf', TfidfVectorizer())
            ])

In [10]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(comments, comments['type'], random_state=0)

#### Многоклассовая классификация с с GradientBoostingClassifier

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

In [12]:
#соберем наш финальный pipeline

pipeline = Pipeline([('feature', feats), 
                     ('clf', GradientBoostingClassifier())])

In [13]:
#обучим наш пайплайн
pipeline.fit(X_train, y_train)

  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Pipeline(memory=None,
         steps=[('feature',
                 Pipeline(memory=None,
                          steps=[('preparator',
                                  TextPreparater(key='posts', value='')),
                                 ('selector', ColumnSelector(key='posts')),
                                 ('tfidf',
                                  TfidfVectorizer(analyzer='word', binary=False,
                                                  decode_error='strict',
                                                  dtype=<class 'numpy.float64'>,
                                                  encoding='utf-8',
                                                  input='content',
                                                  lowercase=True, max_df=1.0,
                                                  max_features=None, min_df=1...
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_

In [14]:
#наши прогнозы для тестовой выборки
preds = pipeline.predict(X_test)
preds[:10]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


array(['INTP', 'INFJ', 'ENFP', 'INTJ', 'ENFP', 'INFP', 'INFP', 'ISFJ',
       'INTJ', 'INFJ'], dtype=object)

In [15]:
print(f1_score(y_test, preds, average='micro'))
print(f1_score(y_test, preds, average='macro'))
print(f1_score(y_test, preds, average='weighted'))
print(precision_score(y_test, preds, average='micro'))
print(precision_score(y_test, preds, average='macro'))
print(precision_score(y_test, preds, average='weighted'))

0.46473029045643155
0.3084648008363499
0.45398283430654857
0.46473029045643155
0.34964422308508275
0.4628628098853054


Сохраним модель (пайплайн)

In [16]:
with open("mbti_GradBoost_pipeline.dill", "wb") as f:
    dill.dump(pipeline, f)