In [24]:
import torch
import string
import gensim.downloader as api
import numpy as np
from torch import nn
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords

russian_stopwords = stopwords.words("russian")


class MyClassifier:
    """Text classifier over averaged, standardised word embeddings.

    A text is tokenised and filtered, every known token is mapped to its
    word vector, the vectors are standardised with vocabulary-wide
    statistics and averaged, and the result is fed to ``model``. The index
    of the highest score is translated to a label through ``idx2label``.
    """

    def __init__(self, model, tokenizer, word2vec, idx2label):
        """Store the collaborators and precompute normalisation statistics.

        Args:
            model: callable that maps a 1-D float tensor to class scores.
            tokenizer: object exposing ``tokenize(str) -> list[str]``.
            word2vec: embedding lookup exposing ``vectors``, ``vector_size``,
                ``__contains__`` and ``__getitem__``.
            idx2label: mapping from predicted class index to label.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.idx2label = idx2label
        self.word2vec = word2vec
        # Per-dimension statistics over the whole vocabulary.
        self.mean = np.mean(word2vec.vectors, axis=0)
        std = np.std(word2vec.vectors, axis=0)
        # A dimension that is constant across the vocabulary has zero spread;
        # dividing by it would put NaN/inf into every embedding, so such
        # dimensions are left unscaled (divisor 1).
        self.std = np.where(std == 0, np.ones_like(std), std)

    def get_tokens(self, text):
        """Lower-case and tokenise ``text``, keeping only informative tokens.

        A token is kept when it is not a Russian stop word, contains no
        ASCII punctuation character and is at least 3 characters long.
        """
        return [token for token in self.tokenizer.tokenize(text.lower()) if token not in russian_stopwords and
                all(symbol not in string.punctuation for symbol in token) and len(token) >= 3]

    def get_avg_embedding(self, tokens):
        """Return the mean standardised vector of the tokens known to word2vec.

        Tokens missing from the vocabulary are skipped. When no token is
        known, a zero vector of length ``word2vec.vector_size`` is returned.
        """
        embedding = [(self.word2vec[token] - self.mean) / self.std for token in tokens if token in self.word2vec]

        if len(embedding) == 0:
            embedding = np.zeros(self.word2vec.vector_size)
        else:
            embedding = np.mean(embedding, axis=0)
        return embedding

    def make_prediction(self, text):
        """Classify ``text`` and return the label of the highest-scoring class."""
        tokens = self.get_tokens(text)
        embedding = self.get_avg_embedding(tokens)
        # Inference only: no gradients are needed, so skip autograd tracking.
        with torch.no_grad():
            pred = self.model(torch.tensor(embedding).float())
        pred_label_idx = torch.argmax(pred).item()
        return self.idx2label[pred_label_idx]


def load_w2v():
    """Load the ``word2vec-ruscorpora-300`` vectors via the gensim downloader and return them."""
    return api.load("word2vec-ruscorpora-300")


def load_tokenizer():
    """Create and return a new NLTK ``WordPunctTokenizer`` instance."""
    tokenizer = WordPunctTokenizer()
    return tokenizer


def load_model(path, embed_size, num_classes):
    """Rebuild the classification network and load its weights for inference.

    The architecture is fixed: ``embed_size -> 128 -> 16 -> num_classes``
    with ReLU activations between the linear layers; the checkpoint at
    ``path`` must hold a state dict for exactly this layout.

    Args:
        path: location of the saved ``state_dict``.
        embed_size: size of the input embedding vector.
        num_classes: number of output classes.

    Returns:
        The ``nn.Sequential`` model on CPU, switched to evaluation mode.
    """
    model = nn.Sequential(
        nn.Linear(embed_size, 128),
        nn.ReLU(),
        nn.Linear(128, 16),
        nn.ReLU(),
        nn.Linear(16, num_classes),
    )
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # that come from a trusted source.
    state_dict = torch.load(path, map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    # The model is only used for prediction, so leave training mode.
    model.eval()
    return model



In [26]:
# Mapping from the network's output index to a human-readable label.
# NOTE(review): presumably this order matches the one used at training time — confirm.
idx2label = {0: 'bad', 1: 'good', 2: 'neutral'}
num_classes = len(idx2label)
# Load the embeddings, the tokenizer and the trained network weights from ./model.pt.
word2vec = load_w2v()
tokenizer = load_tokenizer()
model = load_model('./model.pt', word2vec.vector_size, num_classes)

classifier = MyClassifier(model, tokenizer, word2vec, idx2label)

# Sample input; `text` is reused by the SVM prediction cell further down.
text = 'я люблю этот фильм но в конце концовку слили в целом'
# Last expression of the cell: the predicted label is displayed as the cell output.
classifier.make_prediction(text)

'good'

In [31]:
# SVM variant, without the neural network
import pickle

# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load a model file that comes from a trusted source.
# The context manager closes the file even if unpickling raises.
with open('SVMModel.pckl', 'rb') as svmIrisFile:
    svmIrisModel = pickle.load(svmIrisFile)

https://blog.socratesk.com/blog/2018/01/29/expose-ML-model-as-REST-API - ссылка на туториал с flask и svm

In [30]:
#remove digits, punctuation and other symbols with regex
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
from pymystem3 import Mystem
from string import punctuation
from nltk.stem.snowball import SnowballStemmer 
# Create the Mystem lemmatizer and the Russian stop-word list.
# NOTE(review): `mystem` is only referenced from commented-out code in this cell — confirm it is still needed.
mystem = Mystem() 
russian_stopwords = stopwords.words("russian")


# Snowball stemmer for Russian, used by preprocess_text.
stemmer = SnowballStemmer("russian") 
#Preprocess function
def preprocess_text(text):
    """Normalise ``text`` into a space-separated string of stemmed tokens.

    Steps:
        1. Replace every digit and every non-word character with a space.
        2. Lower-case, split on single spaces and stem each piece with the
           Russian Snowball stemmer.
        3. Drop tokens that are stop words, a lone space, made of
           punctuation, or not longer than 3 characters.

    The stop-word check runs on the stemmed form of each token.
    NOTE(review): step order is kept exactly as before; presumably a
    previously trained model depends on this output — confirm before
    changing it.

    Args:
        text: raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = re.sub(r'\d|\W', ' ', text)
    tokens = [stemmer.stem(word) for word in text.lower().split(' ')]
    tokens = [
        token for token in tokens
        if token not in russian_stopwords
        and token != " "
        and token.strip() not in punctuation
        and len(token) > 3
    ]
    return " ".join(tokens)

In [35]:
# Classify the sample `text` with the unpickled SVM model; predict() receives a one-element list.
# NOTE(review): the raw string is passed in, so presumably the pickled object
# does its own preprocessing/vectorisation — confirm.
class_prediced = svmIrisModel.predict([text])
# Last expression of the cell: display the predicted label array.
class_prediced

array(['neutral'], dtype=object)