In [25]:
import json
import nltk
import os
import pymorphy2
import nltk
import re
import numpy as np
import gensim
import spacy

import gensim.downloader
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from nltk.corpus import stopwords

from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

from sklearn.linear_model import SGDClassifier

In [2]:
# Fetch the NLTK 'stopwords' corpus into the current working directory and add
# that directory to NLTK's search path so the corpus can be found afterwards.
nltk.download('stopwords', download_dir=os.getcwd())
nltk.data.path.append(os.getcwd())

# Russian stop words from NLTK, extended with the extra word 'свой'.
stop_words = {*stopwords.words('russian'), 'свой'}

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jupiter/workbench/python/2sem/hw_10_info_ret...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# spaCy pipeline 'ru_core_news_lg'; the tokenizer helpers in this notebook call it as `nlp(text)`.
nlp = spacy.load('ru_core_news_lg')

In [4]:
# Negated character classes used to strip or detect unwanted characters.
# Raw string literals are used because '\s' inside a plain literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+); the
# resulting pattern text is unchanged.

# Matches any single character that is NOT a Latin letter, whitespace or a Russian letter.
regex = r'[^A-Za-z\sаАбБвВгГдДеЕёЁжЖзЗиИйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩъЪыЫьЬэЭюЮяЯ]'
# Matches any single character that is NOT whitespace or a Russian letter.
regexRussian = r'[^\sаАбБвВгГдДеЕёЁжЖзЗиИйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩъЪыЫьЬэЭюЮяЯ]'
rgRu = re.compile(regexRussian)
rg = re.compile(regex)
# Shared pymorphy2 analyzer; helpers in this notebook use `morph.parse(word)[0]` (first parse) for lemma and POS.
morph = pymorphy2.MorphAnalyzer()

In [5]:
# Renames applied to pymorphy2 POS tags before the tag is appended to a lemma
# as '<lemma>_<POS>' (see tokenize_gensim_pretrained). Tags not listed here are
# used unchanged.
pos_convert_dict = dict(
    ADVB='ADV',
    ADJF='ADJ',
    NPRO='NOUN',
)

In [6]:
def tokenization_condition(token):
    """Decide whether a spaCy token is kept by the tokenizer helpers.

    A token is kept only when every check below holds:
      * ``token.ent_type == 0`` (presumably "not part of a named entity" — confirm
        against the spaCy version in use),
      * its text contains no character matched by ``rgRu`` (i.e. nothing other
        than whitespace and Russian letters),
      * ``token.is_alpha`` is true,
      * it looks like neither a number nor an e-mail address.

    Args:
        token: object exposing ``ent_type``, ``text``, ``is_alpha``, ``like_num``
            and ``like_email`` (a spaCy token in this notebook).

    Returns:
        bool: True when the token should be kept.
    """
    return all([
        token.ent_type == 0,
        # `search`, not `match`: `rgRu` is a single-character negated class, so
        # `match` (anchored at the start) would only test the FIRST character and
        # let mixed-script tokens such as 'приветworld' through.
        rgRu.search(token.text) is None,
        token.is_alpha,
        not token.like_num,
        not token.like_email,
    ])

def tokenize_spacy(text: str):
    """Tokenize ``text`` with spaCy and return a list of lemmas.

    Tokens rejected by ``tokenization_condition`` are skipped; every kept token
    is replaced by the ``normal_form`` of its first pymorphy2 parse.
    """
    return [
        morph.parse(tok.text)[0].normal_form
        for tok in nlp(text)
        if tokenization_condition(tok)
    ]

def tokenize_gensim(text: str):
    """Return the lemmas of the tokens of ``text`` that pass ``tokenization_condition``.

    ``text`` is run through the spaCy pipeline; each accepted token is mapped to
    the ``normal_form`` of its first pymorphy2 parse.
    """
    accepted = filter(tokenization_condition, nlp(text))
    return [morph.parse(tok.text)[0].normal_form for tok in accepted]

def tokenize_gensim_pretrained(text: str):
    """Tokenize ``text`` into '<lemma>_<POS>' strings.

    For every token accepted by ``tokenization_condition`` the first pymorphy2
    parse supplies the lemma and the POS tag. The tag is renamed through
    ``pos_convert_dict`` when it has an entry there. When pymorphy2 reports no
    POS (``None``) the bare lemma is emitted without a suffix.
    """
    result = []
    for tok in filter(tokenization_condition, nlp(text)):
        parsed = morph.parse(tok.text)[0]
        lemma = parsed.normal_form
        pos = parsed.tag.POS
        # Fall back to the original tag when no rename is configured.
        pos = pos_convert_dict.get(pos, pos)
        result.append(lemma if pos is None else '_'.join((lemma, pos)))
    return result

def process_fn(text: str):
    """Clean, lower-case and lemmatize ``text``.

    Characters matched by ``regex`` (anything other than Latin letters,
    whitespace and Russian letters) are removed and the text is lower-cased.
    Words present in ``stop_words`` are dropped; each remaining word is replaced
    by the ``normal_form`` of its first pymorphy2 parse.

    Args:
        text: raw input string.

    Returns:
        str: the lemmas joined by single spaces ('' when nothing is left).
    """
    # The spaCy pipeline is not needed here: nothing in this function reads a
    # parsed document, so no `nlp(text)` call is made.
    cleaned = re.sub(regex, "", text).lower()
    # split() without an argument splits on any whitespace run, so repeated
    # spaces do not produce empty "words" and newlines/tabs (which `regex`
    # keeps) do not end up inside a word handed to the analyzer.
    return ' '.join(
        morph.parse(word)[0].normal_form
        for word in cleaned.split()
        if word not in stop_words
    )

def process_fn_gensim(text: str):
    """Turn ``text`` into a space-separated string of '<lemma>_<POS>' items.

    Everything except whitespace and Russian letters is removed, the text is
    lower-cased and whitespace runs are collapsed. Words in ``stop_words`` and
    words for which pymorphy2 reports no POS are skipped. The tags 'ADVB' and
    'ADJF' are renamed to 'ADV' and 'ADJ'.
    """
    normalized = ' '.join(re.sub(regexRussian, "", text).lower().split())
    renamed_tags = {'ADVB': 'ADV', 'ADJF': 'ADJ'}
    tagged = []
    for word in normalized.split(' '):
        if word in stop_words:
            continue
        parsed = morph.parse(word)[0]
        pos = parsed.tag.POS
        if pos is None:
            continue
        pos = renamed_tags.get(pos, pos)
        tagged.append('_'.join([parsed.normal_form, pos]))
    return ' '.join(tagged)

In [7]:
class TextData:
    """Texts and labels loaded from a JSON file.

    The file must contain an object with a ``'data'`` list whose items carry a
    ``'speech'`` entry (the text) and an ``'evaluation'`` entry (the label).
    """

    def __init__(self, filepath, process_fn = process_fn):
        """Read ``filepath`` and store its texts and labels.

        Args:
            filepath: path to the JSON file.
            process_fn: kept for backward compatibility; this class does not
                use it.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
            KeyError: if an expected key is missing.
        """
        # Explicit UTF-8: the platform default encoding is not guaranteed to
        # decode the JSON text correctly.
        with open(filepath, encoding='utf-8') as file:
            records = json.load(file)['data']
        # A single np.append onto an empty array yields the same dtype as
        # appending record by record, without re-copying the array each time.
        self.__labels = np.append(np.array([]), [r['evaluation'] for r in records])
        self.__raw_text = [r['speech'] for r in records]

    @property
    def labels(self):
        """NumPy array of labels, in file order."""
        return self.__labels

    @property
    def text(self):
        """List of raw texts, in file order."""
        return self.__raw_text

    @property
    def text_and_labels(self):
        """List of ``(text, label)`` pairs, in file order."""
        return list(zip(self.text, self.labels))

In [8]:
class EmbVectorizer:
    """Document vectorizer with a fit/transform interface backed by gensim Doc2Vec.

    ``fit`` trains a Doc2Vec model on the processed documents; ``transform``
    infers one vector per document with that model.
    """

    def __init__(self, preprocessor = None, tokenizer = None, stop_words = None):
        """
        Args:
            preprocessor: callable applied to each raw document first
                (identity when None).
            tokenizer: callable turning a filtered document string into tokens
                (``str.split`` behaviour when None).
            stop_words: container of words removed before tokenization
                (empty when None).
        """
        self.__preprocessor = (lambda x : x) if preprocessor is None else preprocessor
        self.__tokenizer = (lambda x : x.split()) if tokenizer is None else tokenizer
        self.__stop_words = {} if stop_words is None else stop_words

    def __process(self, raw_documents, y = None):
        """Preprocess, drop stop words (on whitespace-split words) and tokenize."""
        prepared = [self.__preprocessor(doc) for doc in raw_documents]
        filtered = [
            ' '.join(word for word in doc.split() if word not in self.__stop_words)
            for doc in prepared
        ]
        return [self.__tokenizer(doc) for doc in filtered]

    def fit(self, raw_documents, y = None):
        """Train the Doc2Vec model on ``raw_documents``; ``y`` is ignored. Returns self."""
        self.__fit(raw_documents)
        return self

    def transform(self, raw_documents):
        """Return an array holding one inferred vector per document."""
        return self.__transform(raw_documents)

    def __fit(self, raw_documents):
        # Each document is tagged with its position in the input.
        tagged = [
            TaggedDocument(words, [idx])
            for idx, words in enumerate(self.__process(raw_documents))
        ]
        self.__model = Doc2Vec(tagged, vector_size=300, window=5, min_count=0, workers=4)

    def __transform(self, tokens: list):
        # `tokens` holds raw documents here; they are processed before inference.
        return np.array([
            self.__model.infer_vector(words)
            for words in self.__process(tokens)
        ])

In [9]:
class PretrainedVectorizer:
    """Document vectorizer that averages pretrained word vectors.

    The 'word2vec-ruscorpora-300' vectors are loaded once, when the class body
    is executed, and shared by all instances through ``ruscorpora_model``.
    """
    ruscorpora_model = gensim.downloader.load('word2vec-ruscorpora-300')

    def __init__(self, preprocessor = None, tokenizer = None, stop_words = None):
        """
        Args:
            preprocessor: callable applied to each raw document first
                (identity when None).
            tokenizer: callable turning a filtered document string into tokens
                (``str.split`` behaviour when None).
            stop_words: container of words removed before tokenization
                (empty when None).
        """
        self.__preprocessor = (lambda x : x) if preprocessor is None else preprocessor
        self.__tokenizer = (lambda x : x.split()) if tokenizer is None else tokenizer
        self.__stop_words = {} if stop_words is None else stop_words

    def fit(self, raw_documents, y = None):
        """Nothing is trained; returns self."""
        return self

    def transform(self, raw_documents):
        """Return an array with one averaged word vector per document."""
        return self.__transform(self.__process(raw_documents))

    def __process(self, raw_documents, y = None):
        """Preprocess, drop stop words (on whitespace-split words) and tokenize."""
        prepared = [self.__preprocessor(doc) for doc in raw_documents]
        filtered = [
            ' '.join(word for word in doc.split() if word not in self.__stop_words)
            for doc in prepared
        ]
        return [self.__tokenizer(doc) for doc in filtered]

    def __transform(self, tokens: list):
        model = PretrainedVectorizer.ruscorpora_model
        rows = []
        for doc_tokens in tokens:
            # Only tokens present in the pretrained vocabulary contribute.
            known = [model.get_vector(tok) for tok in doc_tokens if model.has_index_for(tok)]
            if known:
                rows.append(list(np.sum(known, axis=0) / len(known)))
            else:
                # No known token: the document becomes a 300-dimensional zero vector.
                rows.append(list(np.zeros((300))))
        return np.array(rows)

In [10]:
class Model:
    """Thin wrapper around a ``GridSearchCV`` built over a pipeline."""

    def __init__(self, pipeline = None, params = None):
        """
        Args:
            pipeline: estimator handed to ``GridSearchCV`` as ``estimator``.
            params: parameter grid handed to ``GridSearchCV`` as ``param_grid``.
                ``None`` (the default) means an empty grid ``{}``; a fresh dict
                is created per instance instead of sharing one mutable default.
        """
        self.__clf = GridSearchCV(
            estimator=pipeline,
            param_grid={} if params is None else params,
            n_jobs=-1, verbose=1,
        )

    def fit(self, raw_documents, y = None):
        """Run the grid search on ``raw_documents`` / ``y`` and return self."""
        self.__clf.fit(raw_documents, y)
        return self

    def predict(self, raw_documents):
        """Return the wrapped search object's predictions for ``raw_documents``.

        Public accessor so callers do not have to reach into the name-mangled
        ``_Model__clf`` attribute.
        """
        return self.__clf.predict(raw_documents)

    def precision_recall_f1_support(self, raw_documents, y = None, average = None):
        """Return ``precision_recall_fscore_support(y, predictions, average=average)``.

        Args:
            raw_documents: documents to predict on.
            y: true labels.
            average: forwarded unchanged to ``precision_recall_fscore_support``.
        """
        return precision_recall_fscore_support(
            y,
            self.predict(raw_documents),
            average=average,
        )

In [11]:
# Load the train and test splits (train first, then test).
train_data, test_data = map(
    TextData,
    ('data/new_eval_train.json', 'data/new_eval_test.json'),
)

In [12]:
# Feature-extraction step lists, one per text representation. The experiment
# cells append a ('clf', ...) step to a copy of one of these lists.

# Binary bag of words. NOTE: no stop_words argument is passed; an earlier
# variant with `stop_words=list(stop_words)` was left disabled.
steps_binary = [('vect', CountVectorizer(tokenizer=tokenize_spacy, binary=True))]

# Raw term counts (likewise without a stop_words argument).
steps_count = [('vect', CountVectorizer(tokenizer=tokenize_spacy))]

# Term counts followed by TF-IDF re-weighting.
steps_tfidf = [
    ('vect', CountVectorizer(tokenizer=tokenize_spacy)),
    ('tfidf', TfidfTransformer()),
]

# Doc2Vec document vectors trained on the input documents.
steps_emb = [
    ('emb', EmbVectorizer(tokenizer=tokenize_gensim, stop_words=stop_words)),
]

# Averaged pretrained word vectors over '<lemma>_<POS>' tokens.
steps_pretrained = [
    ('emb', PretrainedVectorizer(tokenizer=tokenize_gensim_pretrained, stop_words=stop_words)),
]

# Parameter grids for the 'clf' pipeline step (keys use the '<step>__<param>' form).

# Grid used by the experiment cells: only max_iter varies.
best_parameters_sgd = dict(
    clf__alpha=(0.0001,),
    clf__average=(True,),
    clf__eta0=(0.01,),
    clf__learning_rate=('optimal',),
    clf__max_iter=(200, 250, 275, 1000),
    clf__penalty=('l2',),
)

# Large search grid.
# NOTE(review): newer scikit-learn releases renamed the 'log' loss to
# 'log_loss' — confirm against the installed version before using this grid.
optimization_parameters_sgd_lg = dict(
    clf__loss=('hinge', 'modified_huber', 'squared_hinge', 'log'),
    clf__learning_rate=('optimal', 'invscaling', 'adaptive'),
    clf__alpha=(0.0001, 0.00001),
    clf__epsilon=(0.1, 0.01, 0.001),
    clf__penalty=('l2', 'l1', 'elasticnet'),
    clf__max_iter=(275, 1000, 1500, 2000),
    clf__average=(True, False),
    clf__eta0=(0.01, 0.001, 0.0001),
)

# Smaller search grid restricted to the 'squared_hinge' loss.
optimization_parameters_sgd_sm = dict(
    clf__loss=('squared_hinge',),
    clf__learning_rate=('optimal', 'adaptive'),
    clf__alpha=(0.0001, 0.00001),
    clf__epsilon=(0.1, 0.01),
    clf__penalty=('l2', 'l1', 'elasticnet'),
    clf__max_iter=(250, 275, 1000),
    clf__average=(True, False),
    clf__eta0=(0.01, 0.001, 0.0001),
)

In [14]:
# Stochastic Gradient Descent (pretrained emb)
# Pretrained-vector features followed by an SGD classifier, tuned by grid search.
steps_sgd = [*steps_pretrained, ('clf', SGDClassifier())]

sgd_pretrained_emb = Model(pipeline=Pipeline(steps_sgd), params=best_parameters_sgd)
sgd_pretrained_emb.fit(train_data.text, train_data.labels)
sgd_pretrained_emb.precision_recall_f1_support(test_data.text, test_data.labels)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




(0.6501681042253663, 0.653837743275749, 0.6288505591002959, None)

In [27]:
# Per-class report on the test split for the pretrained-embedding model.
print(classification_report(
    y_true=test_data.labels,
    y_pred=sgd_pretrained_emb._Model__clf.predict(test_data.text),
))

              precision    recall  f1-score   support

        -1.0       0.67      0.86      0.75      1890
         0.0       0.64      0.29      0.40      1235
         1.0       0.64      0.69      0.66      1448

    accuracy                           0.65      4573
   macro avg       0.65      0.61      0.60      4573
weighted avg       0.65      0.65      0.63      4573



In [15]:
# Stochastic Gradient Descent (emb)
# Doc2Vec features followed by an SGD classifier, tuned by grid search.
steps_sgd = [*steps_emb, ('clf', SGDClassifier())]

sgd_emb = Model(pipeline=Pipeline(steps_sgd), params=best_parameters_sgd)
sgd_emb.fit(train_data.text, train_data.labels)
sgd_emb.precision_recall_f1_support(test_data.text, test_data.labels)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


  _warn_prf(average, modifier, msg_start, len(result))


(0.3213979301524614, 0.4198556746118522, 0.2796664314613638, None)

In [28]:
# Per-class report on the test split for the Doc2Vec model.
print(classification_report(
    y_true=test_data.labels,
    y_pred=sgd_emb._Model__clf.predict(test_data.text),
))

              precision    recall  f1-score   support

        -1.0       0.42      0.96      0.58      1890
         0.0       1.00      0.00      0.00      1235
         1.0       0.47      0.07      0.12      1448

    accuracy                           0.42      4573
   macro avg       0.63      0.34      0.24      4573
weighted avg       0.59      0.42      0.28      4573



In [16]:
# Stochastic Gradient Descent (binary)
# Binary bag-of-words features followed by an SGD classifier, tuned by grid search.
steps_sgd = [*steps_binary, ('clf', SGDClassifier())]

sgd_binary = Model(pipeline=Pipeline(steps_sgd), params=best_parameters_sgd)
sgd_binary.fit(train_data.text, train_data.labels)
sgd_binary.precision_recall_f1_support(test_data.text, test_data.labels)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




(0.5881789514427936, 0.5982943363218893, 0.5870466519275348, None)

In [29]:
# Per-class report on the test split for the binary bag-of-words model.
print(classification_report(
    y_true=test_data.labels,
    y_pred=sgd_binary._Model__clf.predict(test_data.text),
))

              precision    recall  f1-score   support

        -1.0       0.67      0.73      0.70      1890
         0.0       0.49      0.33      0.39      1235
         1.0       0.56      0.66      0.61      1448

    accuracy                           0.60      4573
   macro avg       0.58      0.57      0.57      4573
weighted avg       0.59      0.60      0.59      4573



In [17]:
# Stochastic Gradient Descent (count)
# Term-count features followed by an SGD classifier, tuned by grid search.
steps_sgd = [*steps_count, ('clf', SGDClassifier())]

sgd_count = Model(pipeline=Pipeline(steps_sgd), params=best_parameters_sgd)
sgd_count.fit(train_data.text, train_data.labels)
sgd_count.precision_recall_f1_support(test_data.text, test_data.labels)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




(0.5905770121469165, 0.6020118084408485, 0.5883272977040023, None)

In [30]:
# Per-class report on the test split for the term-count model.
print(classification_report(
    y_true=test_data.labels,
    y_pred=sgd_count._Model__clf.predict(test_data.text),
))

              precision    recall  f1-score   support

        -1.0       0.66      0.74      0.70      1890
         0.0       0.50      0.31      0.39      1235
         1.0       0.58      0.67      0.62      1448

    accuracy                           0.60      4573
   macro avg       0.58      0.57      0.57      4573
weighted avg       0.59      0.60      0.59      4573



# Training-set score of the count-based grid search. The original line used `clf_sgd`, which no visible cell defines (NameError on a fresh run) — confirm this is the intended model.
sgd_count._Model__clf.score(train_data.text, train_data.labels)

In [18]:
# Stochastic Gradient Descent (tfidf)
# TF-IDF features followed by an SGD classifier, tuned by grid search.
steps_sgd = steps_tfidf.copy()
steps_sgd.append(('clf', SGDClassifier()))
# Build the pipeline once and pass that same object to the grid search,
# instead of leaving `pipeline_sgd` unused and constructing a second copy.
pipeline_sgd = Pipeline(steps_sgd)

sgd_tfidf = Model(
    pipeline = pipeline_sgd,
    params = best_parameters_sgd,
)

sgd_tfidf.fit(train_data.text, train_data.labels)
sgd_tfidf.precision_recall_f1_support(test_data.text, test_data.labels)

Fitting 5 folds for each of 4 candidates, totalling 20 fits




(0.5896424824517709, 0.6015744587797944, 0.5885129181037644, None)

In [31]:
# Per-class report on the test split for the TF-IDF model.
print(classification_report(
    y_true=test_data.labels,
    y_pred=sgd_tfidf._Model__clf.predict(test_data.text),
))

              precision    recall  f1-score   support

        -1.0       0.66      0.74      0.70      1890
         0.0       0.50      0.32      0.39      1235
         1.0       0.57      0.66      0.61      1448

    accuracy                           0.60      4573
   macro avg       0.58      0.57      0.57      4573
weighted avg       0.59      0.60      0.59      4573

