In [102]:
import json
import nltk
import os
import pymorphy2
import nltk
import re
import numpy as np
import gensim

import gensim.downloader
from gensim.models import Word2Vec

from nltk.corpus import stopwords

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import NuSVC

In [114]:
ruscorpora_model = gensim.downloader.load('word2vec-ruscorpora-300')
ruscorpora_model.save('word2vec-ruscorpora-300.emb')

In [107]:
type(ruscorpora_model)

gensim.models.keyedvectors.KeyedVectors

In [33]:
nltk.download(
    'stopwords',
    download_dir=os.getcwd(),
)
nltk.data.path.append(os.getcwd())

stop_words = set(stopwords.words('russian'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jupiter/workbench/python/search...
[nltk_data]   Package stopwords is already up-to-date!


In [93]:
regex = '[^A-Za-z\sаАбБвВгГдДеЕёЁжЖзЗиИйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩъЪыЫьЬэЭюЮяЯ]'
regexRussian = '[^\sаАбБвВгГдДеЕёЁжЖзЗиИйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩъЪыЫьЬэЭюЮяЯ]'
morph = pymorphy2.MorphAnalyzer()

def process_fn(text: str):
    text = re.sub(regex, "", text).lower()
    new_text = []
    for word in text.split(' '):
        if word not in stop_words:
            word = morph.parse(word)[0].normal_form
            new_text.append(word)
    text = ' '.join(new_text)
    return text

def process_fn_gensim(text: str):
    text = ' '.join(re.sub(regexRussian, "", text).lower().split())
    new_text = []
    for word in text.split(' '):
        if word not in stop_words:
            tag = morph.parse(word)[0]
            word = tag.normal_form
            pos = tag.tag.POS
            if pos is None:
                continue
            word = '_'.join([word, pos])
            new_text.append(word)
    text = ' '.join(new_text)
    return text

In [78]:
class TextData:
    def __init__(self, filepath, process_fn = process_fn):
        with open(filepath) as file:
            data = json.loads(file.read())
        self.__labels = np.array([])
        self.__raw_text = []
        for d in data['data']:
            self.__labels = np.append(self.__labels, d['evaluation'])
            self.__raw_text.append(d['speech'])
        
    @property
    def labels(self):
        return self.__labels
    
    @property
    def text(self):
        return self.__raw_text
    
    @property
    def text_and_labels(self):
        return list(zip(self.text, self.labels))

In [115]:
class EmbVectorizer:
    def __init__(self, text: list, preprocessor = lambda x : x):
        self.ruscorpora_model = Word2Vec.load('word2vec-ruscorpora-300.emb')
        for i, s in enumerate(text):
            text[i] = preprocessor(s)
        self.__text = text
        self.ruscorpora_model.train(self.__text)
        
    @property
    def text(self):
        return self.__text

In [116]:
a = EmbVectorizer(train_data.text, preprocessor=process_fn_gensim)

AttributeError: Model of type <class 'gensim.models.keyedvectors.KeyedVectors'> can't be loaded by <class 'gensim.models.word2vec.Word2Vec'>

In [79]:
train_data = TextData('data/new_eval_train.json')
test_data = TextData('data/new_eval_test.json')

In [122]:
a = CountVectorizer().fit(train_data.text)
a.

AttributeError: 'CountVectorizer' object has no attribute 'shape'

In [13]:
steps = [
    ('vect', CountVectorizer(preprocessor=process_fn)),
    ('tfidf', TfidfTransformer()),
]

In [18]:
# Stochastic Gradient Descent
steps_sgd = steps.copy()
steps_sgd.append(('clf', SGDClassifier()))
pipeline_sgd = Pipeline(steps_sgd)

parameters = {
    # params for vect
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),  # unigrams or bigrams
    # params for tfidf
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    # params for clf
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    'clf__max_iter': (10, 50, 80),
}

clf_sgd = GridSearchCV(
    estimator=pipeline_sgd, 
    param_grid=parameters,
    n_jobs=-1, verbose=1,
)
clf_sgd.fit(train_data.text, train_data.labels)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


KeyboardInterrupt: 

In [22]:
clf_sgd.score(train_data.text, train_data.labels)

0.998356807511737

In [23]:
clf_sgd.score(test_data.text, test_data.labels)

0.532

((-1, 1864), (0, 1281), (1, 1115))

In [56]:
# Multinomial Naive Bayes
steps_mnb = steps.copy()
steps_mnb.append(('clf', MultinomialNB()))
pipeline_mnb = Pipeline(steps_mnb)

train_elem, train_count = np.unique(train_data.labels.astype('int64'), return_counts=True)
all_train_count = float(sum(train_count))
class_prior = tuple(zip(train_elem, map(lambda x: float(x) / all_train_count, train_count)))

parameters_mnb = {
    # params for vect
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),  # unigrams or bigrams
    # params for tfidf
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    # params for clf
    'clf__alpha': np.linspace(0.0, 1.0, 11),
    'clf__fit_prior': (True, False),
    # 'clf__class_prior': class_prior,
}

clf_mnb = GridSearchCV(
    estimator=pipeline_mnb,
    param_grid=parameters_mnb,
    n_jobs=-1, verbose=1,
)
clf_mnb.fit(train_data.text, train_data.labels)

Fitting 5 folds for each of 792 candidates, totalling 3960 fits


GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(preprocessor=<function process_fn at 0x131e10f70>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'clf__alpha': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
                         'clf__fit_prior': (True, False),
                         'tfidf__norm': ('l1', 'l2'),
                         'tfidf__use_idf': (True, False),
                         'vect__max_df': (0.5, 0.75, 1.0),
                         'vect__ngram_range': ((1, 1), (1, 2), (1, 3))},
             verbose=1)

In [57]:
clf_mnb.score(train_data.text, train_data.labels)

0.9943661971830986

In [58]:
clf_mnb.score(test_data.text, test_data.labels)

0.542

In [62]:
clf_mnb.best_params_

{'clf__alpha': 0.1,
 'clf__fit_prior': False,
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True,
 'vect__max_df': 0.5,
 'vect__ngram_range': (1, 2)}

In [69]:
# SVM 
steps_svm = steps.copy()
steps_svm.append(('clf', NuSVC()))
pipeline_svm = Pipeline(steps_svm)

parameters_svm = {
    # params for vect
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),  # unigrams or bigrams
    # params for tfidf
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    # params for clf
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid', 'precomputed'),
    'clf__degree': (3,4,5),
    'clf__gamma': ('scale',),
    'clf__probability': (True, False),
    'clf__cache_size': (2000,),
    'clf__class_weight': ('balanced', None),
    'clf__verbose': (True,),
    'clf__decision_function_shape': ('ovo', 'ovr')
}

clf_svm = GridSearchCV(
    estimator=pipeline_svm,
    param_grid=parameters_svm,
    n_jobs=-1, verbose=1,
)
clf_svm.fit(train_data.text, train_data.labels)

Fitting 5 folds for each of 4320 candidates, totalling 21600 fits


KeyboardInterrupt: 

In [70]:
clf_svm.score(train_data.text, train_data.labels)

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
clf_svm.score(test_data.text, test_data.labels)