In [7]:
import sklearn
import io
import os
from getpass import getuser
import re
import math
import xml.etree.ElementTree as ET
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
import pymorphy2
import nltk
import string
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
from sklearn import preprocessing
from keras.utils import to_categorical
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
TEST_FILENAME = os.path.join('/home',getuser(), 'Tonal_Analysis', 'news_sentiment_romip2012-1/test/news_eval_test.xml')
TRAIN_FILENAME = os.path.join('/home',getuser(), 'Tonal_Analysis', 'news_sentiment_romip2012-1/train/news_eval_train.xml')

In [11]:
class Citation:
    def __init__(self, words, evaluation: str):
        self.words = words
        self.eval = evaluation

    def __str__(self):
        return f'{self.eval}: {self.words}'

    def __repr__(self):
        return self.__str__()

In [18]:
def tokenize(file_text, remove_words=False):
    tokens = nltk.word_tokenize(file_text)
    tokens = [i for i in tokens if ( i not in string.punctuation )]
    stop_words = stopwords.words('russian')
    if remove_words:
        words_to_remove = [ 'все', 'нет', 'ни',  'ничего', 'без', 'никогда', 'наконец', 'больше', 'хорошо', 'лучше','нельзя', 'более', 'всегда', 'конечно', 'всю', 'такой', 'впрочем', 'так', 'вот', 'можно', 'даже', 'разве']
        for word in words_to_remove:
            stop_words.remove(word)
    tokens = [morph.parse(re.sub(r'[^\w\s]', '', i).lower())[0].normal_form for i in tokens if ( i not in stop_words )]
    tokens = [i.replace("«", "").replace("»", "") for i in tokens]
    for item in tokens:
        if '' == item or item.isspace():
            while item in tokens:
                tokens.remove(item)
    return tokens



def parse_xml(file: str) -> list:
    tree = ET.parse(file)
    root = tree.getroot()
    corpus = []
    citations = []
    for elem in root.iter('speech'):
        corpus.append(tokenize(elem.text))
    i = 0
    for elem in root.iter('evaluation'):
        pair_eval = elem.text.replace("\n", "")
        pair_eval = ''.join(pair_eval.split())
        # print (corpus[i])
        if pair_eval in ['0', '+', '-']:
            citation = Citation(corpus[i], pair_eval)
            citations.append(citation)

        i += 1
        # evaluate_id.append(elem.text)
    return citations

In [14]:
morph = pymorphy2.MorphAnalyzer()

In [19]:
%%time
citations_train = parse_xml(TRAIN_FILENAME)
citations_test = parse_xml(TEST_FILENAME)

CPU times: user 56.1 s, sys: 284 ms, total: 56.4 s
Wall time: 56.4 s


In [50]:
def vectorized_corpus(parsed_list_train, parsed_list_test):
    out_list_train = []
    out_list_test = []
    eval_list = []
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    
    for cite in parsed_list_train:
        citation = cite.words
        evaluation = cite.eval
        citation = ' '.join(citation).strip()
        out_list_train.append(citation)
        y_train.append(evaluation)
        
    for cite in parsed_list_test:
        citation = cite.words
        evaluation = cite.eval
        citation = ' '.join(citation).strip()
        out_list_test.append(citation)
        y_test.append(evaluation)
    
    #vectorizer = CountVectorizer()#булевский
    #vectorizer = HashingVectorizer(n_features=2**17)
    vectorizer = TfidfVectorizer()#tf-idf
    train_data = vectorizer.fit_transform(out_list_train)
    X_train = train_data.toarray()
#     print(out_list_test)
    test_data = vectorizer.transform(out_list_test)
    X_test = test_data.toarray()
    return X_train, y_train, X_test, y_test, vectorizer

X_train, y_train, X_test, y_test, vectorizer = vectorized_corpus(citations_train,citations_test)

In [21]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=1.0, class_weight='balanced', dual=False, fit_intercept=True, tol=0.0001,
         intercept_scaling=1, max_iter=100, penalty='l2', random_state=0, solver='saga', multi_class='ovr',  warm_start=True)

In [22]:
model.fit(X_train, y_train)
print(model)

LogisticRegression(class_weight='balanced', multi_class='ovr', random_state=0,
                   solver='saga', warm_start=True)


In [23]:
# make predictions
expected = y_test
predicted = model.predict(X_test)

In [24]:
predicted

array(['+', '0', '0', ..., '+', '0', '-'], dtype='<U1')

In [25]:
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(model.score(X_test,y_test))


              precision    recall  f1-score   support

           +       0.56      0.72      0.63      1448
           -       0.71      0.66      0.69      1890
           0       0.52      0.40      0.45      1235

    accuracy                           0.61      4573
   macro avg       0.60      0.59      0.59      4573
weighted avg       0.61      0.61      0.60      4573

0.6090094030177127


In [100]:
example = "бог"

In [102]:
model.predict(vectorizer.transform([' '.join(tokenize(example))]).toarray())

array(['+'], dtype='<U1')