In [None]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from scipy import stats
from IPython.display import Image
from sklearn.datasets import load_iris, load_boston
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC, NuSVC, LinearSVC, OneClassSVM, SVR, NuSVR, LinearSVR
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style="ticks")

In [None]:
def accuracy_score_for_classes(
    y_true: np.ndarray,
    y_pred: np.ndarray) -> Dict[int, float]:
    """
    Вычисление метрики accuracy для каждого класса
    y_true - истинные значения классов
    y_pred - предсказанные значения классов
    Возвращает словарь: ключ - метка класса,
    значение - Accuracy для данного класса
    """
    # Для удобства фильтрации сформируем Pandas DataFrame
    d = {'t': y_true, 'p': y_pred}
    df = pd.DataFrame(data=d)
    # Метки классов
    classes = np.unique(y_true)
    # Результирующий словарь
    res = dict()
    # Перебор меток классов
    for c in classes:
        # отфильтруем данные, которые соответствуют
        # текущей метке класса в истинных значениях
        temp_data_flt = df[df['t']==c]
        # расчет accuracy для заданной метки класса
        temp_acc = accuracy_score(
            temp_data_flt['t'].values,
            temp_data_flt['p'].values)
        # сохранение результата в словарь
        res[c] = temp_acc
    return res

def print_accuracy_score_for_classes(
    y_true: np.ndarray,
    y_pred: np.ndarray):
    """
    Вывод метрики accuracy для каждого класса
    """
    accs = accuracy_score_for_classes(y_true, y_pred)
    if len(accs)>0:
        print('Метка \t Accuracy')
    for i in accs:
        print('{} \t {}'.format(i, accs[i]))

In [None]:
# Загрузка данных
df = pd.read_csv('D:\\Ботва\\Магистратура\\2сем\\ММО\\лаб6\\imdb_sup.csv')
text_df=df.head(500).append(df.tail(500))
text_df.drop('Rating', axis=1, inplace=True)
text_df.tail(15)

Unnamed: 0,Review,Sentiment
49985,I had great expectations surrounding this movi...,0
49986,It is playing on SHOWTIME right now but is goi...,0
49987,"I love the so-called ""blaxploitation"" films an...",0
49988,"OK, here is the deal. I love action movies and...",0
49989,"Grim instead of amusing, mean-spirited instead...",0
49990,This movie did not give Mr. Bachchan justice. ...,0
49991,Oh dear! The BBC is not about to be knocked of...,0
49992,Ridiculous thriller in which a group of studen...,0
49993,"If you like poor SE, (some) bad acting and a t...",0
49994,Some Plot Spoilers Ahead.<br /><br />The Nashv...,0


In [None]:
#Кодирование целевого признаков
text_df.loc[text_df['Sentiment'] == 1, 'Sentiment'] = 'pol'
text_df.loc[text_df['Sentiment'] == 0, 'Sentiment'] = 'otr'
text_df.sort_values(by=['Review'], inplace=True)
text_df

Unnamed: 0,Review,Sentiment
49906,"'Bloody Birthday' is an odd and, at times, hum...",otr
181,'Renaissance (2006)' was created over a period...,pol
49867,(SMALL SPOILERS) I just bought the DVD of this...,otr
49995,(spoiler) it could be the one the worst movie ...,otr
49834,**********POSSIBLE SPOILER********** Madonna p...,otr
...,...,...
483,this one is out there. Not much to say about i...,pol
283,very few chess movies have been made over the ...,pol
492,"well, i said it all in the summary, i simpley ...",pol
381,when i first heard about this movie i thought ...,pol


In [None]:
text_df.shape

(1000, 2)

In [None]:
# Сформируем общий словарь для обучения моделей из обучающей и тестовой выборки
vocab_list = text_df['Review'].tolist()
vocab_list[1:10]

["'Renaissance (2006)' was created over a period of six years, co-funded by France, Luxembourg and the United Kingdom at a cost of around \x8014 million. The final result is a staggering accomplishment of comic-book style animation, aesthetically similar to what Robert Rodriguez and Frank Miller achieved with 'Sin City (2005),' but this film employed motion capture with live-actors to translate their faces and movements into an entirely animated format. Presented in stark black-and-white, the film looks as though it has been hoisted from the very pages of the graphic novel on which it was based, and the futuristic city of Paris looms ominously above us. Directed by French filmmaker Christian Volckman, in his feature-length debut, 'Renaissance' draws significantly from other films in the science-fiction genre, and the tech-noir storyline isn't something we haven't seen before, but, from a technical standpoint, it is faultless.<br /><br />The year is 2054. The city of Paris is a crumblin

In [None]:
vocabVect = CountVectorizer()
vocabVect.fit(vocab_list)
corpusVocab = vocabVect.vocabulary_
print('Количество сформированных признаков - {}'.format(len(corpusVocab)))

Количество сформированных признаков - 17896


In [None]:
for i in list(corpusVocab)[0:10]:
    print('{}={}'.format(i, corpusVocab[i]))

bloody=1874
birthday=1772
is=8498
an=781
odd=11070
and=798
at=1158
times=16139
humorous=7838
low=9560


# Векторизация признаков на основе CountVectorizer
Подсчитывает количество слов словаря, входящих в данный текст

In [None]:
test_features = vocabVect.transform(vocab_list)

In [None]:
test_features

<1000x17896 sparse matrix of type '<class 'numpy.int64'>'
	with 137926 stored elements in Compressed Sparse Row format>

In [None]:
test_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
# Размер нулевой строки
len(test_features.todense()[0].getA1())

17896

In [None]:
# Непустые значения нулевой строки
[i for i in test_features.todense()[0].getA1() if i>0]

[1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 7,
 1,
 3,
 1,
 5,
 3,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 2,
 3,
 6,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 5,
 1,
 1,
 1,
 1,
 1,
 6,
 5,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 7,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 17,
 3,
 2,
 1,
 4,
 2,
 3,
 1,
 1,
 5,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 2,
 3,
 1,
 1,
 1,
 1,
 2,
 1]

In [None]:
def VectorizeAndClassify(vectorizers_list, classifiers_list):
    for v in vectorizers_list:
        for c in classifiers_list:
            pipeline1 = Pipeline([("vectorizer", v), ("classifier", c)])
            score = cross_val_score(pipeline1, text_df['Review'], text_df['Sentiment'], scoring='accuracy', cv=3).mean()
            print('Векторизация - {}'.format(v))
            print('Модель для классификации - {}'.format(c))
            print('Accuracy = {}'.format(score))
            print('===========================')

In [None]:
vectorizers_list = [CountVectorizer(vocabulary = corpusVocab)]
classifiers_list = [LogisticRegression(C=3.0), LinearSVC(), KNeighborsClassifier()]
VectorizeAndClassify(vectorizers_list, classifiers_list)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Векторизация - CountVectorizer(vocabulary={'00': 0, '000': 1, '0069': 2, '007': 3, '01': 4,
                            '06th': 5, '08': 6, '0f': 7, '10': 8, '100': 9,
                            '100th': 10, '101': 11, '102': 12, '10th': 13,
                            '11': 14, '112': 15, '11th': 16, '12': 17, '13': 18,
                            '13th': 19, '14': 20, '14th': 21, '15': 22,
                            '150': 23, '16': 24, '1600s': 25, '16éme': 26,
                            '17': 27, '1710': 28, '18': 29, ...})
Модель для классификации - LogisticRegression(C=3.0)
Accuracy = 0.7819855783927641
Векторизация - CountVectorizer(vocabulary={'00': 0, '000': 1, '0069': 2, '007': 3, '01': 4,
                            '06th': 5, '08': 6, '0f': 7, '10': 8, '100': 9,
                            '100th': 10, '101': 11, '102': 12, '10th': 13,
                            '11': 14, '112': 15, '11th': 16, '12': 17, '13': 18,
                            '13th': 19, '14': 20, '14th'

# Разделим на обучающую и тестовую выборки

In [None]:
X_train, X_test, y_train, y_test = train_test_split(text_df['Review'], text_df['Sentiment'], test_size=0.5, random_state=1)

In [None]:
def sentiment(v, c):
    model = Pipeline(
        [("vectorizer", v),
         ("classifier", c)])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print_accuracy_score_for_classes(y_test, y_pred)

In [None]:
sentiment(CountVectorizer(), LogisticRegression(C=3.0))

Метка 	 Accuracy
otr 	 0.7765151515151515
pol 	 0.7669491525423728


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Классификация текста на основе моделей word2vec

In [None]:
import gensim
from gensim.models import word2vec

In [None]:
import re
from typing import Dict, Tuple
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sveta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Подготовим корпус
corpus = []
stop_words = stopwords.words('english')
tok = WordPunctTokenizer()
for line in text_df['Review'].values:
    line1 = line.strip().lower()
    line1 = re.sub("[^a-zA-Z]"," ", line1)
    text_tok = tok.tokenize(line1)
    text_tok1 = [w for w in text_tok if not w in stop_words]
    corpus.append(text_tok1)

In [None]:
corpus[:10]

[['bloody',
  'birthday',
  'odd',
  'times',
  'humorous',
  'low',
  'budget',
  'horror',
  'flick',
  'along',
  'lines',
  'mikey',
  'less',
  'intelligent',
  'version',
  'good',
  'son',
  'br',
  'br',
  'set',
  'small',
  'californian',
  'town',
  'three',
  'babies',
  'born',
  'height',
  'eclipse',
  'planetary',
  'alignment',
  'means',
  'somehow',
  'born',
  'without',
  'emotions',
  'ten',
  'years',
  'later',
  'three',
  'little',
  'psychopaths',
  'take',
  'killing',
  'spree',
  'away',
  'parents',
  'siblings',
  'teachers',
  'anyone',
  'else',
  'irritates',
  'one',
  'teenage',
  'girl',
  'knows',
  'truth',
  'able',
  'stop',
  'explanation',
  'babies',
  'across',
  'world',
  'born',
  'time',
  'equally',
  'twisted',
  'go',
  'br',
  'br',
  'slasher',
  'film',
  'tame',
  'terms',
  'violence',
  'gore',
  'suppose',
  'highlights',
  'problem',
  'casting',
  'child',
  'characters',
  'killers',
  'much',
  'expose',
  'young',
  'acto

In [None]:
%time
model_imdb = word2vec.Word2Vec(corpus, workers=4, min_count=10, window=10, sample=1e-3)

Wall time: 0 ns


In [None]:
# Проверим, что модель обучилась
print(model_imdb.wv.most_similar(positive=['find'], topn=5))

[('someone', 0.9996957778930664), ('audience', 0.9996758103370667), ('far', 0.9996746182441711), ('everything', 0.9996718168258667), ('nothing', 0.9996710419654846)]


In [None]:
def sentiment(v, c):
    model = Pipeline(
        [("vectorizer", v),
         ("classifier", c)])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print_accuracy_score_for_classes(y_test, y_pred)

In [None]:
class EmbeddingVectorizer(object):
    '''
    Для текста усредним вектора входящих в него слов
    '''
    def __init__(self, model):
        self.model = model
        self.size = model.vector_size

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([np.mean(
            [self.model[w] for w in words if w in self.model]
            or [np.zeros(self.size)], axis=0)
            for words in X])

In [None]:
def accuracy_score_for_classes(
    y_true: np.ndarray,
    y_pred: np.ndarray) -> Dict[int, float]:
    """
    Вычисление метрики accuracy для каждого класса
    y_true - истинные значения классов
    y_pred - предсказанные значения классов
    Возвращает словарь: ключ - метка класса,
    значение - Accuracy для данного класса
    """
    # Для удобства фильтрации сформируем Pandas DataFrame
    d = {'t': y_true, 'p': y_pred}
    df = pd.DataFrame(data=d)
    # Метки классов
    classes = np.unique(y_true)
    # Результирующий словарь
    res = dict()
    # Перебор меток классов
    for c in classes:
        # отфильтруем данные, которые соответствуют
        # текущей метке класса в истинных значениях
        temp_data_flt = df[df['t']==c]
        # расчет accuracy для заданной метки класса
        temp_acc = accuracy_score(
            temp_data_flt['t'].values,
            temp_data_flt['p'].values)
        # сохранение результата в словарь
        res[c] = temp_acc
    return res

def print_accuracy_score_for_classes(
    y_true: np.ndarray,
    y_pred: np.ndarray):
    """
    Вывод метрики accuracy для каждого класса
    """
    accs = accuracy_score_for_classes(y_true, y_pred)
    if len(accs)>0:
        print('Метка \t Accuracy')
    for i in accs:
        print('{} \t {}'.format(i, accs[i]))

In [None]:
# Обучающая и тестовая выборки
boundary = 900
X_train = corpus[:boundary]
X_test = corpus[boundary:]
y_train = text_df['Sentiment'][:boundary]
y_test = text_df['Sentiment'][boundary:]

In [None]:
sentiment(EmbeddingVectorizer(model_imdb.wv), LogisticRegression(C=3.0))

Метка 	 Accuracy
otr 	 0.5319148936170213
pol 	 0.5094339622641509


Наибольшая точность получилась при использовании CountVectorizer и LogisticRegression