## Данные

In [1]:
import pandas as pd

In [2]:
data = pd.read_json('train.json', encoding='utf-8')

In [3]:
len(data)

8263

In [4]:
data.head()

Unnamed: 0,id,sentiment,text
0,1945,negative,Досудебное расследование по факту покупки ЕНПФ...
1,1957,negative,Медики рассказали о состоянии пострадавшего му...
2,1969,negative,"Прошел почти год, как железнодорожным оператор..."
3,1973,negative,По итогам 12 месяцев 2016 года на территории р...
4,1975,negative,Астана. 21 ноября. Kazakhstan Today - Агентств...


In [5]:
set(data.sentiment)

{'negative', 'neutral', 'positive'}

## Классификация

a) Naive Bayes, CountVectorizer, без предобработки 

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

In [8]:
vec = CountVectorizer()
bow = vec.fit_transform(data.text)
X_train, X_test, y_train, y_test = train_test_split(bow, data.sentiment)

In [9]:
nb = MultinomialNB()
clf = nb.fit(X_train, y_train)

In [11]:
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

    negative       0.65      0.74      0.69       369
     neutral       0.73      0.62      0.67       979
    positive       0.67      0.77      0.72       718

    accuracy                           0.69      2066
   macro avg       0.68      0.71      0.69      2066
weighted avg       0.70      0.69      0.69      2066



b) -//-, с лемматизацией (pymorphy) и правильной токенизацией

In [12]:
from nltk import word_tokenize
import pymorphy2
morph_analyzer = pymorphy2.MorphAnalyzer()

def lemmatize(text):
    words = word_tokenize(text)
    lemmas = [morph_analyzer.parse(word)[0].normal_form for word in words] 
    return ' '.join(lemmas)

In [14]:
data['text_lemmatized'] = data['text'].apply(lemmatize)

In [15]:
bow1 = vec.fit_transform(data.text_lemmatized)
X_train, X_test, y_train, y_test = train_test_split(bow1, data.sentiment)

clf1 = nb.fit(X_train, y_train)

print(classification_report(y_test, clf1.predict(X_test)))

              precision    recall  f1-score   support

    negative       0.57      0.75      0.65       347
     neutral       0.76      0.57      0.65      1032
    positive       0.63      0.77      0.69       687

    accuracy                           0.67      2066
   macro avg       0.65      0.70      0.66      2066
weighted avg       0.69      0.67      0.67      2066



Результат хуже

с) -//-, без пунктуации

In [17]:
from nltk.corpus import stopwords
from string import punctuation

In [24]:
noise = list(punctuation)

vec1 = CountVectorizer(stop_words=noise)
bow2 = vec1.fit_transform(data.text)
X_train, X_test, y_train, y_test = train_test_split(bow2, data.sentiment)

clf2 = nb.fit(X_train, y_train)

print(classification_report(y_test, clf2.predict(X_test)))

              precision    recall  f1-score   support

    negative       0.65      0.72      0.68       369
     neutral       0.74      0.63      0.68       978
    positive       0.68      0.78      0.72       719

    accuracy                           0.70      2066
   macro avg       0.69      0.71      0.70      2066
weighted avg       0.70      0.70      0.70      2066



Результат немного улучшился

d) -//-, без пунктуации и стоп-слов

In [26]:
noise1 = stopwords.words('russian') + list(punctuation)

vec2 = CountVectorizer(stop_words=noise1)
bow3 = vec2.fit_transform(data.text)
X_train, X_test, y_train, y_test = train_test_split(bow3, data.sentiment)

clf3 = nb.fit(X_train, y_train)

print(classification_report(y_test, clf3.predict(X_test)))

              precision    recall  f1-score   support

    negative       0.66      0.73      0.69       363
     neutral       0.76      0.63      0.69       997
    positive       0.67      0.79      0.73       706

    accuracy                           0.70      2066
   macro avg       0.70      0.72      0.70      2066
weighted avg       0.71      0.70      0.70      2066



Результат почти не изменился

е) -//-, без пунткуации, стоп-слов, латиницы

In [28]:
noise2 = list(punctuation) + stopwords.words('russian') + list('abcdefghijklmnopqrstuvwxyz')

vec3 = CountVectorizer(stop_words=noise2)
bow4 = vec3.fit_transform(data.text)
X_train, X_test, y_train, y_test = train_test_split(bow4, data.sentiment)

clf4 = nb.fit(X_train, y_train)

print(classification_report(y_test, clf4.predict(X_test)))

              precision    recall  f1-score   support

    negative       0.64      0.70      0.67       365
     neutral       0.75      0.61      0.68      1005
    positive       0.65      0.79      0.72       696

    accuracy                           0.69      2066
   macro avg       0.68      0.70      0.69      2066
weighted avg       0.70      0.69      0.69      2066



Результат чуть хуже

f) LogReg, CountVectorizer, без пунктуации и стоп-слов

In [20]:
from sklearn.linear_model import LogisticRegression

In [29]:
X_train, X_test, y_train, y_test = train_test_split(bow3, data.sentiment)

lr = LogisticRegression()
clf5 = lr.fit(X_train, y_train)

print(classification_report(y_test, clf5.predict(X_test)))



              precision    recall  f1-score   support

    negative       0.70      0.58      0.63       383
     neutral       0.69      0.71      0.70      1014
    positive       0.67      0.71      0.69       669

    accuracy                           0.68      2066
   macro avg       0.69      0.67      0.67      2066
weighted avg       0.69      0.68      0.68      2066





Дефолтный LogReg работает хуже Наивного Байеса, попробуем подобрать гиперпараметры с помощью GridSearch

In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
grid_values = {'penalty': ['l1', 'l2'],'C':[0.001,.009,0.01,.09,1,5,10,25]}
grid_clf_acc = GridSearchCV(clf5, param_grid = grid_values)
grid_clf_acc.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.001, 0.009, 0.01, 0.09, 1, 5, 10, 25],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

Дефолтные гиперпараметры соответствуют оптимальным

g) Naive Bayes, TF-IDF Vectorizer, без пунктуации и стоп-слов

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
vec4 = TfidfVectorizer(min_df=1, max_df=0.99, stop_words=noise1)
bow5 = vec4.fit_transform(data.text)
X_train, X_test, y_train, y_test = train_test_split(bow5, data.sentiment)

clf6 = nb.fit(X_train, y_train)

print(classification_report(y_test, clf6.predict(X_test)))

              precision    recall  f1-score   support

    negative       0.90      0.08      0.14       358
     neutral       0.57      0.96      0.72      1047
    positive       0.82      0.34      0.48       661

    accuracy                           0.61      2066
   macro avg       0.76      0.46      0.45      2066
weighted avg       0.71      0.61      0.54      2066



Результат хуже

h) kNN, CountVectorizer, без пунктуации и стоп-слов

In [35]:
from sklearn.neighbors import KNeighborsClassifier

In [36]:
X_train, X_test, y_train, y_test = train_test_split(bow3, data.sentiment)

knn = KNeighborsClassifier()
clf7 = knn.fit(X_train, y_train)
print(classification_report(y_test, clf7.predict(X_test)))

              precision    recall  f1-score   support

    negative       0.45      0.52      0.48       367
     neutral       0.60      0.75      0.67       999
    positive       0.68      0.37      0.48       700

    accuracy                           0.58      2066
   macro avg       0.57      0.55      0.54      2066
weighted avg       0.60      0.58      0.57      2066



Результат хуже

i) DecisionTree, -//-

In [37]:
from sklearn.tree import DecisionTreeClassifier

In [38]:
decision_tree = DecisionTreeClassifier()
clf8 = decision_tree.fit(X_train, y_train)
print(classification_report(y_test, clf8.predict(X_test)))

              precision    recall  f1-score   support

    negative       0.51      0.49      0.50       367
     neutral       0.61      0.64      0.62       999
    positive       0.58      0.54      0.56       700

    accuracy                           0.58      2066
   macro avg       0.56      0.56      0.56      2066
weighted avg       0.58      0.58      0.58      2066



Результат снова хуже

**Лучший результат: F-мера = 70**

**Модель (clf3): Naive Bayes, CountVectorizer, без пунктуации и стоп-слов**