In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


In [None]:
# Load the tab-separated reviews dataset.
df = pd.read_csv('reviews.csv', sep='\t')

# Drop the third class so that only a two-class problem remains.
# NOTE(review): the label is spelled 'neautral' here; presumably this matches
# the spelling used in the data file — confirm before "correcting" it.
df = df.loc[df['sentiment'].ne('neautral')]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X, y = df['review'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

df.head()

Unnamed: 0,review,sentiment
0,качество плохое пошив ужасный (горловина напер...,negative
1,"Товар отдали другому человеку, я не получила п...",negative
2,"Ужасная синтетика! Тонкая, ничего общего с пре...",negative
3,"товар не пришел, продавец продлил защиту без м...",negative
4,"Кофточка голая синтетика, носить не возможно.",negative


In [None]:
# Each vectorizer is fitted on the training split only; the test split is
# transformed with the already fitted vocabulary.

# Word unigrams.
vectorizer_uni = CountVectorizer(ngram_range=(1, 1))
X_train_uni, X_test_uni = (
    vectorizer_uni.fit_transform(X_train),
    vectorizer_uni.transform(X_test),
)

# Word bigrams.
vectorizer_bi = CountVectorizer(ngram_range=(2, 2))
X_train_bi, X_test_bi = (
    vectorizer_bi.fit_transform(X_train),
    vectorizer_bi.transform(X_test),
)

# Word unigrams and bigrams combined.
vectorizer_comb = CountVectorizer(ngram_range=(1, 2))
X_train_comb, X_test_comb = (
    vectorizer_comb.fit_transform(X_train),
    vectorizer_comb.transform(X_test),
)

# Character 3-grams.
vectorizer_char3 = CountVectorizer(ngram_range=(3, 3), analyzer='char')
X_train_char3, X_test_char3 = (
    vectorizer_char3.fit_transform(X_train),
    vectorizer_char3.transform(X_test),
)


In [None]:
# Character n-gram count features. Each vectorizer is fitted on the training
# split only; the test split is transformed with the fitted vocabulary.

# Character 4-grams.
vectorizer_char4 = CountVectorizer(ngram_range=(4, 4), analyzer='char')
X_train_char4, X_test_char4 = (
    vectorizer_char4.fit_transform(X_train),
    vectorizer_char4.transform(X_test),
)

# Character 5-grams.
vectorizer_char5 = CountVectorizer(ngram_range=(5, 5), analyzer='char')
X_train_char5, X_test_char5 = (
    vectorizer_char5.fit_transform(X_train),
    vectorizer_char5.transform(X_test),
)

# Character 3-grams and 4-grams.
vectorizer_char34 = CountVectorizer(ngram_range=(3, 4), analyzer='char')
X_train_char34, X_test_char34 = (
    vectorizer_char34.fit_transform(X_train),
    vectorizer_char34.transform(X_test),
)

# Character 4-grams and 5-grams.
vectorizer_char45 = CountVectorizer(ngram_range=(4, 5), analyzer='char')
X_train_char45, X_test_char45 = (
    vectorizer_char45.fit_transform(X_train),
    vectorizer_char45.transform(X_test),
)

# Character 3-, 4- and 5-grams.
vectorizer_char345 = CountVectorizer(ngram_range=(3, 5), analyzer='char')
X_train_char345, X_test_char345 = (
    vectorizer_char345.fit_transform(X_train),
    vectorizer_char345.transform(X_test),
)

In [None]:
# Helper that trains a model and reports its quality on the test split.
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and print test-set metrics.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place, so the caller's object is modified.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: True labels for the test split.

    Returns:
        None. The classification report and the accuracy are printed.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    report = classification_report(y_test, predictions)
    print(report)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

# Shared estimator instances, both with default hyper-parameters:
# a multinomial Naive Bayes classifier and a support vector machine.
nb_model, svm_model = MultinomialNB(), SVC()

In [None]:
# Count-based feature sets built in the cells above, keyed by the label that
# appears in the printed headings. Dict order fixes the order of the reports.
count_feature_sets = {
    "Unigrams": (X_train_uni, X_test_uni),
    "Bigrams": (X_train_bi, X_test_bi),
    "Combinations": (X_train_comb, X_test_comb),
    "Char 3-grams": (X_train_char3, X_test_char3),
    "Char 4-grams": (X_train_char4, X_test_char4),
    "Char 5-grams": (X_train_char5, X_test_char5),
    "Char 3-4-grams": (X_train_char34, X_test_char34),
    "Char 4-5-grams": (X_train_char45, X_test_char45),
    "Char 3-4-5-grams": (X_train_char345, X_test_char345),
}


def _to_binary(counts):
    """Return a copy of a sparse count matrix with every stored value set to 1.

    Assumes ``counts`` is a SciPy sparse matrix as produced by
    ``CountVectorizer`` (it must expose ``copy()`` and a ``data`` array).
    The input matrix itself is left untouched.
    """
    binary = counts.copy()
    binary.data.fill(1)
    return binary


# Naive Bayes on the raw n-gram counts.
for feature_label, (train_counts, test_counts) in count_feature_sets.items():
    print(f"Naive Bayes with {feature_label}:")
    train_and_evaluate(nb_model, train_counts, test_counts, y_train, y_test)

# Support vector machine on binary (presence/absence) features.
# The vectorizers above produce raw counts, so the matrices are binarized here
# to make the features match the "(Binary Weights)" headings. Copies are used
# so the count matrices shared with Naive Bayes stay unchanged.
for feature_label, (train_counts, test_counts) in count_feature_sets.items():
    print(f"SVM with {feature_label} (Binary Weights):")
    train_and_evaluate(
        svm_model,
        _to_binary(train_counts),
        _to_binary(test_counts),
        y_train,
        y_test,
    )


Naive Bayes with Unigrams:
              precision    recall  f1-score   support

    negative       0.93      0.89      0.91      5909
    positive       0.90      0.93      0.91      6091

    accuracy                           0.91     12000
   macro avg       0.91      0.91      0.91     12000
weighted avg       0.91      0.91      0.91     12000

Accuracy: 0.9100833333333334
Naive Bayes with Bigrams:
              precision    recall  f1-score   support

    negative       0.90      0.92      0.91      5909
    positive       0.92      0.90      0.91      6091

    accuracy                           0.91     12000
   macro avg       0.91      0.91      0.91     12000
weighted avg       0.91      0.91      0.91     12000

Accuracy: 0.913
Naive Bayes with Combinations:
              precision    recall  f1-score   support

    negative       0.93      0.92      0.93      5909
    positive       0.93      0.93      0.93      6091

    accuracy                           0.93     12000

In [None]:
# SVM on TF-IDF weighted word unigrams.
vectorizer_tfidf = TfidfVectorizer(ngram_range=(1, 1))
X_train_tfidf, X_test_tfidf = (
    vectorizer_tfidf.fit_transform(X_train),  # fit on the training split only
    vectorizer_tfidf.transform(X_test),       # reuse the fitted vectorizer for the test split
)

print("SVM with TF-IDF Unigrams:")
train_and_evaluate(svm_model, X_train_tfidf, X_test_tfidf, y_train, y_test)

SVM with TF-IDF Unigrams:
              precision    recall  f1-score   support

    negative       0.92      0.95      0.94      5909
    positive       0.95      0.92      0.94      6091

    accuracy                           0.94     12000
   macro avg       0.94      0.94      0.94     12000
weighted avg       0.94      0.94      0.94     12000

Accuracy: 0.9354166666666667


In [None]:
# SVM on TF-IDF weighted word bigrams.
vectorizer_tfidf = TfidfVectorizer(ngram_range=(2, 2))
X_train_tfidf, X_test_tfidf = (
    vectorizer_tfidf.fit_transform(X_train),  # fit on the training split only
    vectorizer_tfidf.transform(X_test),       # reuse the fitted vectorizer for the test split
)

print("SVM with TF-IDF Bigrams:")
train_and_evaluate(svm_model, X_train_tfidf, X_test_tfidf, y_train, y_test)

SVM with TF-IDF Bigrams:
              precision    recall  f1-score   support

    negative       0.92      0.92      0.92      5909
    positive       0.92      0.92      0.92      6091

    accuracy                           0.92     12000
   macro avg       0.92      0.92      0.92     12000
weighted avg       0.92      0.92      0.92     12000

Accuracy: 0.9229166666666667


In [None]:
# SVM on TF-IDF weighted word unigrams and bigrams combined.
vectorizer_tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_train_tfidf, X_test_tfidf = (
    vectorizer_tfidf.fit_transform(X_train),  # fit on the training split only
    vectorizer_tfidf.transform(X_test),       # reuse the fitted vectorizer for the test split
)

print("SVM with TF-IDF Combinations:")
train_and_evaluate(svm_model, X_train_tfidf, X_test_tfidf, y_train, y_test)

SVM with TF-IDF Combinations:
              precision    recall  f1-score   support

    negative       0.92      0.96      0.94      5909
    positive       0.96      0.92      0.94      6091

    accuracy                           0.94     12000
   macro avg       0.94      0.94      0.94     12000
weighted avg       0.94      0.94      0.94     12000

Accuracy: 0.94


In [None]:
# SVM on TF-IDF weighted character 3-grams.
vectorizer_tfidf = TfidfVectorizer(ngram_range=(3, 3), analyzer='char')
X_train_tfidf, X_test_tfidf = (
    vectorizer_tfidf.fit_transform(X_train),  # fit on the training split only
    vectorizer_tfidf.transform(X_test),       # reuse the fitted vectorizer for the test split
)

print("SVM with TF-IDF Char 3-grams:")
train_and_evaluate(svm_model, X_train_tfidf, X_test_tfidf, y_train, y_test)

SVM with TF-IDF Char 3-grams:
              precision    recall  f1-score   support

    negative       0.92      0.95      0.94      5909
    positive       0.95      0.92      0.94      6091

    accuracy                           0.94     12000
   macro avg       0.94      0.94      0.94     12000
weighted avg       0.94      0.94      0.94     12000

Accuracy: 0.93575


In [None]:
# SVM on TF-IDF weighted character 4-grams.
vectorizer_tfidf = TfidfVectorizer(ngram_range=(4, 4), analyzer='char')
X_train_tfidf, X_test_tfidf = (
    vectorizer_tfidf.fit_transform(X_train),  # fit on the training split only
    vectorizer_tfidf.transform(X_test),       # reuse the fitted vectorizer for the test split
)

print("SVM with TF-IDF Char 4-grams:")
train_and_evaluate(svm_model, X_train_tfidf, X_test_tfidf, y_train, y_test)

SVM with TF-IDF Char 4-grams:
              precision    recall  f1-score   support

    negative       0.93      0.95      0.94      5909
    positive       0.95      0.93      0.94      6091

    accuracy                           0.94     12000
   macro avg       0.94      0.94      0.94     12000
weighted avg       0.94      0.94      0.94     12000

Accuracy: 0.9394166666666667


In [None]:
# SVM on TF-IDF weighted character 5-grams.
vectorizer_tfidf = TfidfVectorizer(ngram_range=(5, 5), analyzer='char')
X_train_tfidf, X_test_tfidf = (
    vectorizer_tfidf.fit_transform(X_train),  # fit on the training split only
    vectorizer_tfidf.transform(X_test),       # reuse the fitted vectorizer for the test split
)

print("SVM with TF-IDF Char 5-grams:")
train_and_evaluate(svm_model, X_train_tfidf, X_test_tfidf, y_train, y_test)

SVM with TF-IDF Char 5-grams:
              precision    recall  f1-score   support

    negative       0.93      0.96      0.94      5909
    positive       0.96      0.93      0.94      6091

    accuracy                           0.94     12000
   macro avg       0.94      0.94      0.94     12000
weighted avg       0.94      0.94      0.94     12000

Accuracy: 0.9419166666666666


In [None]:
# SVM on TF-IDF weighted character 3-grams and 4-grams.
vectorizer_tfidf = TfidfVectorizer(ngram_range=(3, 4), analyzer='char')
X_train_tfidf, X_test_tfidf = (
    vectorizer_tfidf.fit_transform(X_train),  # fit on the training split only
    vectorizer_tfidf.transform(X_test),       # reuse the fitted vectorizer for the test split
)

print("SVM with TF-IDF Char 3-4-grams:")
train_and_evaluate(svm_model, X_train_tfidf, X_test_tfidf, y_train, y_test)

SVM with TF-IDF Char 3-4-grams:
              precision    recall  f1-score   support

    negative       0.93      0.95      0.94      5909
    positive       0.95      0.93      0.94      6091

    accuracy                           0.94     12000
   macro avg       0.94      0.94      0.94     12000
weighted avg       0.94      0.94      0.94     12000

Accuracy: 0.9395833333333333


In [None]:
# SVM on TF-IDF weighted character 4-grams and 5-grams.
vectorizer_tfidf = TfidfVectorizer(ngram_range=(4, 5), analyzer='char')
X_train_tfidf, X_test_tfidf = (
    vectorizer_tfidf.fit_transform(X_train),  # fit on the training split only
    vectorizer_tfidf.transform(X_test),       # reuse the fitted vectorizer for the test split
)

print("SVM with TF-IDF Char 4-5-grams:")
train_and_evaluate(svm_model, X_train_tfidf, X_test_tfidf, y_train, y_test)

SVM with TF-IDF Char 4-5-grams:
              precision    recall  f1-score   support

    negative       0.93      0.96      0.94      5909
    positive       0.96      0.93      0.94      6091

    accuracy                           0.94     12000
   macro avg       0.94      0.94      0.94     12000
weighted avg       0.94      0.94      0.94     12000

Accuracy: 0.9415


In [None]:
# SVM on TF-IDF weighted character 3-, 4- and 5-grams.
vectorizer_tfidf = TfidfVectorizer(ngram_range=(3, 5), analyzer='char')
X_train_tfidf, X_test_tfidf = (
    vectorizer_tfidf.fit_transform(X_train),  # fit on the training split only
    vectorizer_tfidf.transform(X_test),       # reuse the fitted vectorizer for the test split
)

print("SVM with TF-IDF Char 3-4-5-grams:")
train_and_evaluate(svm_model, X_train_tfidf, X_test_tfidf, y_train, y_test)

SVM with TF-IDF Char 3-4-5-grams:
              precision    recall  f1-score   support

    negative       0.93      0.96      0.94      5909
    positive       0.96      0.93      0.94      6091

    accuracy                           0.94     12000
   macro avg       0.94      0.94      0.94     12000
weighted avg       0.94      0.94      0.94     12000

Accuracy: 0.9426666666666667
