In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/simplesentiment/products_sentiment_sample_submission.csv
/kaggle/input/simplesentiment/products_sentiment_test.tsv
/kaggle/input/simplesentiment/products_sentiment_train.tsv


In [2]:
 # отключим предупреждения
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report

from nltk import word_tokenize  
from nltk.stem.snowball import PorterStemmer

In [3]:
# Загружаем данные
train = pd.read_csv('/kaggle/input/simplesentiment/products_sentiment_train.tsv', sep = '\t', header = None, names = ['text', 'y'])
test = pd.read_csv('/kaggle/input/simplesentiment/products_sentiment_test.tsv', sep = '\t')

In [4]:
print ("Количество размеченных отзывов: %d" % (train.shape[0]))
print ("Количество позитивных отзывов: %d (%0.1f%%)" % (train.y.sum(), 100.*train.y.mean()))
print ("Количество тестовых отзывов: %d" % (test.shape[0]))

Количество размеченных отзывов: 2000
Количество позитивных отзывов: 1274 (63.7%)
Количество тестовых отзывов: 500


In [5]:
# Пример нескольких отзывов
pd.set_option('max_colwidth', 300)
train.head()

Unnamed: 0,text,y
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer associates ez firewall and antivirus and fell in love with a computer security system all over again .,1
2,the wrt54g plus the hga7t is a perfect solution if you need wireless coverage in a wider area or for a hard-walled house as was my case .,1
3,"i dont especially like how music files are unstructured ; basically they are just dumped into one folder with no organization , like you might have in windows explorer folders and subfolders .",0
4,i was using the cheapie pail ... and it worked ok until the opening device fell apart .,1


In [6]:
# Создадим вспомогательных аналайзер на основе стеммера Портера
stemmer = PorterStemmer()
analyzer = TfidfVectorizer().build_analyzer()

def stemmed(text):
    return (stemmer.stem(word) for word in analyzer(preprocess(text)))

In [7]:
# Заменим 't на not
def preprocess(text):
    return text.replace(" 't", " not")

train['x'] = train.text.apply(preprocess)
test['x'] = test.text.apply(preprocess)

In [8]:
# Объединим векторизованные фичи разных типов токенов
union = FeatureUnion([("word11", TfidfVectorizer(ngram_range=(1,1), analyzer='word')),
                      ("stem11", TfidfVectorizer(ngram_range=(1,1), analyzer=stemmed)),
                      ("word23", TfidfVectorizer(ngram_range=(2,3), analyzer='word')),
                      ("stem23", TfidfVectorizer(ngram_range=(2,3), analyzer=stemmed)),
                      ("char14", TfidfVectorizer(ngram_range=(1,4), analyzer='char'))])

# Объединим в Pipeline с линейной регрессией в качестве классификатора
pipe = Pipeline([("vectorizer", union),
                 ("classifier", LogisticRegression(penalty = 'l2'))])

# Расчитаем точность по кроссвалидации
scores = cross_val_score(pipe, train.x, train.y, cv = 5)

print ("Средняя точность: %0.2f%%" % (100.*scores.mean()))
print ("Среднеквадратичное отклонение: %0.4f" % scores.std())

Средняя точность: 79.25%
Среднеквадратичное отклонение: 0.0078


In [9]:
# Посмотрим на ошибки
X_train, X_test, y_train, y_test = train_test_split(train.x, train.y, test_size=0.2, random_state=0)
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
p_test = pipe.predict_proba(X_test)
check = pd.DataFrame(X_test)
check['y'] = y_test
check['y_pred'] = y_pred
check['p0'] = p_test[:,0]
check['p1'] = p_test[:,1]

#check.head()
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.59      0.66       149
           1       0.79      0.89      0.83       251

    accuracy                           0.78       400
   macro avg       0.77      0.74      0.75       400
weighted avg       0.78      0.78      0.77       400



In [10]:
# Обучим классификатор на всех размененных данных
pipe.fit(train.x, train.y)
test['y'] = pipe.predict(test.x)

# Запишем в файл решение для загрузки на Kaggle
test[['Id','y']].to_csv('product-reviews-sentiment-analysis-light.csv', index = False)


In [11]:
# Проверим, что записалось корректно
! head -5 product-reviews-sentiment-analysis-light.csv

Id,y
0,1
1,0
2,1
3,1
