# Предсказание оценки по отзыву

In [13]:
import pandas as pd
from pandas.io.json._json import JsonReader

json_df = pd.read_json('data/All_Amazon_Review_5.json', lines=True, chunksize=5)

next(json_df)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote
0,2,False,"12 5, 2015",A3KUPJ396OQF78,B017O9P72A,Larry Russlin,Can only control one of two bulbs from one of ...,Buggy,1449273600,
1,5,False,"01 15, 2018",A3TXR8GLKS19RE,B017O9P72A,Nello,Great skill,Great,1515974400,
2,1,False,"01 4, 2018",A1FOHYK23FJ6CN,B017O9P72A,L. Ray Humphreys,Not happy. Can not connect to Alexa regardless.,Can not connect to ECHO,1515024000,2.0
3,1,False,"12 30, 2017",A1RRDX9AOST1AN,B017O9P72A,Viola,Can not connect a hue lights to Alexa. Linked ...,Connecting is a no go,1514592000,5.0
4,1,False,"12 29, 2017",AA4DHYT5YSSIT,B017O9P72A,angie anj,"The service works with google home, but doesn'...",Does not work,1514505600,5.0


In [2]:
import string
import inflect
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


def text_lowercase(text):
    return text.lower()


p = inflect.engine()


def convert_number(text):
    temp_str = text.split()
    new_string = []

    for word in temp_str:
        if word.isdigit():
            temp = p.number_to_words(word)
            new_string.append(temp)

        else:
            new_string.append(word)

    temp_str = ' '.join(new_string)
    return temp_str


def remove_punctuation(text: str):
    for punc in string.punctuation:
        text = text.replace(punc, ' ')
    return text


def remove_whitespace(text):
    return ' '.join(text.split())


def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)


stemmer = PorterStemmer()


def stem_words(text):
    word_tokens = word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    return ' '.join(stems)


lemmatizer = WordNetLemmatizer()


def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos='v') for word in word_tokens]
    return ' '.join(lemmas)


pipeline = [text_lowercase, remove_punctuation, convert_number,
            remove_whitespace, remove_stopwords, stem_words,
            lemmatize_word]


def preproc(text):
    copy_text = text
    for func in pipeline:
        copy_text = func(copy_text)
    return copy_text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nitcu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nitcu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
preproc('Its me, Mario 15!')

'mario fifteen'

In [4]:
from tqdm import tqdm


def get_dataset(df: JsonReader, size: int):
    dataset = pd.DataFrame(data=[], columns=['reviewText', 'overall'])
    pbar = tqdm(total=size)
    for i, chunk in enumerate(df):
        chunk.dropna(inplace=True)
        if i == size:
            break


        chunk['reviewText'] = chunk['reviewText'].apply(preproc)
        dataset = pd.concat([dataset, chunk[['reviewText', 'overall']]])
        pbar.update(1)

    dataset['overall'] = dataset['overall'].astype(int)
    return dataset.reviewText.values, dataset.overall.values

In [5]:
data, target = get_dataset(json_df, 20000)
del json_df
data[0], target[0]

100%|██████████| 20000/20000 [03:12<00:00, 103.67it/s]


('app forc use old invoc phrase tell lifx know review say refer new skill optim smart home one unfortun one two skill perform account link current stick cumbersom old phrase get new skill work',
 1)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [7]:
from sklearn import svm
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm.LinearSVC(C=1.0)),
])

text_clf.fit(X_train, y_train)

In [12]:
import numpy as np

predicted = text_clf.predict(X_test)

print(f'Accuracy: {round(float(np.mean(predicted == y_test)), 3) * 100}%')

Accuracy: 69.3%
