# Предсказание оценки по отзыву

In [10]:
import pandas as pd
from pandas.io.json._json import JsonReader

# Load the review records into a DataFrame; later cells read its
# `reviewText` and `overall` columns.
json_df = pd.read_json('data/review.json', orient='records')

# Bare last expression so the notebook renders the frame as the cell output.
json_df

Unnamed: 0,reviewText,overall
0,This app forces you to use the old invocation ...,1
1,"This skill works just fine, though I think a l...",4
2,So far I have no issues connecting LIFX bulbs ...,3
3,When giving instructions Alexa asks which ligh...,1
4,When trying to link an account a line of code ...,1
...,...,...
70487,This is a very brown based lipstick.,4
70488,"Meh, not ugly and definitely not cute color",2
70489,beautiful nude shade,5
70490,It is a lovely color. BUT it is something that...,4


In [11]:
import string
import inflect
import nltk
import ssl

# SECURITY NOTE: the block below swaps the default HTTPS context factory for
# the unverified one, i.e. TLS certificates are no longer checked for code in
# this kernel that relies on ssl's default context. It is kept as a
# best-effort workaround so nltk.download() works behind broken certificate
# stores; remove it where certificate verification works.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl module does not expose _create_unverified_context; keep defaults.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Resources needed by the preprocessing helpers defined below:
#   punkt     -> word_tokenize
#   stopwords -> stopwords.words("english")
#   wordnet   -> WordNetLemmatizer (was missing, so lemmatize_word failed on a
#                machine where the corpus had not been installed by hand)
# NOTE(review): newer NLTK releases look up 'punkt_tab' for word_tokenize —
# confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer


def text_lowercase(text):
    """Return ``text`` with all cased characters converted to lower case."""
    lowered = text.lower()
    return lowered


# Shared inflect engine; convert_number uses it to spell out digit tokens.
p = inflect.engine()


def convert_number(text):
    """Spell out every whitespace-delimited token that consists only of digits.

    Tokens for which ``str.isdigit()`` is true are passed to the module-level
    inflect engine ``p``; every other token is kept unchanged. The tokens are
    re-joined with single spaces, so runs of whitespace in ``text`` collapse.
    """
    converted = [
        p.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    ]
    return ' '.join(converted)


# Translation table mapping every character of string.punctuation to a space.
# Built once so each call is a single pass over the text instead of one
# str.replace pass per punctuation character.
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, ' ' * len(string.punctuation)
)


def remove_punctuation(text: str) -> str:
    """Replace every ASCII punctuation character in ``text`` with a space.

    Punctuation is replaced rather than deleted so that words joined only by
    punctuation (e.g. ``"stop-now"``) stay separate tokens.

    Args:
        text: Input string.

    Returns:
        A string of the same length with each ``string.punctuation``
        character turned into ``' '``.
    """
    return text.translate(_PUNCTUATION_TO_SPACE)


def remove_whitespace(text):
    """Strip leading/trailing whitespace and collapse inner runs to one space."""
    words = text.split()
    return ' '.join(words)


# Cache for the English stop-word set; filled on the first remove_stopwords call.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text):
    """Tokenize ``text`` and drop every token that is an English stop word.

    The stop-word set is fetched from the NLTK corpus reader once and reused,
    instead of being rebuilt on every call (this function runs once per
    review).

    Args:
        text: Input string; the comparison is case-sensitive, so callers are
            expected to lowercase first.

    Returns:
        The remaining tokens joined with single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Built lazily so defining this function does not need the corpus.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))

    kept = [
        word for word in word_tokenize(text)
        if word not in _ENGLISH_STOP_WORDS
    ]
    return ' '.join(kept)


# Single PorterStemmer instance shared by all stem_words calls.
stemmer = PorterStemmer()


def stem_words(text):
    """Tokenize ``text`` and return the Porter stem of each token, space-joined."""
    return ' '.join(stemmer.stem(token) for token in word_tokenize(text))


# Single WordNetLemmatizer instance shared by all lemmatize_word calls.
lemmatizer = WordNetLemmatizer()


def lemmatize_word(text):
    """Tokenize ``text`` and lemmatize each token as a verb, space-joined."""
    return ' '.join(
        lemmatizer.lemmatize(token, pos='v') for token in word_tokenize(text)
    )


# Preprocessing steps, applied by preproc in exactly this order.
# - remove_punctuation runs before convert_number because convert_number only
#   converts whitespace-delimited tokens made purely of digits ("15!" would
#   otherwise be skipped).
# - text_lowercase runs before remove_stopwords, whose membership test is
#   case-sensitive.
# NOTE(review): lemmatize_word runs on tokens that stem_words has already
# stemmed; WordNet lookups on Porter stems presumably match less often than on
# full words — confirm this order is intended.
pipeline = [text_lowercase, remove_punctuation, convert_number,
            remove_whitespace, remove_stopwords, stem_words,
            lemmatize_word]


def preproc(text):
    """Run ``text`` through every step of ``pipeline`` in order.

    Each step receives the previous step's output; the final result is
    returned.
    """
    result = text
    for step in pipeline:
        result = step(result)
    return result

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nitcu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nitcu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Quick check of the whole preprocessing pipeline on one short sample string.
preproc('Its me, Mario 15!')

'mario fifteen'

In [16]:
from tqdm import tqdm


def get_dataset(df: pd.DataFrame, preprocess=None):
    """Build the (texts, ratings) arrays used for training from a review frame.

    Rows containing a missing value in any column are dropped, the review text
    is preprocessed and the rating is cast to ``int``. Unlike the previous
    version, the caller's frame is left untouched (no ``inplace`` drop) and no
    column is assigned on a slice, which avoids ``SettingWithCopyWarning``.

    Args:
        df: Frame with at least the ``reviewText`` and ``overall`` columns.
        preprocess: Optional callable applied to every review text. Defaults
            to the notebook's ``preproc`` pipeline.

    Returns:
        A ``(texts, ratings)`` tuple of NumPy arrays with one entry per kept row.
    """
    if preprocess is None:
        # Resolved at call time so the default follows the current preproc.
        preprocess = preproc

    complete_rows = df.dropna()  # new frame; `df` itself is not modified
    texts = complete_rows['reviewText'].apply(preprocess)
    ratings = complete_rows['overall'].astype(int)
    return texts.values, ratings.values

In [18]:
# Preprocess every review and extract the feature/label arrays.
data, target = get_dataset(json_df)
# Drop the reference to the raw frame once the arrays exist. After this,
# re-running this cell requires re-running the load cell first.
del json_df
# Show the first preprocessed review together with its rating.
data[0], target[0]

('app forc use old invoc phrase tell lifx know review say refer new skill optim smart home one unfortun one two skill perform account link current stick cumbersom old phrase get new skill work',
 1)

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [20]:
from sklearn import svm
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

# Token counts -> TF-IDF weighting -> linear SVM, chained in one Pipeline so
# the vectorizer statistics are learned from the training texts only.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # random_state pins the solver's pseudo-random number generator so
    # repeated runs of the notebook fit the same model (the train/test split
    # above is already seeded with the same value).
    ('clf', svm.LinearSVC(C=1.0, random_state=42)),
])

text_clf.fit(X_train, y_train)

In [21]:
import numpy as np

predicted = text_clf.predict(X_test)

# Fraction of held-out reviews whose predicted rating equals the true rating.
accuracy = float(np.mean(predicted == y_test))

# Format with the '%' presentation type instead of round(x, 3) * 100: the
# multiplication after rounding can reintroduce binary floating-point noise
# (e.g. 0.57 * 100 == 56.99999999999999) in the printed value.
print(f'Accuracy: {accuracy:.1%}')

Accuracy: 69.3%
