In [1]:
import sys

# Make the directory two levels above the working directory importable
# (presumably where the project-local `preprocessing` and `utils` modules
# live -- verify against the repository layout).
# The membership check keeps the cell idempotent: re-running it no longer
# piles up duplicate entries on sys.path.
PROJECT_ROOT = '../../'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [10]:
from sklearn.model_selection import train_test_split
from preprocessing import clean_html, normalize_number, tokenize, tokenize_base_form
from utils import load_dataset, train_and_eval
import pandas as pd
import string

import warnings
warnings.filterwarnings('ignore')

## モジュール

In [None]:
def filter_by_ascii_rate(text, threshold=0.9):
    """Tell whether ``text`` is *not* dominated by printable ASCII characters.

    The rate is the fraction of characters in ``text`` that belong to
    ``string.printable``. The notebook uses this to keep Japanese reviews,
    which contain few such characters.

    Args:
        text: The string to inspect.
        threshold: Largest printable-ASCII rate that is still accepted.

    Returns:
        bool: True when the printable-ASCII rate is at most ``threshold``.
        False for an empty string or a non-string value (for example the
        float NaN that pandas uses for missing cells).
    """
    # An empty string would divide by zero and a NaN has no len(); neither
    # can qualify as text to keep, so reject them up front.
    if not isinstance(text, str) or not text:
        return False
    printable_chars = set(string.printable)
    printable_count = sum(c in printable_chars for c in text)
    return printable_count / len(text) <= threshold

## データロード

In [None]:
# Read the tab-separated review file into a DataFrame and preview it.
filename = '../data/amazon_reviews_multilingual_JP_v1_00.tsv'
df = pd.read_csv(
    filename,
    delimiter='\t',  # `delimiter` is pandas' alias for `sep`
)
df.head(n=3)

In [None]:
# Keep only the rows whose review body passes filter_by_ascii_rate,
# i.e. the reviews treated as Japanese text.
is_jp = df['review_body'].apply(filter_by_ascii_rate)
df = df.loc[is_jp]
df.head(n=3)

In [None]:
# Sampling: shuffle every row, then keep the first `n` rows of each
# star rating (fewer when a rating has less than `n` reviews).
random_state = 42
n = 1000

# frac=1.0 returns all rows, just in a seeded random order.
df = df.sample(frac=1.0, random_state=random_state)
grouped = df.groupby('star_rating')
df = grouped.head(n)
df.head(n=3)

In [None]:
# Number of sampled reviews per star rating.
rating_counts = df['star_rating'].value_counts()
rating_counts

In [3]:
# Load inputs `x` and targets `y` through the project helper
# (presumably review texts and star ratings -- verify against utils.load_dataset).
dataset_path = '../data/amazon_reviews_multilingual_JP_v1_00.tsv'
x, y = load_dataset(dataset_path, n=1000)

## データ分割

In [4]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

## 学習と評価

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Use the count of each word that occurs in the text as the features.
# `lowercase` must be a real boolean: None only worked because it is falsy,
# and scikit-learn releases that validate constructor parameters reject it.
vectorizer = CountVectorizer(lowercase=False, tokenizer=None, preprocessor=None)
x_train_vec = vectorizer.fit_transform(x_train)

In [None]:
# Show the document-term matrix with one column per vocabulary entry.
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
pd.DataFrame(x_train_vec.toarray(), columns=feature_names)

In [5]:
# Baseline experiment: only the `tokenize` function is supplied.
print('Tokenization only.')
train_and_eval(
    x_train, y_train,
    x_test, y_test,
    tokenize=tokenize,
)

Tokenization only.
0.4010


In [11]:
# Experiment: same tokenizer, with `clean_html` passed as the preprocessor.
print('Clean html.')
train_and_eval(
    x_train, y_train,
    x_test, y_test,
    tokenize=tokenize,
    preprocessor=clean_html,
)

Clean html.
0.4020


In [7]:
# Experiment: same tokenizer, with `normalize_number` passed as the preprocessor.
print('Normalize number.')
train_and_eval(
    x_train, y_train,
    x_test, y_test,
    tokenize=tokenize,
    preprocessor=normalize_number,
)

Normalize number.
0.3940


In [8]:
# Experiment: swap the tokenizer for `tokenize_base_form`.
print('Base form.')
train_and_eval(
    x_train, y_train,
    x_test, y_test,
    tokenize=tokenize_base_form,
)

Base form.
0.3930


In [9]:
# Experiment: baseline tokenizer with lowercasing switched on.
print('Lower text.')
train_and_eval(
    x_train, y_train,
    x_test, y_test,
    tokenize=tokenize,
    lowercase=True,
)

Lower text.
0.3960
