# Preprocessing Text Data

Amazonのカスタマーレビューを用いて、レビュー本文から星評価（star rating）を予測するモデルを作成するサンプルです。

## Setup

In [None]:
!pip install janome beautifulsoup4

Collecting janome
  Downloading Janome-0.4.1-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 1.3 MB/s 
Installing collected packages: janome
Successfully installed janome-0.4.1


### Imports

In [None]:
import pandas as pd
import re
import string

from bs4 import BeautifulSoup
from janome.tokenizer import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## The dataset

### Load the Amazon Customer Reviews Datasets

In [None]:
def filter_by_ascii_rate(text, threshold=0.9):
    """Return True when ``text`` is judged to be a Japanese review.

    The share of characters that belong to ``string.printable`` (printable
    ASCII, not just letters) is computed, and the text is accepted when that
    share is at or below ``threshold``.

    Args:
        text: Review body to inspect.
        threshold: Largest printable-ASCII share (0.0-1.0) that is still
            accepted.

    Returns:
        bool: True if the printable-ASCII share is <= ``threshold``. False
        for empty or non-string input (for example the float NaN pandas uses
        for missing cells), because such input cannot be classified.
    """
    # A missing cell is a float NaN (not iterable) and an empty string would
    # divide by zero below, so reject both before computing the ratio.
    if not isinstance(text, str) or not text:
        return False
    printable_ascii = set(string.printable)
    rate = sum(c in printable_ascii for c in text) / len(text)
    return rate <= threshold


def load_dataset(filename, n=5000, state=6):
    """Load the review TSV and return a Japanese sample capped per rating.

    Args:
        filename: Path, URL or file-like object handed to ``pd.read_csv``.
            The data must be tab-separated and provide ``review_body`` and
            ``star_rating`` columns.
        n: Maximum number of reviews kept for each ``star_rating`` value.
        state: Seed used when shuffling the rows, so the sample is
            repeatable.

    Returns:
        tuple: ``(review_body values, star_rating values)`` taken from the
        sampled frame with ``.values``.
    """
    df = pd.read_csv(filename, sep='\t')

    # read_csv parses empty cells as NaN (a float). Such rows have no text to
    # classify and would make the per-row language check fail, so drop them.
    df = df.dropna(subset=['review_body'])

    # Keep only the rows whose body is judged to be Japanese.
    is_jp = df.review_body.apply(filter_by_ascii_rate)
    df = df[is_jp]

    # Shuffle first so that head() below takes a random subset of each group.
    df = df.sample(frac=1, random_state=state)
    # Take at most n rows from every star_rating group.
    df = df.groupby('star_rating').head(n=n)
    return df.review_body.values, df.star_rating.values


# Location of the .tsv.gz review file; load_dataset hands it directly to pd.read_csv.
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz'
# Keep at most 1000 Japanese reviews per star rating (x: review bodies, y: star ratings).
x, y = load_dataset(url, n=1000)

### Preprocess the dataset

In [None]:
# Single Janome tokenizer instance shared by tokenize() and tokenize_base_form().
t = Tokenizer()


def tokenize(text):
    """Split ``text`` into tokens with the shared Janome tokenizer ``t``.

    Args:
        text: String to tokenize.

    Returns:
        list: The items produced by ``t.tokenize(text, wakati=True)``,
        collected into a list so the result can be indexed, measured with
        ``len()`` and iterated more than once.
    """
    # Janome 0.4 and later return a one-shot generator here; materialise it so
    # this helper returns a list, like tokenize_base_form does.
    return list(t.tokenize(text, wakati=True))


def clean_html(html, strip=False):
    """Extract the text content of an HTML string.

    Args:
        html: Markup to parse with BeautifulSoup's ``'html.parser'`` backend.
        strip: Forwarded unchanged as ``get_text(strip=...)``.

    Returns:
        The value returned by ``get_text`` on the parsed document.
    """
    return BeautifulSoup(html, 'html.parser').get_text(strip=strip)


def tokenize_base_form(text):
    """Tokenize ``text`` and return the ``base_form`` of every token.

    Args:
        text: String to tokenize with the shared Janome tokenizer ``t``.

    Returns:
        list: One ``base_form`` value per token, in token order.
    """
    base_forms = []
    for morpheme in t.tokenize(text):
        base_forms.append(morpheme.base_form)
    return base_forms


def normalize_number(text, reduce=False):
    """Replace the digits in ``text`` with ``'0'``.

    Args:
        text: String to normalize.
        reduce: When true, each run of consecutive digits collapses into a
            single ``'0'``; otherwise every digit is replaced one-for-one.

    Returns:
        str: The text with its digits replaced.
    """
    digit_pattern = r'\d+' if reduce else r'\d'
    return re.sub(digit_pattern, '0', text)


def truncate(sequence, maxlen):
    """Return the leading part of ``sequence``, cut with the slice ``[:maxlen]``.

    Args:
        sequence: Any object that supports slicing.
        maxlen: Stop index of the slice.

    Returns:
        The sliced sequence (same type that slicing ``sequence`` produces).
    """
    head = slice(maxlen)
    return sequence[head]


def remove_url(html):
    """Unwrap every ``<a>`` element in ``html`` and return the markup as a string.

    Each anchor tag is replaced by its own children, so the tag (and the
    attributes it carries) disappears while the content inside it is kept.

    Args:
        html: Markup to parse with BeautifulSoup's ``'html.parser'`` backend.

    Returns:
        str: The document serialized with ``str(soup)`` after unwrapping.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # find_all / unwrap are the current Beautiful Soup 4 names for the
    # deprecated findAll / replaceWithChildren aliases.
    for anchor in soup.find_all('a'):
        anchor.unwrap()
    return str(soup)

In [None]:
# Hold out 20% of the samples as the test set; random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## The models

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
# NOTE(review): no visible cell reads from /content/drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

### Train and evaluate the models

In [None]:
def train_and_eval(x_train, y_train, x_test, y_test,
                   lowercase=False, tokenize=None, preprocessor=None):
    """Fit a bag-of-words logistic-regression classifier and print its test accuracy.

    Args:
        x_train: Training documents.
        y_train: Labels for ``x_train``.
        x_test: Evaluation documents.
        y_test: Labels for ``x_test``.
        lowercase: Forwarded as ``CountVectorizer(lowercase=...)``.
        tokenize: Forwarded as ``CountVectorizer(tokenizer=...)``.
        preprocessor: Forwarded as ``CountVectorizer(preprocessor=...)``.

    Returns:
        None. The accuracy is printed with four decimal places.
    """
    bow = CountVectorizer(lowercase=lowercase,
                          tokenizer=tokenize,
                          preprocessor=preprocessor)
    # The vocabulary is fitted on the training documents only; the test
    # documents are merely transformed with it.
    train_matrix = bow.fit_transform(x_train)
    test_matrix = bow.transform(x_test)

    model = LogisticRegression(solver='liblinear')
    model.fit(train_matrix, y_train)

    accuracy = accuracy_score(y_test, model.predict(test_matrix))
    print('{:.4f}'.format(accuracy))

#### Tokenization only

In [None]:
# Baseline run: only the tokenize() helper is supplied, no preprocessor, lowercase left off.
train_and_eval(x_train, y_train, x_test, y_test, tokenize=tokenize)

0.4020




#### Clean HTML

In [None]:
# Same tokenizer as the baseline, with clean_html supplied as the CountVectorizer preprocessor.
train_and_eval(x_train, y_train, x_test, y_test, tokenize=tokenize, preprocessor=clean_html)

0.4090




#### Normalize number

In [None]:
# Same tokenizer as the baseline, with normalize_number (default reduce=False) as the preprocessor.
train_and_eval(x_train, y_train, x_test, y_test, tokenize=tokenize, preprocessor=normalize_number)

0.3940


#### Base form

In [None]:
# Swap the tokenizer for tokenize_base_form, which returns each token's base_form.
train_and_eval(x_train, y_train, x_test, y_test, tokenize=tokenize_base_form)

0.3930


#### Lower text

In [None]:
# Same tokenizer as the baseline, with CountVectorizer's lowercase option switched on.
train_and_eval(x_train, y_train, x_test, y_test, tokenize=tokenize, lowercase=True)

0.3980


