# Feature Engineering for Text Classification

## Setup

In [1]:
!pip install janome beautifulsoup4

Collecting janome
  Downloading Janome-0.4.1-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 1.3 MB/s 
Installing collected packages: janome
Successfully installed janome-0.4.1


## Imports

In [2]:
import string
import pandas as pd
from bs4 import BeautifulSoup
from janome.tokenizer import Tokenizer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## The dataset

今回は、データセットとして[Amazon Customer Reviews Dataset](https://s3.amazonaws.com/amazon-reviews-pds/readme.html)を使います。このデータセットは、Amazonの製品レビューにメタデータを付与して作成されたコーパスです。データの中に評価（1-5）が含まれているため、テキスト分類モデルの学習に使うことができます。

今回は以下の設定で使うことにします。

- Number of labels: 2.
- Size of training dataset: 8000.
- Size of evaluation dataset: 2000.
- Language: Japanese

### Load the Amazon Customer Reviews Dataset

In [7]:
# Printable ASCII characters, built once instead of on every call.
_ASCII_PRINTABLE = frozenset(string.printable)


def filter_by_ascii_rate(text, threshold=0.9):
    """Return True when ``text`` is not dominated by printable ASCII characters.

    The ASCII rate is the fraction of characters in ``text`` that belong to
    ``string.printable``. A text is kept (True) when that rate does not exceed
    ``threshold``.

    Args:
        text: Candidate text. Empty strings and non-``str`` values (for
            example the float NaN that stands in for a missing cell) are
            rejected.
        threshold: Largest ASCII rate, inclusive, at which the text is kept.

    Returns:
        bool: True if the ASCII rate is <= ``threshold``, otherwise False.
    """
    # The rate is undefined for empty text (division by zero) and ``len``
    # fails on non-string values, so neither can qualify.
    if not isinstance(text, str) or not text:
        return False
    ascii_count = sum(c in _ASCII_PRINTABLE for c in text)
    return ascii_count / len(text) <= threshold


def load_dataset(filename, n=5000, state=6):
    """Load review data and return a binary-labelled sample of review texts.

    The TSV is read with pandas, 3-star reviews are dropped, the remaining
    ratings are mapped to 0 (1-2 stars) or 1 (4-5 stars), reviews rejected by
    ``filter_by_ascii_rate`` are removed, the rows are shuffled, and the first
    ``n`` rows of each label are kept.

    Args:
        filename: Path or URL of the tab-separated file, passed to
            ``pandas.read_csv``.
        n: Maximum number of reviews kept per label.
        state: Seed for the shuffle (``random_state`` of ``DataFrame.sample``).

    Returns:
        tuple: ``(review_body values, star_rating values)`` of the sampled rows.
    """
    df = pd.read_csv(filename, sep='\t')

    # A missing review body cannot be classified; drop it before the
    # text-based filter below runs on it.
    df = df.dropna(subset=['review_body'])

    # Converts multi-class to binary-class.
    mapping = {1: 0, 2: 0, 4: 1, 5: 1}  # 1-2 stars -> 0, 4-5 stars -> 1
    # 3-star reviews are neither positive nor negative, so they are removed.
    df = df[df.star_rating != 3]
    # ``assign`` returns a new frame; assigning the column on the filtered
    # slice directly would raise SettingWithCopyWarning.
    df = df.assign(star_rating=df.star_rating.map(mapping))

    # extracts Japanese texts.
    is_jp = df.review_body.apply(filter_by_ascii_rate)
    df = df[is_jp]

    # sampling: shuffle, then keep the first n rows of each label.
    df = df.sample(frac=1, random_state=state)
    df = df.groupby('star_rating').head(n=n)
    return df.review_body.values, df.star_rating.values


# Location of the Japanese ("JP") reviews file, a gzipped TSV.
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz'
# Keep at most 5000 reviews per label: x holds the review texts, y the 0/1 labels.
x, y = load_dataset(url, n=5000)

### Preprocess the dataset

In [8]:
# Shared Janome tokenizer in wakati mode, created once and reused by tokenize().
t = Tokenizer(wakati=True)


def tokenize(text):
    """Split ``text`` into a list of token strings using the shared tokenizer.

    Args:
        text: The text to tokenize.

    Returns:
        list: The tokens of ``text`` in order.
    """
    # Janome 0.4.x returns a generator from tokenize(); materialise it so
    # callers get a reusable sequence that supports len() and indexing
    # (for example when a vectorizer builds n-grams from the tokens).
    return list(t.tokenize(text))


def clean_html(html, strip=False):
    """Return the text content of ``html`` with the markup removed.

    Args:
        html: Markup to parse with BeautifulSoup's ``'html.parser'`` backend.
        strip: Forwarded to ``get_text`` as its ``strip`` argument.

    Returns:
        The text extracted from the parsed document.
    """
    return BeautifulSoup(html, 'html.parser').get_text(strip=strip)

In [9]:
# Strip HTML markup from every review text.
# NOTE: this rebinds `x`, so re-running the cell passes already-cleaned text
# through clean_html again.
x = [clean_html(text, strip=True) for text in x]

## Experiments

### Various vectorization methods

In [6]:
def train_and_eval(x_train, y_train, x_test, y_test, vectorizer):
    """Fit a logistic-regression classifier on vectorized text and print its accuracy.

    Args:
        x_train: Training documents; ``vectorizer`` is fitted on these.
        y_train: Training labels.
        x_test: Evaluation documents, transformed with the fitted ``vectorizer``.
        y_test: Evaluation labels.
        vectorizer: Object exposing ``fit_transform`` and ``transform``.

    Returns:
        None. The accuracy on the evaluation set is printed with four decimals.
    """
    train_features = vectorizer.fit_transform(x_train)
    test_features = vectorizer.transform(x_test)

    model = LogisticRegression(solver='liblinear')
    model.fit(train_features, y_train)

    accuracy = accuracy_score(y_test, model.predict(test_features))
    print('{:.4f}'.format(accuracy))


print('Tokenization for faster experiments')
# Tokenize once up front and join the tokens with spaces, so that every
# vectorizer below works on the same pre-tokenized text.
x_tokenized = [' '.join(tokenize(text)) for text in x]
x_train, x_test, y_train, y_test = train_test_split(x_tokenized, y, test_size=0.2, random_state=42)

# (label, vectorizer) pairs, evaluated in this order.
experiments = [
    ('Binary', CountVectorizer(binary=True)),
    ('Count', CountVectorizer(binary=False)),
    ('TF-IDF', TfidfVectorizer()),
    ('Bigram', TfidfVectorizer(ngram_range=(1, 2))),
]
for label, vectorizer in experiments:
    print(label)
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

Tokenization for faster experiments
Binary
0.8385
Count
0.8365
TF-IDF
0.8510
Bigram
0.8545


### Feature Selection using SelectKBest

In [10]:
# Split the cleaned, untokenized reviews; tokenization happens inside the
# vectorizer through the `tokenize` callable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print('Vectorizing...')
vectorizer = CountVectorizer(tokenizer=tokenize)
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)
print(x_train.shape, x_test.shape, sep='\n')

print('Selecting features...')
# Keep the 7000 features ranked highest by mutual information with the label.
# Alternative from the original notebook: SelectKBest(k=7000) without
# score_func uses the library's default scoring function instead.
selector = SelectKBest(k=7000, score_func=mutual_info_classif)
selector.fit(x_train, y_train)
x_train_new = selector.transform(x_train)
x_test_new = selector.transform(x_test)
print(x_train_new.shape, x_test_new.shape, sep='\n')

print('Evaluating...')
clf = LogisticRegression(solver='liblinear').fit(x_train_new, y_train)
y_pred = clf.predict(x_test_new)
score = accuracy_score(y_test, y_pred)
print('{:.4f}'.format(score))

Vectorizing...
(8000, 40980)
(2000, 40980)
Selecting features...
(8000, 7000)
(2000, 7000)
Evaluating...
0.8370
