In [None]:
### 作業目的: 使用樹型模型進行文章分類

本次作業主利用[Amazon Review data中的All Beauty](https://nijianmo.github.io/amazon/index.html)來進行review評價分類(文章分類)

資料中將review分為1,2,3,4,5分，而在這份作業，我們將評論改分為差評價、普通評價、優良評價(1,2-->1差評、3-->2普通評價、4,5-->3優良評價)

In [1]:
%load_ext autotime

time: 0 ns


### 載入套件

In [2]:
import json
import re
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

time: 6.16 s


### 資料前處理
文本資料較為龐大，這裡我們取前10000筆資料來進行作業練習

In [17]:
#load json data
all_reviews = []
cnt = 0
with open('All_Beauty.json', mode='r') as f:
    for s in f.readlines():
        all_reviews.append(json.loads(s))
        cnt += 1
        if cnt == 10000:
            break
        
all_reviews[0]

{'overall': 1.0,
 'verified': True,
 'reviewTime': '02 19, 2015',
 'reviewerID': 'A1V6B6TNIC10QE',
 'asin': '0143026860',
 'reviewerName': 'theodore j bigham',
 'reviewText': 'great',
 'summary': 'One Star',
 'unixReviewTime': 1424304000}

time: 922 ms


In [48]:
#parse label(overall) and corpus(reviewText)
#transform labels: 1,2 --> 1 and 3 --> 2 and 4,5 --> 3

corpus = [i.get('reviewText') for i in all_reviews]

change_rank = lambda s: 1.0 if s <= 2.0 else 3.0 if s >= 4.0 else 2.0
labels = [change_rank(s.get('overall')) for s in all_reviews]


time: 0 ns


In [90]:
#preprocessing data
pattern_punc = r"[,\.:]"
pattern_email = r"[\d\w]+@[\d\w]+"
pattern_newline = r"\n"

for i in range(len(corpus)):
    if corpus[i]:
        corpus[i] = re.sub(pattern_punc,' ',corpus[i])
        corpus[i] = re.sub(pattern_email,' ',corpus[i])
        corpus[i] = re.sub(pattern_newline,' ',corpus[i])
    else:
        corpus[i] = ' '



time: 156 ms


In [117]:
#split corpus and label into train and test
x_train, x_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.2, shuffle=True, random_state=1501)

len(x_train), len(x_test), len(y_train), len(y_test)

(8000, 2000, 8000, 2000)

time: 0 ns


In [118]:
#change corpus into vector
#you can use tfidf or BoW here

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_v = TfidfVectorizer()
tfidf_v.fit(corpus)

#transform training and testing corpus into vector form
x_train = tfidf_v.transform(x_train)
x_test = tfidf_v.transform(x_test)

time: 531 ms


### 訓練與預測

In [119]:
#build classification model (decision tree, random forest, or adaboost)
#start training

adatree = AdaBoostClassifier()
adatree.fit(x_train, y_train)

AdaBoostClassifier()

time: 4.27 s


In [120]:
# 
print(f"訓練集的正確率: {adatree.score(x_train, y_train)}")
print(f"訓練集的confusion_matrix:\n {confusion_matrix(y_train, adatree.predict(x_train))}")
print(f"訓練集的分類報告:\n {classification_report(y_train, adatree.predict(x_train))}")

訓練集的正確率: 0.909625
訓練集的confusion_matrix:
 [[ 214   10  308]
 [  22   27  258]
 [  87   38 7036]]
訓練集的分類報告:
               precision    recall  f1-score   support

         1.0       0.66      0.40      0.50       532
         2.0       0.36      0.09      0.14       307
         3.0       0.93      0.98      0.95      7161

    accuracy                           0.91      8000
   macro avg       0.65      0.49      0.53      8000
weighted avg       0.89      0.91      0.89      8000

time: 297 ms


In [121]:
#start inference
y_pred = adatree.predict(x_test)

time: 47 ms


In [122]:
print(f"測試集的正確率: {adatree.score(x_test, y_test)}")
print(f"測試集的confusion_matrix:\n {confusion_matrix(y_test, y_pred)}")
print(f"測試集的分類報告:\n {classification_report(y_test, y_pred)}")

測試集的正確率: 0.9015
測試集的confusion_matrix:
 [[  53    2   98]
 [   8    8   57]
 [  21   11 1742]]
測試集的分類報告:
               precision    recall  f1-score   support

         1.0       0.65      0.35      0.45       153
         2.0       0.38      0.11      0.17        73
         3.0       0.92      0.98      0.95      1774

    accuracy                           0.90      2000
   macro avg       0.65      0.48      0.52      2000
weighted avg       0.88      0.90      0.88      2000

time: 47 ms


由上述資訊可以發現, 模型在好評的準確度高(precision, recall都高), 而在差評的部分表現較不理想, 在普通評價的部分大部分跟差評搞混,
同學可以試著學習到的各種方法來提升模型的表現