# read data

In [None]:
# Download the dataset archive with the Kaggle CLI.
# NOTE(review): presumably requires the kaggle package and API credentials to be set up — confirm.
!kaggle datasets download -d nicapotato/womens-ecommerce-clothing-reviews

In [2]:
# Extract the downloaded archive quietly (-q) into the working directory,
# then delete the zip file.
!unzip -q womens-ecommerce-clothing-reviews.zip
!rm womens-ecommerce-clothing-reviews.zip

 # データの前処理

以下のフローに沿ってモデルの精度を算出および評価をしましょう．

1. scikit-learnの`CountVectorizer`を使用して，`reviews_prepared`を単語の出現回数ベクトルに変換しましょう．
2. 分類器を作成（モデルは何でもいい）
3. 分類器の評価

『目的』
女性服のネット通販のレビュー分析を行います
そのために
カウントベースの手法を用いて、あるレビューが良いレビューか悪いレビューかを判断するモデルを作成します

`CountVectorizer`を用います．
各レビューに出現する単語の回数を数えるのに使います．

In [27]:
# py3
import nltk

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from pandas import read_csv
import collections
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amoeba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Load the raw review table from the extracted CSV.
dataset = read_csv('Womens Clothing E-Commerce Reviews.csv')
# 'Review Text' contains missing values. Casting NaN with astype('str') would turn
# each of them into the literal review "nan", which then leaks into the vocabulary,
# so replace missing reviews with an empty string before the cast.
# Row count and order are unchanged, which keeps `reviews` aligned with `recommend`.
reviews = dataset['Review Text'].fillna('').astype('str')
# Recommendation flag column, kept row-aligned with `reviews`.
recommend = dataset['Recommended IND']

In [20]:
# Preview the first rows of the raw table.
dataset.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [22]:
# Column dtypes and non-null counts of the raw table.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
Unnamed: 0                 23486 non-null int64
Clothing ID                23486 non-null int64
Age                        23486 non-null int64
Title                      19676 non-null object
Review Text                22641 non-null object
Rating                     23486 non-null int64
Recommended IND            23486 non-null int64
Positive Feedback Count    23486 non-null int64
Division Name              23472 non-null object
Department Name            23472 non-null object
Class Name                 23472 non-null object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


In [43]:
# Number of missing values in each column of the raw table.
dataset.isnull().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

欠損値あり

In [49]:
# Simple handling of missing data: drop every row that has a missing value in any column.
clear_dataset = dataset.dropna(axis=0, how='any')

In [50]:
# Per-column missing-value counts of the table produced by dropna().
clear_dataset.isnull().sum()

Unnamed: 0                 0
Clothing ID                0
Age                        0
Title                      0
Review Text                0
Rating                     0
Recommended IND            0
Positive Feedback Count    0
Division Name              0
Department Name            0
Class Name                 0
dtype: int64

In [53]:
# Column dtypes and non-null counts after dropping incomplete rows.
clear_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19662 entries, 2 to 23485
Data columns (total 11 columns):
Unnamed: 0                 19662 non-null int64
Clothing ID                19662 non-null int64
Age                        19662 non-null int64
Title                      19662 non-null object
Review Text                19662 non-null object
Rating                     19662 non-null int64
Recommended IND            19662 non-null int64
Positive Feedback Count    19662 non-null int64
Division Name              19662 non-null object
Department Name            19662 non-null object
Class Name                 19662 non-null object
dtypes: int64(6), object(5)
memory usage: 1.8+ MB


In [54]:
# Summary statistics of the cleaned table.
clear_dataset.describe()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Rating,Recommended IND,Positive Feedback Count
count,19662.0,19662.0,19662.0,19662.0,19662.0,19662.0
mean,11755.260655,921.297274,43.260808,4.183145,0.818177,2.652477
std,6772.063092,200.227528,12.258122,1.112224,0.385708,5.834285
min,2.0,1.0,18.0,1.0,0.0,0.0
25%,5888.25,861.0,34.0,4.0,1.0,0.0
50%,11749.5,936.0,41.0,5.0,1.0,1.0
75%,17624.75,1078.0,52.0,5.0,1.0,3.0
max,23485.0,1205.0,99.0,5.0,1.0,122.0


In [29]:
# Raw strings keep the regex escapes (\[ \] \|) out of Python's own string-escape
# processing; the non-raw form relies on invalid escape sequences, which newer
# Python versions warn about. The compiled patterns are unchanged.
# Characters that are replaced by a single space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character that is not a digit, lowercase ASCII letter, space, '#', '+' or '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stopword list from NLTK, held as a set for the membership test in review_prepare.
STOPWORDS = set(stopwords.words('english'))

def review_prepare(review):
    """Normalize one review string for bag-of-words counting.

    The text is lowercased, every REPLACE_BY_SPACE_RE match becomes a space,
    every BAD_SYMBOLS_RE match is deleted, and tokens found in STOPWORDS are
    dropped. The remaining tokens are returned joined by single spaces.

    Args:
        review: The review text as a str.

    Returns:
        The cleaned review as a single space-separated str (possibly empty).
    """
    lowered = review.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(" ", lowered)
    cleaned = BAD_SYMBOLS_RE.sub("", spaced)
    # str.split() with no argument treats any run of whitespace as one separator,
    # so repeated spaces need no separate collapsing pass before tokenizing.
    kept_tokens = filter(lambda token: token not in STOPWORDS, cleaned.split())
    return " ".join(kept_tokens)

# Clean every review, preserving the original order.
reviews_prepared = list(map(review_prepare, reviews))

In [63]:
# reviews_prepared

# CountVectorizerを使う
https://qiita.com/nazoking@github/items/033ca2d0f4c63ebc165e

In [55]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the cleaned reviews and build the review-by-term count matrix.
# NOTE(review): reviews_prepared holds every review, so despite "train" in the name
# this matrix is not restricted to a training split.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(reviews_prepared)

In [56]:
# Shape of the count matrix: (number of reviews, vocabulary size).
X_train_counts.shape

(23486, 19345)

In [59]:
# Look up the column index assigned to the token "algorithm";
# dict.get returns None when the token is not in the fitted vocabulary.
count_vect.vocabulary_.get('algorithm')

In [60]:
from sklearn.feature_extraction.text import TfidfTransformer

# use_idf=False disables the inverse-document-frequency re-weighting,
# so the counts are converted to term frequencies only.
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)

In [61]:
# Shape of the term-frequency matrix.
X_train_tf.shape

(23486, 19345)

In [62]:
# Fit the IDF weights on the count matrix, then apply them to that same matrix.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
# Display the shape of the tf-idf matrix as the cell output.
X_train_tfidf.shape

(23486, 19345)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Labels come from the `Recommended IND` column loaded into `recommend`.
# `twenty_train` is not defined anywhere in the visible notebook, so the earlier
# `twenty_train.target` argument raised NameError.
#
# Hold out a stratified 20% of the rows so the classifier can be scored on reviews
# it was not fitted on; the fixed random_state makes the split reproducible.
# NOTE(review): the count / tf-idf transforms were fitted on every row, so the
# vocabulary and IDF weights have still seen the held-out text.
X_fit, X_eval, y_fit, y_eval = train_test_split(
    X_train_tfidf,
    recommend,
    test_size=0.2,
    random_state=42,
    stratify=recommend,
)
model = MultinomialNB()
model.fit(X_fit, y_fit)
# Mean accuracy on the held-out rows, shown as the cell output.
model.score(X_eval, y_eval)

9/22 18:50
進捗としてメモ。  
まずtrainとtestデータに分ける必要があった。