# ラクマの分類タスク


## インポート


In [8]:
import pandas as pd
from collections import Counter
import string
import jctconv
import emoji
import re
import os
from pathlib import Path

BASE_DIR = Path().resolve().parent
os.chdir(BASE_DIR)


## データチェック


In [10]:
df = pd.read_csv("data/rakuma.csv")


In [26]:
ctg_df = df.category.value_counts().reset_index()  # reset_indexでデータフレーム形式になる
ctg_df = ctg_df.rename(columns={"index": "category_name", "category": "num"})


In [32]:
df_ctg_1000 = ctg_df[ctg_df.num > 1000]


In [33]:
df_ctg_1000.num.sum()


63006

In [44]:
data = df.merge(
    df_ctg_1000[["category_name"]], left_on="category", right_on="category_name"
)


In [45]:
data.category.nunique()


22

## テキストの前処理


In [65]:
emojis = "".join(emoji.EMOJI_DATA.keys())


In [84]:
puncs = string.punctuation + "◆▼★②●☆■★【】『』「」、♪"


In [85]:
def han2zen(txt):
    txt = jctconv.h2z(txt, kana=True, digit=False, ascii=False)
    return jctconv.z2h(txt, kana=False, digit=True, ascii=True)


def remove_signs(txt):
    rm_signs = emojis + puncs
    for s in rm_signs:
        txt = txt.replace(s, " ")
    return txt


def clean_txt(txt):
    txt = han2zen(txt)
    txt = remove_signs(txt)
    txt_list = txt.upper().split()
    txt_list = [
        x for x in txt_list if len(x) > 1 and re.search(r"[亜-熙ぁ-んァ-ヶa-zA-Z]", x)
    ]
    return " ".join(list(Counter(txt_list)))


In [86]:
t = " ".join(["bike", "bike", "handle", "saddle", "2020", "#"])
print("t: ", t)
print(clean_txt(t))


t:  bike bike handle saddle 2020 #
['BIKE', 'HANDLE', 'SADDLE']


In [87]:
data.title.head()


0       テレカ 未使用品 仮面ライダー 生誕20周年 東映ビデオ販売 販促用非売品
1       送料無料 新品 DVD Perfume WORLD TOUR 1st 初回
2                      ☆ウルトラマン☆レーザーディスク ジャンク？
3         MAISHA （マイシャ）Sadao Watanabe　渡辺貞夫★LD
4    LD★PARKER'S MOOD（パーカーズ・ムード）LIVE AT BRAVA
Name: title, dtype: object

In [94]:
data["clean_title"]=data.title.apply(clean_txt)

In [88]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB


In [96]:
category2idx = {c: idx for idx, c in enumerate(data.category_name.unique())}
idx2category = {idx: c for idx, c in enumerate(data.category_name.unique())}


In [98]:
X=data.clean_title
y=data.category_name.apply(lambda x:category2idx[x])


## データを学習させる

In [119]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42,shuffle=True
)

In [126]:
X_train

12762    [MINT, 新品未使用, メダル大集合, ドラえもん, ポケモン, カーズ]
11121                  [すみっコぐらし, ブラシ付ミラー, 新品未使用]
47140       [ADIDAS, PHARRELL, HUMAN, RACE, NMD]
62381                            [PS, ウイニングイレブン]
22213                         [ジョジョと奇妙な冒険, シーザー]
                          ...                   
62570      [コールオブデューティアドバンスド・ウォーフェア, CALLOFDUTY]
38158          [新品, 鴻池剛, ニャアアアン, 猫♡ぽんた♡スウェット♡半袖]
860              [廃盤LP, 沢田研二, AUX, FEMMES, 女たちよ]
15795        [MINT, レア, キリ番, マイメロディ, バースデーマスコット]
56422             [掛軸, 小林雄山, 四季花図, 絹本, 共箱付, 掛け軸]
Name: clean_title, Length: 44104, dtype: object

In [120]:
pipeline = Pipeline(
    [
        ("bow", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("classifier", RandomForestClassifier()),
    ]
)

In [125]:
y_train.info()

<class 'pandas.core.series.Series'>
Int64Index: 44104 entries, 12762 to 56422
Series name: category_name
Non-Null Count  Dtype
--------------  -----
44104 non-null  int64
dtypes: int64(1)
memory usage: 1.7 MB


In [122]:
pipeline.fit(X_train,y_train)

AttributeError: 'list' object has no attribute 'lower'