In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import tinysegmenter

In [2]:
# Read the workbook's default (first) sheet into a DataFrame. The ExcelFile
# handle is opened in a context manager so the underlying file is closed as
# soon as the sheet has been parsed, instead of staying open for the life of
# the kernel. `devdata_file` remains bound (to the closed handle) afterwards.
with pd.ExcelFile("devdata.xlsx") as devdata_file:
    devdata = pd.read_excel(devdata_file)

In [3]:
# Replace missing ACCOUNT_NAME values with '' so no NaN reaches the text
# vectorizer. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on `devdata.ACCOUNT_NAME` operates on an intermediate
# Series, which recent pandas warns about and which does not update `devdata`
# at all when copy-on-write is enabled.
devdata['ACCOUNT_NAME'] = devdata['ACCOUNT_NAME'].fillna('')

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
devdata_train, devdata_test = train_test_split(
    devdata,
    test_size=0.2,
    random_state=200,
)

In [6]:
# Split account names into tokens with TinySegmenter and build TF-IDF features
# over 1- to 3-grams of those tokens.
segmenter = tinysegmenter.TinySegmenter()
vectorizer = TfidfVectorizer(
    tokenizer=segmenter.tokenize,
    # The custom tokenizer replaces the default regex tokenization; passing
    # None states that explicitly and avoids scikit-learn's
    # "token_pattern will not be used" warning.
    token_pattern=None,
    ngram_range=(1, 3),
)

In [7]:
# Learn the vocabulary and IDF weights from the training names only, and
# encode those same names as a document-term matrix.
train_account_names = devdata_train['ACCOUNT_NAME']
train_transform_matrix = vectorizer.fit_transform(train_account_names)

In [8]:
# Show the first 20 training account names (by position, not by index label).
devdata_train['ACCOUNT_NAME'].head(20)

7444            用力費＿電力使用料
10220           供販その他租税課金
3531                少額資産費
11191                収入振替
9084            租税公課固定資産税
228            仮払金 適格年金第１
5967           半成工事 販売直接費
615           (特益)その他特別利益
8559          水道光熱費(電気料金)
5647             Ｔ／Ｃ  Ｄ部品
1792           給料手当－従業員給料
5846            製品仮（仕 経費）
4586                 構内請負
3637                未払給与金
7587                 未収収益
5042           仮払金Ｂ切手等引当金
6722            預り金Ｃ生保-三井
10277    附帯事業通報費通報サービス手数料
4931           法人、住民及び事業税
10800             その他営業外収
Name: ACCOUNT_NAME, dtype: object

In [9]:
# Report the dimensions of the training document-term matrix.
matrix_shape = train_transform_matrix.shape
print(matrix_shape)

(9079, 24676)


In [10]:
# Preview only the first few rows in dense form. Densifying the entire matrix
# just to display a truncated repr allocates rows x columns float64 values
# (about 1.8 GB for a 9079 x 24676 matrix), and the notebook's output cache
# would keep that array alive.
train_transform_matrix[:5].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
# Wrap the dense form of the training TF-IDF matrix in a DataFrame; columns
# are positional integers until they are renamed.
train_transform_df = pd.DataFrame(data=train_transform_matrix.toarray())

In [12]:
# Inspect the last 10 rows of the feature frame.
train_transform_df.iloc[-10:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24666,24667,24668,24669,24670,24671,24672,24673,24674,24675
9069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9070,0.114494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Label each column with the vocabulary term it represents.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides
# so the cell runs on both old and new releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
train_transform_df.columns = feature_names

In [14]:
# Inspect the last 10 rows again, now with vocabulary terms as column labels.
train_transform_df.iloc[-10:]

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,_,丸ノ,本店,東京,...,ﾟ ﾛ環 技設,ﾟ ﾛ関係,ﾟ ﾛﾊﾞｲｵ,ﾟ ﾛﾊﾞｲｵ 見積,ﾟ ﾛﾊﾞｲｵ 設計,ﾟ ﾝｸﾁｬｰﾊﾞﾙﾌ,ﾟ ﾝｸﾞ,ﾟ ﾝｸﾞ ),ﾟ ﾝﾄﾞﾙ,ﾟ ﾝﾄﾞﾙｶﾞｲﾄ
9069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9070,0.114494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Encode the held-out names with the vocabulary and IDF weights already fitted
# on the training split (transform only, no refit).
# NOTE(review): despite the `_df` suffix this is presumably the vectorizer's
# matrix output rather than a DataFrame - confirm before using DataFrame methods.
test_account_names = devdata_test['ACCOUNT_NAME']
test_transform_df = vectorizer.transform(test_account_names)

# TF-IDF modeling

In [16]:
# Multinomial Naive Bayes classifier with its default hyper-parameters
# written out explicitly.
clf = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

In [17]:
# Target labels (ACCOUNT_GROUP) for the train and test splits, as raw arrays.
y_train, y_test = (
    split['ACCOUNT_GROUP'].values
    for split in (devdata_train, devdata_test)
)

In [18]:
# Keep the non-TF-IDF columns of the training split together in one frame.
train_other_df = devdata_train[
    [
        'COMPANY',
        'ACCOUNTING_SYSTEM',
        'BP_DIV',
        'ACCOUNT_NAME',
    ]
]

In [19]:
# Fit the classifier on the vectorizer's matrix output rather than on the
# dense DataFrame copy of it. The feature values are the same, but this keeps
# training and scoring on the same representation (the test features are the
# vectorizer's matrix output too), which avoids scikit-learn's feature-name
# mismatch warning and does not require the dense frame at all.
clf.fit(train_transform_matrix, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
# Score the fitted classifier on the held-out split and report the mean
# accuracy to three decimal places.
accuracy = clf.score(test_transform_df, y_test)
accuracy_message = "The accuracy of the classifier on the test set is %.3f" % accuracy
print(accuracy_message)


The accuracy of the classifier on the test set is 0.608
