In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import tinysegmenter

In [2]:
# Import training data (first sheet of the workbook).
# The workbook handle is opened in a `with` block so it is closed as soon as
# the sheet has been read instead of staying open for the kernel's lifetime.
with pd.ExcelFile("devdata.xlsx") as devdata_file:
    devdata = pd.read_excel(devdata_file)

In [3]:
# Import test data (first sheet of the workbook).
# The workbook handle is opened in a `with` block so it is closed as soon as
# the sheet has been read instead of staying open for the kernel's lifetime.
with pd.ExcelFile("勘定科目一覧_回答用10社（配布用）.xlsx") as test_file:
    test_df = pd.read_excel(test_file)

In [4]:
# Preview only the first rows rather than rendering the whole test frame
test_df.head(10)

Unnamed: 0,COMPANY,ACCOUNTING_SYSTEM,ACCOUNT_NUMBER,ACCOUNT_NAME,ACCOUNT_GROUP
0,COMP_1,Super Stream,1111.0,現金,
1,COMP_1,Super Stream,1112.0,小口現金,
2,COMP_1,Super Stream,1113.0,つり銭,
3,COMP_1,Super Stream,1114.0,売上現金,
4,COMP_1,Super Stream,1121.0,当座預金,
5,COMP_1,Super Stream,1122.0,普通預金,
6,COMP_1,Super Stream,1123.0,通知預金,
7,COMP_1,Super Stream,1125.0,定期預金,
8,COMP_1,Super Stream,1126.0,別段預金,
9,COMP_1,Super Stream,1127.0,外貨預金,


In [5]:
# Replace NaN with '' in ACCOUNT_NAME so the tokenizer only ever sees strings.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on `df.ACCOUNT_NAME` mutates an intermediate Series, which is not guaranteed
# to update the parent frame (and never does under pandas Copy-on-Write).
devdata['ACCOUNT_NAME'] = devdata['ACCOUNT_NAME'].fillna('')
test_df['ACCOUNT_NAME'] = test_df['ACCOUNT_NAME'].fillna('')

In [6]:
# Japanese word-splitting tokenizer
segmenter = tinysegmenter.TinySegmenter()

# Bag-of-words vectorizer over 1- to 3-grams of the segmenter's tokens
vectorizer = CountVectorizer(
    tokenizer=segmenter.tokenize,
    ngram_range=(1, 3),
)

In [7]:
# Learn the vocabulary from the training account names and build their
# document-term count matrix in a single pass
train_transform_matrix = vectorizer.fit_transform(raw_documents=devdata.ACCOUNT_NAME)

In [8]:
# Check the shape of the transformed data: (documents, vocabulary terms)
n_documents, n_features = train_transform_matrix.shape
print((n_documents, n_features))

(11349, 27968)


In [9]:
# Expand the sparse bag-of-words matrix into a dense DataFrame
# (one row per account name, one column per vocabulary term).
# NOTE: toarray() materialises every zero entry, so memory use grows with
# rows x vocabulary size.
train_transform_df = pd.DataFrame(data=train_transform_matrix.toarray())

In [10]:
# Label the columns with the vocabulary terms.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); use the new method when it exists and fall back
# to the old one so the cell runs on both old and new versions.
_get_feature_names = (
    getattr(vectorizer, "get_feature_names_out", None)
    or vectorizer.get_feature_names
)
train_transform_df.columns = _get_feature_names()

In [11]:
# Inspect the vocabulary terms now used as column labels
feature_labels = train_transform_df.columns
print(feature_labels)

Index([' ', '  ', '   ', '    ', '      ', '       ', '     _', '     丸ノ',
       '     本店', '     東京',
       ...
       'ﾟ ﾛﾊﾞｲｵ 設計', 'ﾟ ﾛﾊﾞｲｵ 間接', 'ﾟ ﾝ', 'ﾟ ﾝｸﾁｬｰﾊﾞﾙﾌ', 'ﾟ ﾝｸﾞ', 'ﾟ ﾝｸﾞ )',
       'ﾟ ﾝﾄﾞﾙ', 'ﾟ ﾝﾄﾞﾙｶﾞｲﾄ', 'ﾟ ﾝﾌ', 'ﾟ ﾝﾌ ﾟ'],
      dtype='object', length=27968)


In [12]:
# Show the last 15 rows of the document-term frame
train_transform_df.iloc[-15:]

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,_,丸ノ,本店,東京,...,ﾟ ﾛﾊﾞｲｵ 設計,ﾟ ﾛﾊﾞｲｵ 間接,ﾟ ﾝ,ﾟ ﾝｸﾁｬｰﾊﾞﾙﾌ,ﾟ ﾝｸﾞ,ﾟ ﾝｸﾞ ),ﾟ ﾝﾄﾞﾙ,ﾟ ﾝﾄﾞﾙｶﾞｲﾄ,ﾟ ﾝﾌ,ﾟ ﾝﾌ ﾟ
11334,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11335,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11336,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11337,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11338,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11339,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11340,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11341,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11342,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11343,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Bag of Words (3-gram) modeling

In [13]:
# Multinomial Naive Bayes classifier, spelled out with its default settings
clf = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

In [14]:
# Training labels: the ACCOUNT_GROUP column as a plain array
y_train = devdata.ACCOUNT_GROUP.values

In [15]:
# Fit the classifier on the sparse count matrix produced by the vectorizer.
# It holds the same counts as train_transform_df, but it avoids feeding a
# fully densified frame to the estimator and matches the sparse, unlabeled
# matrix that is passed to predict()/predict_proba() for the test data.
clf.fit(train_transform_matrix, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
# Measure accuracy on the training data itself (an optimistic figure: the
# model has already seen these rows). Scoring uses the sparse count matrix,
# which holds the same counts as train_transform_df without the dense copy.
accuracy = clf.score(train_transform_matrix, y_train)
print("自分自身に対する予測精確率（過剰適合ですが...） %.3f" % accuracy)


自分自身に対する予測精確率（過剰適合ですが...） 0.912


# Test Data Prediction

In [17]:
# Vectorize the test account names with the vocabulary learned from the
# training data (no refitting). Despite its name, the result is the
# vectorizer's sparse count matrix, not a DataFrame.
test_transform_df = vectorizer.transform(raw_documents=test_df.ACCOUNT_NAME)

In [18]:
# Preview only the first rows rather than rendering the whole test frame
test_df.head(10)

Unnamed: 0,COMPANY,ACCOUNTING_SYSTEM,ACCOUNT_NUMBER,ACCOUNT_NAME,ACCOUNT_GROUP
0,COMP_1,Super Stream,1111.0,現金,
1,COMP_1,Super Stream,1112.0,小口現金,
2,COMP_1,Super Stream,1113.0,つり銭,
3,COMP_1,Super Stream,1114.0,売上現金,
4,COMP_1,Super Stream,1121.0,当座預金,
5,COMP_1,Super Stream,1122.0,普通預金,
6,COMP_1,Super Stream,1123.0,通知預金,
7,COMP_1,Super Stream,1125.0,定期預金,
8,COMP_1,Super Stream,1126.0,別段預金,
9,COMP_1,Super Stream,1127.0,外貨預金,


In [19]:
# Predicted ACCOUNT_GROUP label for every test row
y_pred = clf.predict(X=test_transform_df)

In [20]:
# Probability of the predicted label for every test row.
# predict_proba returns one column per class (ordered as clf.classes_), so a
# fixed column such as [:, 1] would report the second class's probability for
# every row. The row-wise maximum is the probability of the class that
# predict() selects.
y_pred_prob = clf.predict_proba(test_transform_df).max(axis=1)

In [22]:
# Collect the predicted label and its probability, one row per test record
prediction_columns = {}
prediction_columns['予測ACCOUNT_GROUP'] = y_pred
prediction_columns['予測の確率'] = y_pred_prob
df_preds = pd.DataFrame(prediction_columns)

In [24]:
# Place the predictions beside the test rows. The test frame's index is reset
# first so both sides share the same 0..n-1 labels and line up row by row.
test_rows = test_df.reset_index(drop=True)
df_answer = pd.concat([test_rows, df_preds], axis='columns')

In [28]:
# Write the answer sheet.
# 'ANSI' is only an alias of Python's Windows-only 'mbcs' codec: it raises
# LookupError on other platforms and its byte output depends on the machine's
# active code page. The code page is named explicitly instead.
# NOTE(review): cp932 assumes the file was produced on a Japanese-locale
# Windows machine (where ANSI == cp932) - confirm with the file's consumer.
df_answer.to_csv('勘定科目一覧_回答用10社（3-Gram Model予測）_王.csv', index=False, encoding='cp932')