In [28]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder 

In [2]:
devdata_file = pd.ExcelFile("devdata.xlsx")
devdata = pd.read_excel(devdata_file)

In [3]:
# replace nan with '' in ACCOUNT_NAME
devdata.ACCOUNT_NAME.fillna('', inplace=True)

In [4]:
devdata_train, devdata_test = train_test_split(devdata, test_size = 0.2, random_state=200)

In [5]:
vectorizer_ng3 = CountVectorizer(ngram_range=(1,3))

In [6]:
train_transform_matrix = vectorizer_ng3.fit_transform(devdata_train['ACCOUNT_NAME'])

In [7]:
print(train_transform_matrix.shape)

(9079, 8734)


In [8]:
train_transform_matrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
# Convert bow_matrix into a DataFrame
train_transform_df = pd.DataFrame(train_transform_matrix.toarray())

In [10]:
train_transform_df.tail(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8724,8725,8726,8727,8728,8729,8730,8731,8732,8733
9069,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9070,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9071,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9072,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9073,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9074,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9075,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9076,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9077,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9078,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Map the column names to vocabulary 
train_transform_df.columns = vectorizer_ng3.get_feature_names()

In [12]:
train_transform_df.tail(10)

Unnamed: 0,000tgc使用料,000円以下,000円以下の飲食代,000円超,000円超の飲食代,0261297,0762210,0786203,0814262,10万円から,...,ﾚｽﾄﾗﾝ飲料収入割引,ﾚｽﾄﾗﾝ飲料収入割引 その他,ﾚｽﾄﾗﾝ飲料収入割引 割引ﾁｹｯﾄ類,ﾚｽﾄﾗﾝ飲料収入割引 ｱｿｼｴｰﾂ50,ﾚｽﾄﾗﾝ飲料収入割引 ｸﾞﾙｰﾌﾟ,ﾚｽﾄﾗﾝ飲料収入割引 ｸﾞﾙｰﾌﾟ 会員,ﾚｽﾄﾗﾝ飲料収入割引 ｸﾞﾙﾒﾊﾟｽﾎﾟｰﾄ,ﾚｽﾄﾗﾝ飲料収入割引 ﾚﾃﾞｨｰｽﾃﾞｰ,ﾚﾃﾞｨｰｽﾃﾞｰ,ﾛｲﾔﾘﾃｨ
9069,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9070,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9071,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9072,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9073,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9074,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9075,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9076,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9077,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9078,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# transform
test_transform_df = vectorizer_ng3.transform(devdata_test['ACCOUNT_NAME'])

# Bag of Words (BOW) modeling

In [14]:
clf = MultinomialNB()

In [15]:
y_train = devdata_train['ACCOUNT_GROUP'].values
y_test = devdata_test['ACCOUNT_GROUP'].values

In [16]:
# combine the other features with bag of words
train_transform_df

Unnamed: 0,000tgc使用料,000円以下,000円以下の飲食代,000円超,000円超の飲食代,0261297,0762210,0786203,0814262,10万円から,...,ﾚｽﾄﾗﾝ飲料収入割引,ﾚｽﾄﾗﾝ飲料収入割引 その他,ﾚｽﾄﾗﾝ飲料収入割引 割引ﾁｹｯﾄ類,ﾚｽﾄﾗﾝ飲料収入割引 ｱｿｼｴｰﾂ50,ﾚｽﾄﾗﾝ飲料収入割引 ｸﾞﾙｰﾌﾟ,ﾚｽﾄﾗﾝ飲料収入割引 ｸﾞﾙｰﾌﾟ 会員,ﾚｽﾄﾗﾝ飲料収入割引 ｸﾞﾙﾒﾊﾟｽﾎﾟｰﾄ,ﾚｽﾄﾗﾝ飲料収入割引 ﾚﾃﾞｨｰｽﾃﾞｰ,ﾚﾃﾞｨｰｽﾃﾞｰ,ﾛｲﾔﾘﾃｨ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
train_other_df = devdata_train[['COMPANY', 'ACCOUNTING_SYSTEM', 'BP_DIV']]
test_other_df = devdata_test[['COMPANY', 'ACCOUNTING_SYSTEM', 'BP_DIV']]

In [50]:
le = LabelEncoder()


train_other_df['COMPANY'] = le.fit_transform(train_other_df['COMPANY'])
train_other_df['ACCOUNTING_SYSTEM'] = le.fit_transform(train_other_df['ACCOUNTING_SYSTEM'])
train_other_df['BP_DIV'] = le.fit_transform(train_other_df['BP_DIV'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [51]:

test_other_df['COMPANY'] = le.fit_transform(test_other_df['COMPANY'])
test_other_df['ACCOUNTING_SYSTEM'] = le.fit_transform(test_other_df['ACCOUNTING_SYSTEM'])
test_other_df['BP_DIV'] = le.fit_transform(test_other_df['BP_DIV'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [49]:
test_other_df.head(5)

Unnamed: 0,COMPANY,ACCOUNTING_SYSTEM,BP_DIV
51,1,6,0
286,17,10,0
242,17,10,0
11020,23,11,0
7349,28,10,0


In [52]:
test_contat

NameError: name 'test_contat' is not defined

In [45]:
train_contat = pd.concat([train_other_df.reset_index(drop=True), train_transform_df], axis=1)

test_contat = pd.concat([test_other_df.reset_index(drop=True), test_transform_df], axis=1)


TypeError: cannot concatenate object of type "<class 'scipy.sparse.csr.csr_matrix'>"; only pd.Series, pd.DataFrame, and pd.Panel (deprecated) objs are valid

In [46]:
test_contat

NameError: name 'test_contat' is not defined

In [41]:
# Fit the classifier
clf.fit(train_contat, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [42]:
# Measure the accuracy
accuracy = clf.score(test_transform_df, y_test)
print("The accuracy of the classifier on the test set is %.3f" % accuracy)


ValueError: dimension mismatch