### Sample program for Naive Bayes Classifier (small test sample)   
単純ベイズ分類器のサンプルプログラム(小サンプルでの実行例)  

#### Import libraries  

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

#### Make DataFrame  

In [2]:
# Toy training corpus: one row per fruit, holding its category label and a
# whitespace-separated list of descriptive words. The irregular spacing inside
# the text strings is kept exactly as in the original data.
df = pd.DataFrame(
    [
        ['Apple', 'red  sweetness  skin  seeds'],
        ['Strawberry', 'red  sweetness  sourness  sweetness'],
        ['Orange', ' orange  skin  sourness  sweetness  '],
    ],
    columns=['category', 'text'],
)
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
display(df.head())

(3, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  3 non-null      object
 1   text      3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes
None


Unnamed: 0,category,text
0,Apple,red sweetness skin seeds
1,Strawberry,red sweetness sourness sweetness
2,Orange,orange skin sourness sweetness


#### Obtain X_train (data) and y_train (true category)  

In [3]:
# Split the frame into model inputs (raw text) and targets (category labels).
X_train, y_train = df['text'], df['category']
print(X_train)
print(y_train)

0             red  sweetness  skin  seeds
1     red  sweetness  sourness  sweetness
2     orange  skin  sourness  sweetness  
Name: text, dtype: object
0         Apple
1    Strawberry
2        Orange
Name: category, dtype: object


#### Specify X_test  

In [4]:
# A single document to classify, written in the same space-separated word
# format as the training texts.
X_test = [
    'skin sweetness sourness',
]
print(X_test)

['skin sweetness sourness']


#### Collect words  

In [5]:
# Learn the vocabulary (the distinct tokens) from the training texts.
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert it to a plain list to keep `vocab` (and its printed form) a list of
# Python strings, as before.
vocab = vectorizer.get_feature_names_out().tolist()
print('Vocabulary size:', len(vocab))
print(vocab)

Vocabulary size: 6
['orange', 'red', 'seeds', 'skin', 'sourness', 'sweetness']


#### Make BoW (word frequency vectors)     

In [6]:
# Turn each text into a word-count vector using the already-fitted vectorizer;
# train and test go through the same vectorizer so their columns line up.
X_train_bow = vectorizer.transform(X_train)
X_test_bow = vectorizer.transform(X_test)
# repr() of a sparse matrix is a short summary (shape, dtype, stored elements).
print('X_train_bow:', repr(X_train_bow), sep='\n')
print('X_test_bow:', repr(X_test_bow), sep='\n')

X_train_bow:
<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>
X_test_bow:
<1x6 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>


#### Display BoW  

In [7]:
# Dense, labelled view of the training counts: rows are indexed by the
# category labels and columns are named after the vocabulary words.
Xbow = pd.DataFrame(
    data=X_train_bow.toarray(),
    index=y_train,
    columns=vocab,
)
display(Xbow)

Unnamed: 0_level_0,orange,red,seeds,skin,sourness,sweetness
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Apple,0,1,1,1,0,1
Strawberry,0,1,0,0,1,2
Orange,1,0,0,1,1,1


#### Training of naive Bayes classifier  

In [8]:
# Multinomial naive Bayes with additive smoothing (alpha=1.0) and fixed class
# priors. The class_prior entries are matched positionally to model.classes_
# (printed below), which is in sorted label order rather than DataFrame row
# order: Apple=15/50, Orange=10/50, Strawberry=25/50.
# NOTE(review): confirm these are the intended per-class priors.
model = MultinomialNB(
    alpha=1.0,
    class_prior=[15/50, 10/50, 25/50],
).fit(X_train_bow, y_train)  # fit() returns the estimator itself
print(model.classes_)
# Accuracy measured on the same data the model was trained on.
train_score = model.score(X_train_bow, y_train)
print('Train accuracy:', train_score)

['Apple' 'Orange' 'Strawberry']
Train accuracy: 1.0


#### Prediction using naive Bayes classifier after training  

In [9]:
# Show the summary repr of the test count vector that will be classified.
X_test_bow

<1x6 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [10]:
# Class probabilities for the test document; labelling the columns with
# model.classes_ makes it explicit which class each probability belongs to.
proba = model.predict_proba(X_test_bow)
results = pd.DataFrame(
    data=proba,
    columns=model.classes_,
)
print('Prediction:')
display(results)
# Predicted label for the test document.
print(model.predict(X_test_bow))

Prediction:


Unnamed: 0,Apple,Orange,Strawberry
0,0.206897,0.275862,0.517241


['Strawberry']


In [11]:
# Raw probability array, printed without the rounding of the table display.
print(proba)

[[0.20689655 0.27586207 0.51724138]]
