### Sample program for Naive Bayes Classifier (small test sample)   
単純ベイズ分類器のサンプルプログラム(小サンプルでの実行例)  

#### Import libraries  

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

#### Make DataFrame  

In [2]:
# Toy training corpus: one row per document, given as
# [category label, whitespace-separated text].
df = pd.DataFrame(
    [
        ['O', 'USA great Democrats care mother'],
        ['H', 'mother love Democrats care mother USA'],
        ['T', 'Russia fake USA Mexico great Mexico haters'],
    ],
    columns=['category', 'text'],
)
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
display(df.head())

(3, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  3 non-null      object
 1   text      3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes
None


Unnamed: 0,category,text
0,O,USA great Democrats care mother
1,H,mother love Democrats care mother USA
2,T,Russia fake USA Mexico great Mexico haters


#### Obtain X_train (data) and y_train (true category)  

In [3]:
# Split the frame into model inputs (raw text) and targets (category labels).
X_train, y_train = df['text'], df['category']
print(X_train, y_train, sep='\n')

0               USA great Democrats care mother
1         mother love Democrats care mother USA
2    Russia fake USA Mexico great Mexico haters
Name: text, dtype: object
0    O
1    H
2    T
Name: category, dtype: object


#### Specify X_test (test data; no y_test is defined in this cell)  

In [4]:
# The single test document is wrapped in a list because X_test is later
# passed to vectorizer.transform() the same way X_train is.
X_test = [
    'USA Democrats mother',
]
print(X_test)

['USA Democrats mother']


#### Collect words  

In [5]:
# Learn the vocabulary (the distinct tokens) from the training texts.
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() converts the returned ndarray to a plain list so that `vocab`
# prints and behaves the same as with the old API.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print('Vocabulary size:', len(vocab))
print(vocab)

Vocabulary size: 10
['care', 'democrats', 'fake', 'great', 'haters', 'love', 'mexico', 'mother', 'russia', 'usa']


#### Make BoW (word frequency vectors)     

In [6]:
# Encode both corpora as bag-of-words count matrices using the vocabulary
# that was fitted on the training texts.
X_train_bow = vectorizer.transform(X_train)
X_test_bow = vectorizer.transform(X_test)
# Print only the repr (shape, dtype, number of stored elements) of each
# sparse matrix rather than its contents.
for label, matrix in (('X_train_bow', X_train_bow), ('X_test_bow', X_test_bow)):
    print(label + ':')
    print(repr(matrix))

X_train_bow:
<3x10 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>
X_test_bow:
<1x10 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>


#### Display BoW  

In [7]:
# Densify the sparse training counts so they can be shown as a labelled
# table: one row per training document (indexed by its category) and one
# column per vocabulary word.
dense_counts = X_train_bow.toarray()
Xbow = pd.DataFrame(data=dense_counts, index=y_train, columns=vocab)
display(Xbow)

Unnamed: 0_level_0,care,democrats,fake,great,haters,love,mexico,mother,russia,usa
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
O,1,1,0,1,0,0,0,1,0,1
H,1,1,0,0,0,1,0,2,0,1
T,0,0,1,1,1,0,2,0,1,1


#### Training of naive Bayes classifier  

In [8]:
# Prior probability of each category, keyed by label so the mapping is
# explicit. MultinomialNB applies `class_prior` positionally in the order of
# `model.classes_` (the sorted labels), not in the row order of `df`.
# NOTE(review): these are the values [15/50, 10/50, 25/50] as they are
# applied in classes_ order (H, O, T); confirm this is the intended
# assignment, since `df` lists the categories as O, H, T.
class_prior_by_label = {'H': 15/50, 'O': 10/50, 'T': 25/50}
prior_labels = sorted(class_prior_by_label)
model = MultinomialNB(
    alpha=1.0,  # additive (Laplace) smoothing
    class_prior=[class_prior_by_label[label] for label in prior_labels],
)
model.fit(X_train_bow, y_train)
# Guard against the priors being silently matched to the wrong classes.
assert list(model.classes_) == prior_labels, \
    'class_prior order does not match model.classes_'
print(model.classes_)
train_score = model.score(X_train_bow, y_train)
print('Train accuracy:', train_score)

['H' 'O' 'T']
Train accuracy: 1.0


#### Prediction using naive Bayes classifier after training  

In [12]:
# Show the repr of the sparse bag-of-words matrix built from X_test; this is
# the input passed to the classifier in the prediction cell.
X_test_bow

<1x10 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [13]:
# Class-membership probabilities for the test document, shown as a table
# with one column per class label in model.classes_ order.
proba = model.predict_proba(X_test_bow)
results = pd.DataFrame(data=proba, columns=model.classes_)
print('Prediction:')
display(results)
# Hard prediction: the single label the classifier assigns to the document.
predicted_label = model.predict(X_test_bow)
print(predicted_label)

Prediction:


Unnamed: 0,H,O,T
0,0.56466,0.304573,0.130767


['H']


In [14]:
# Print the raw probability array returned by predict_proba, which shows
# more decimal places than the DataFrame display of the same values.
print(proba)

[[0.56466036 0.30457269 0.13076695]]
