### Sample program for Naive Bayes Classifier (Japanese sentences)  
The Japanese text must already be segmented into space-separated words (wakati-gaki)  
単純ベイズ分類器のサンプルプログラム(日本語文)  
日本語文は分かち書きになっている必要がある。    

#### Import libraries  

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

#### Parameters  

In [None]:
# Path of the input CSV file.
csv_in = 'livedoor_news50.csv'

# Raise pandas' display limits so that wide or long frames are shown
# with up to 999 columns / rows instead of being truncated.
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)

#### Read CSV file  

In [None]:
# Load the comma-separated file; its first line supplies the column names
# and no leading lines are skipped.
df = pd.read_csv(
    csv_in,
    sep=',',
    header=0,
    skiprows=0,
)
# Preview the first few rows.
display(df.head())

#### Check categories in the data  

In [None]:
# Count how many rows carry each category label.
category_counts = df['category'].value_counts()
print(category_counts)

#### Obtain X (data) and y (true label)     

In [None]:
# X: the 'text' column (classifier input); y: the 'category' column (true label).
X, y = df['text'], df['category']

#### Divide data into train data and test data  
NOTE: Specify random_state so that the train/test split is reproducible  

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=11,
)
# Report the size of each split (data, labels).
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

#### Collect words  

In [None]:
# Build the vocabulary from the training texts only.
# The custom token_pattern also keeps one-character tokens; scikit-learn's
# default pattern requires at least two word characters per token.
vectorizer = CountVectorizer(token_pattern='(?u)\\b\\w+\\b')
vectorizer.fit(X_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
# The result is converted to a plain list so `vocab` keeps its original type.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print('Vocabulary size:', len(vocab))
print(vocab[:10])  # debug

#### Make BoW (word frequency vectors)     

In [None]:
# Convert each text into a word-count vector using the fitted vocabulary.
X_train_bow = vectorizer.transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Show a summary (repr) of each resulting matrix under its own heading.
print('X_train_bow:', repr(X_train_bow), sep='\n')
print('X_test_bow:', repr(X_test_bow), sep='\n')

#### Display BoW  

In [None]:
# Densify the training count matrix and wrap it in a DataFrame:
# one row per training text (labelled with its true category),
# one column per vocabulary word.
dense_counts = X_train_bow.toarray()
Xbow = pd.DataFrame(data=dense_counts, index=y_train, columns=vocab)
display(Xbow.head())

#### Training of naive Bayes classifier  

In [None]:
# Fit a multinomial naive Bayes classifier on the training word counts.
# alpha=1.0 is the additive smoothing parameter.
model = MultinomialNB(alpha=1.0).fit(X_train_bow, y_train)

# Class labels seen during fitting.
print(model.classes_)

# Accuracy on the same data the model was trained on.
train_score = model.score(X_train_bow, y_train)
print('Train accuracy:', train_score)

#### Prediction using naive Bayes classifier after training  

In [None]:
# Predict a category for every test text and put the predictions next to
# the true labels.
y_test_pred = model.predict(X_test_bow)
pred_vs_true = {'pred': y_test_pred, 'true': y_test}
df_pred = pd.DataFrame(pred_vs_true)
# Replace the index inherited from y_test with a fresh 0..n-1 index.
df_pred = df_pred.reset_index(drop=True)
display(df_pred.head(10))

In [None]:
# Cross-tabulate predicted labels (rows) against true labels (columns).
ctab = pd.crosstab(index=df_pred['pred'], columns=df_pred['true'])
display(ctab)

In [None]:
# Accuracy of the trained model on the held-out test split.
test_score = model.score(X=X_test_bow, y=y_test)
print('Test accuracy:', test_score)

#### Set min_df (ignore words that appear in fewer than min_df documents)  

In [None]:
# Rebuild the vocabulary, this time dropping words that occur in fewer than
# 4 training documents (min_df=4). The token_pattern keeps one-character
# tokens, which scikit-learn's default pattern would discard.
vectorizer = CountVectorizer(min_df=4, token_pattern='(?u)\\b\\w+\\b')
vectorizer.fit(X_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
# The result is converted to a plain list so `vocab` keeps its original type.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print('Vocabulary size:', len(vocab))

# Re-vectorize both splits with the reduced vocabulary.
X_train_bow = vectorizer.transform(X_train)
print(repr(X_train_bow))
X_test_bow = vectorizer.transform(X_test)

# Retrain with the same smoothing setting and report accuracy on both splits.
# NOTE: this overwrites `vectorizer`, `vocab`, `X_train_bow`, `X_test_bow`
# and `model` from the earlier cells.
model = MultinomialNB(alpha=1.0)
model.fit(X_train_bow, y_train)
print('Train accuracy:', model.score(X_train_bow, y_train))
print('Test accuracy:', model.score(X_test_bow, y_test))

In [None]:
# Predict the test set with the current model and tabulate predicted labels
# (rows) against true labels (columns).
y_test_pred = model.predict(X_test_bow)
pred_vs_true = {'pred': y_test_pred, 'true': y_test}
df_pred = pd.DataFrame(pred_vs_true)
# Replace the index inherited from y_test with a fresh 0..n-1 index.
df_pred = df_pred.reset_index(drop=True)
ctab = pd.crosstab(index=df_pred['pred'], columns=df_pred['true'])
display(ctab)