In [None]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
import pandas as pd

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [None]:
' :: '.join(brown.categories())

'adventure :: belles_lettres :: editorial :: fiction :: government :: hobbies :: humor :: learned :: lore :: mystery :: news :: religion :: reviews :: romance :: science_fiction'

In [None]:
first_category = 'government'
second_category = 'romance'

first_corpus = [' '.join(words) for words in brown.sents(categories=[first_category])]
second_corpus = [' '.join(words) for words in brown.sents(categories=[second_category])]
df1 = pd.DataFrame()
df1['text'] = first_corpus
df1['category'] = first_category
df2 = pd.DataFrame()
df2['text'] = second_corpus
df2['category'] = second_category
df = pd.concat([df1, df2], ignore_index=True)
df.head()

Unnamed: 0,text,category
0,The Office of Business Economics ( OBE ) of th...,government
1,"It develops and analyzes the national income ,...",government
2,Such measures are essential to its job of pres...,government
3,Contact,government
4,"For further information contact Director , Off...",government


In [None]:
whole_corpus = df['text']
y = df['category']
whole_corpus[0]

'The Office of Business Economics ( OBE ) of the U.S. Department of Commerce provides basic measures of the national economy and current analysis of short-run changes in the economic situation and business outlook .'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(whole_corpus)
X[0]

<1x11510 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

# ML process

## Split dataset

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

## Apply algorithm

In [None]:
from sklearn.naive_bayes import MultinomialNB

bayes = MultinomialNB()

bayes_fit = bayes.fit(X_train, y_train)

y_pred = bayes_fit.predict(X_test)

In [None]:
y_test.value_counts()

romance       1301
government     938
Name: category, dtype: int64

## Quality check

In [None]:
from sklearn.metrics import classification_report, accuracy_score

print('We achieved accuracy %s with Naive Bayes' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred, target_names=[first_category, second_category]))

We achieved accuracy 0.9316659222867352 with Naive Bayes
              precision    recall  f1-score   support

  government       0.97      0.86      0.91       938
     romance       0.91      0.98      0.94      1301

    accuracy                           0.93      2239
   macro avg       0.94      0.92      0.93      2239
weighted avg       0.93      0.93      0.93      2239



In [None]:
def predict_new_sentence(sentence):
  X = vectorizer.transform([sentence])
  return bayes_fit.predict_proba(X)

gov_sample = "mr president rules over country"
print(predict_new_sentence(gov_sample))

[[0.74675201 0.25324799]]


In [None]:
romance_sample = "You are my love!"
print(predict_new_sentence(romance_sample))

[[0.06508427 0.93491573]]


In [None]:
romance_sample = "Two countries quarrel about a land between them"
print(predict_new_sentence(romance_sample))

[[0.79606349 0.20393651]]


In [None]:
sample = "I am watching news about president"
print(predict_new_sentence(sample))

[[0.31164133 0.68835867]]
