In [39]:
import numpy as np
import pandas as pd
import sklearn

# Load the labelled training documents; the trailing bare expression renders the frame.
train_csv_path = '../datasets/example_train.csv'
train_docs = pd.read_csv(train_csv_path)
train_docs

Unnamed: 0,Document,Class
0,Upgrad is a great educational institution.,education
1,Educational greatness depends on ethics,education
2,A story of great ethics and educational greatness,education
3,Sholey is a great cinema,cinema
4,good movie depends on good story,cinema


In [40]:
# Encode the class names as the string labels '1' (education) and '0' (cinema).
# The identity entries for '1' and '0' keep this cell safe to re-run: without
# them a second execution would look up the already-encoded labels, find no
# match, and silently turn the whole column into NaN.
class_to_label = {'education': '1', 'cinema': '0', '1': '1', '0': '0'}
train_docs['Class'] = train_docs['Class'].map(class_to_label)
train_docs

Unnamed: 0,Document,Class
0,Upgrad is a great educational institution.,1
1,Educational greatness depends on ethics,1
2,A story of great ethics and educational greatness,1
3,Sholey is a great cinema,0
4,good movie depends on good story,0


In [41]:
# Split the frame into the raw document texts (column 0) and the class labels
# (column 1), casting the labels to integers.
train_array = train_docs.to_numpy()
X_train, y_train = train_array[:, 0], train_array[:, 1].astype(int)

for caption, shown in (("X_train: ", X_train), ("y_train: ", y_train)):
    print(caption, shown)

X_train:  ['Upgrad is a great educational institution.'
 'Educational greatness depends on ethics'
 'A story of great ethics and educational greatness'
 'Sholey is a great cinema' 'good movie depends on good story']
y_train:  [1 1 1 0 0]


In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
# Bag-of-words vectorizer configured to drop scikit-learn's built-in English stop words.
vec = CountVectorizer(
    stop_words='english',
)

In [44]:
# Learn the vocabulary from the training documents and display the
# token -> column index mapping (fit returns the vectorizer itself, so the
# attribute access can be chained onto the call).
vec.fit(X_train).vocabulary_

{'upgrad': 11,
 'great': 5,
 'educational': 2,
 'institution': 7,
 'greatness': 6,
 'depends': 1,
 'ethics': 3,
 'story': 10,
 'sholey': 9,
 'cinema': 0,
 'good': 4,
 'movie': 8}

In [45]:
# Vocabulary terms listed in column-index order.
feature_names = vec.get_feature_names_out()
print(feature_names)

['cinema' 'depends' 'educational' 'ethics' 'good' 'great' 'greatness'
 'institution' 'movie' 'sholey' 'story' 'upgrad']


In [46]:
# Encode the training documents as a sparse document-term count matrix;
# printing it lists each (row, column) position together with its count.
X_transformed = vec.transform(raw_documents=X_train)
print(X_transformed)

  (0, 2)	1
  (0, 5)	1
  (0, 7)	1
  (0, 11)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
  (1, 6)	1
  (2, 2)	1
  (2, 3)	1
  (2, 5)	1
  (2, 6)	1
  (2, 10)	1
  (3, 0)	1
  (3, 5)	1
  (3, 9)	1
  (4, 1)	1
  (4, 4)	2
  (4, 8)	1
  (4, 10)	1


In [47]:
# Dense view of the count matrix with one named column per vocabulary term.
dense_counts = X_transformed.toarray()
term_columns = vec.get_feature_names_out()
pd.DataFrame(data=dense_counts, columns=term_columns)

Unnamed: 0,cinema,depends,educational,ethics,good,great,greatness,institution,movie,sholey,story,upgrad
0,0,0,1,0,0,1,0,1,0,0,0,1
1,0,1,1,1,0,0,1,0,0,0,0,0
2,0,0,1,1,0,1,1,0,0,0,1,0
3,1,0,0,0,0,1,0,0,0,1,0,0
4,0,1,0,0,2,0,0,0,1,0,1,0


In [48]:
# Load the test documents and encode their classes with the same labels as the
# training set.
test_docs = pd.read_csv('../datasets/example_test.csv')
test_docs['Class'] = test_docs.Class.map({'cinema':'0', 'education':'1'})
test_numpy_array = test_docs.values

# Cast the labels to int so they have the same dtype as y_train (and as the
# classes a model fitted on y_train predicts); left as the strings '0'/'1'
# they would never compare equal to integer predictions.
y_test = test_numpy_array[:, 1].astype('int')

# Vectorize with the vocabulary already learned by `vec` (transform only, no
# re-fit). X_test is bound directly to the encoded matrix instead of briefly
# holding the raw text first.
X_test_transformed = vec.transform(test_numpy_array[:, 0])
X_test = X_test_transformed

In [49]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes classifier on the training counts (fit
# returns the estimator, so construction and training are chained), then show
# the predicted class probabilities for the test documents.
mnb = MultinomialNB().fit(X_transformed, y_train)
mnb.predict_proba(X_test)

array([[0.32808399, 0.67191601]])