In [1]:
import pandas as pd
import numpy as np
import pickle
import datetime
import connexion

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib

In [66]:
# Tab-separated file without a header row: column 1 is the comment text,
# column 2 its label. quoting=3 is csv.QUOTE_NONE, so quote characters that
# appear inside a comment are kept as literal text.
df = pd.read_table(
    'labelled.txt',
    delimiter='\t',
    names=['comentario', 'avaliacao'],
    quoting=3,
)

#### Check balance

In [67]:
# Rows per label value; roughly equal counts mean the classes are balanced.
df['avaliacao'].value_counts()

1    1500
0    1500
Name: avaliacao, dtype: int64

#### Preparing data and constructing the model

In [68]:
# Pull the comment text (features) and its label (target) out of the frame
# as plain NumPy arrays, in that order.
X, y = (df[column].values for column in ('comentario', 'avaliacao'))

In [69]:
# Hold out 25% of the rows for evaluation (x2/y2); the fixed seed keeps the
# split identical between runs.
x1, x2, y1, y2 = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

In [86]:
# Fit a bag-of-words vocabulary on the training comments only, then encode
# them. binary=True makes every feature a 0/1 "word present" flag instead of
# a count.
vetorizer = CountVectorizer(binary=True).fit(x1)
x1_vet = vetorizer.transform(x1)

# Persist the fitted vectorizer so new text can later be encoded with the same
# vocabulary. The `with` block closes the file handle as soon as the dump
# finishes, so the pickle is fully flushed to disk and no handle is leaked.
name = r'vectorizer/list_frequency.vec'
with open(name, 'wb') as vec_file:
    pickle.dump(vetorizer, vec_file)

In [71]:
# Train a Bernoulli naive Bayes classifier on the vectorized training comments.
# fit() returns the estimator itself, so construction and training can be chained.
clf = BernoulliNB().fit(x1_vet, y1)
clf  # echo the fitted estimator (and its parameters) as the cell output

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [72]:
# Encode the held-out comments with the already-fitted vocabulary, then
# classify them.
x2_vet = vetorizer.transform(x2)
preds = clf.predict(x2_vet)

In [73]:
# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=y2, y_pred=preds)

array([[313,  63],
       [ 74, 300]], dtype=int64)

In [74]:
# Two hand-written comments used as a quick sanity check of the classifier.
test = np.array([
    'beautiful day',
    'worst cellphone of the world!',
])

In [75]:
# Encode the sample comments with the fitted vocabulary and show the
# predicted label for each one.
test_vet = vetorizer.transform(test)
clf.predict(test_vet)

array([1, 0], dtype=int64)

#### Saving the model

In [76]:
# Save the trained classifier to disk with pickle.
# The `with` block closes the file handle as soon as the dump finishes, so the
# pickle is fully flushed to disk and no handle is leaked.
name = r'pickles/text_binary_clas.save'
with open(name, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [50]:
# Load the trained classifier back from disk.
# The file is written with pickle.dump, so it is read with pickle.load as well;
# this also avoids sklearn.externals.joblib, which was removed from
# scikit-learn in 0.23.
# NOTE: only unpickle files produced by this notebook -- unpickling untrusted
# data can execute arbitrary code.
with open('./pickles/text_binary_clas.save', 'rb') as model_file:
    classifier = pickle.load(model_file)

In [84]:
# Load the fitted vectorizer back from disk.
# The path must be the one the vectorizer is pickled to
# ('vectorizer/list_frequency.vec'); reading from 'pickles/' would fail, or
# pick up a stale file, on a fresh Restart & Run All.
# NOTE: only unpickle files produced by this notebook -- unpickling untrusted
# data can execute arbitrary code.
name = r'vectorizer/list_frequency.vec'
with open(name, 'rb') as vec_file:
    vecTest = pickle.load(vec_file)

<2x4347 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [85]:
# Round-trip check: the reloaded vectorizer and classifier should give the
# same predictions for the sample comments as the in-memory objects did.
reloaded_features = vecTest.transform(test)
classifier.predict(reloaded_features)

array([1, 0], dtype=int64)