In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize

import nltk
# Fetch NLTK's 'punkt' resource up front. Presumably this is the data that
# word_tokenize (imported above, used further down) relies on — confirm
# against the installed NLTK version.
nltk.download('punkt')

# Column labels: 'texto' holds the sentence, 'classe' holds its label
# ('stmt' for statements, 'question' for questions).
columns = ['texto', 'classe']

# Toy training corpus: eight labelled sentences, four per class.
rows = [
    ['They are novels', 'stmt'],
    ['have you read this book', 'question'],
    ['who is the author', 'question'],
    ['what are the characters', 'question'],
    ['This is how I bought the book', 'stmt'],
    ['I like fictions', 'stmt'],
    ['what is your favorite book', 'question'],
    ['This is my book', 'stmt'],
]

# One DataFrame row per sentence; the bare name on the last line displays it.
training_data = pd.DataFrame(data=rows, columns=columns)
training_data

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,texto,classe
0,They are novels,stmt
1,have you read this book,question
2,who is the author,question
3,what are the characters,question
4,This is how I bought the book,stmt
5,I like fictions,stmt
6,what is your favorite book,question
7,This is my book,stmt


# Quantidade de vezes que uma palavra aparece em uma frase com classe stmt

In [None]:
# Term-document matrix for the 'stmt' class: one row per statement sentence,
# one column per vocabulary word, each cell the number of occurrences.

# Select the sentences with a boolean mask instead of iterating row by row.
stmt_docs = training_data.loc[training_data['classe'] == 'stmt', 'texto'].tolist()

vec_s = CountVectorizer()
X_s = vec_s.fit_transform(stmt_docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement and returns the same words in
# column order.
tdm_s = pd.DataFrame(X_s.toarray(), columns=vec_s.get_feature_names_out())

tdm_s

Unnamed: 0,are,book,bought,fictions,how,is,like,my,novels,the,they,this
0,1,0,0,0,0,0,0,0,1,0,1,0
1,0,1,1,0,1,1,0,0,0,1,0,1
2,0,0,0,1,0,0,1,0,0,0,0,0
3,0,1,0,0,0,1,0,1,0,0,0,1


# Quantidade de vezes que uma palavra aparece em uma frase com classe question

In [None]:
# Term-document matrix for the 'question' class: one row per question
# sentence, one column per vocabulary word, each cell the number of occurrences.

# Select the sentences with a boolean mask instead of iterating row by row.
q_docs = training_data.loc[training_data['classe'] == 'question', 'texto'].tolist()

vec_q = CountVectorizer()
X_q = vec_q.fit_transform(q_docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement and returns the same words in
# column order.
tdm_q = pd.DataFrame(X_q.toarray(), columns=vec_q.get_feature_names_out())

tdm_q

Unnamed: 0,are,author,book,characters,favorite,have,is,read,the,this,what,who,you,your
0,0,0,1,0,0,1,0,1,0,1,0,0,1,0
1,0,1,0,0,0,0,1,0,1,0,0,1,0,0
2,1,0,0,1,0,0,0,0,1,0,1,0,0,0
3,0,0,1,0,1,0,1,0,0,0,1,0,0,1


# Frequência de cada palavra na classe stmt

In [None]:
# Vocabulary of the 'stmt' class, in the vectorizer's column order.
# get_feature_names() was removed in scikit-learn 1.2; the _out variant returns
# an array, so convert it back to a plain list of str.
word_list_s = vec_s.get_feature_names_out().tolist()

# Column sums of the term-document matrix: total occurrences of each word
# across all 'stmt' sentences (kept as a NumPy array; later cells call .sum on it).
count_list_s = X_s.toarray().sum(axis=0)

# word -> number of occurrences in the class.
freq_s = dict(zip(word_list_s, count_list_s))
freq_s

{'are': 1,
 'book': 2,
 'bought': 1,
 'fictions': 1,
 'how': 1,
 'is': 2,
 'like': 1,
 'my': 1,
 'novels': 1,
 'the': 1,
 'they': 1,
 'this': 2}

# Frequência de cada palavra na classe question

In [None]:
# Vocabulary of the 'question' class, in the vectorizer's column order.
# get_feature_names() was removed in scikit-learn 1.2; the _out variant returns
# an array, so convert it back to a plain list of str.
word_list_q = vec_q.get_feature_names_out().tolist()

# Column sums of the term-document matrix: total occurrences of each word
# across all 'question' sentences (kept as a NumPy array; later cells call .sum on it).
count_list_q = X_q.toarray().sum(axis=0)

# word -> number of occurrences in the class.
freq_q = dict(zip(word_list_q, count_list_q))
freq_q

{'are': 1,
 'author': 1,
 'book': 2,
 'characters': 1,
 'favorite': 1,
 'have': 1,
 'is': 2,
 'read': 1,
 'the': 2,
 'this': 1,
 'what': 2,
 'who': 1,
 'you': 1,
 'your': 1}

# Probabilidade das palavras na classe stmt

In [None]:
# P(word | stmt) = occurrences of the word / total word occurrences in the class.
# The denominator must be the total count, not the vocabulary size
# (len(word_list_s)): every vocabulary word occurs at least once, so dividing
# by the vocabulary size yields values that add up to more than 1 as soon as
# any word is repeated.
prob_s = (count_list_s / count_list_s.sum()).tolist()

# word -> probability, displayed as the cell result.
dict(zip(word_list_s, prob_s))

{'are': 0.08333333333333333,
 'book': 0.16666666666666666,
 'bought': 0.08333333333333333,
 'fictions': 0.08333333333333333,
 'how': 0.08333333333333333,
 'is': 0.16666666666666666,
 'like': 0.08333333333333333,
 'my': 0.08333333333333333,
 'novels': 0.08333333333333333,
 'the': 0.08333333333333333,
 'they': 0.08333333333333333,
 'this': 0.16666666666666666}

# Probabilidade das palavras na classe question



In [None]:
# P(word | question) = occurrences of the word / total word occurrences in the class.
# The denominator must be the total count, not the vocabulary size
# (len(word_list_q)): every vocabulary word occurs at least once, so dividing
# by the vocabulary size yields values that add up to more than 1 as soon as
# any word is repeated.
prob_q = (count_list_q / count_list_q.sum()).tolist()

# word -> probability, displayed as the cell result.
dict(zip(word_list_q, prob_q))

{'are': 0.07142857142857142,
 'author': 0.07142857142857142,
 'book': 0.14285714285714285,
 'characters': 0.07142857142857142,
 'favorite': 0.07142857142857142,
 'have': 0.07142857142857142,
 'is': 0.14285714285714285,
 'read': 0.07142857142857142,
 'the': 0.14285714285714285,
 'this': 0.07142857142857142,
 'what': 0.14285714285714285,
 'who': 0.07142857142857142,
 'you': 0.07142857142857142,
 'your': 0.07142857142857142}

In [None]:

# Fit a vectorizer on the whole corpus (both classes) to obtain the combined
# vocabulary; its size is added to the denominators of the smoothed
# probabilities computed further down.
docs = training_data['texto'].tolist()

vec = CountVectorizer()
X = vec.fit_transform(docs)

# get_feature_names() was removed in scikit-learn 1.2; use the _out variant.
total_features = len(vec.get_feature_names_out())
total_features

21

In [None]:
# Total number of word occurrences per class: the per-word count arrays are
# one-dimensional, so a plain .sum() collapses each to a single scalar.
total_cnts_features_s = count_list_s.sum()
total_cnts_features_q = count_list_q.sum()

# Show both totals on one line, separated by a space.
print(f"{total_cnts_features_s} {total_cnts_features_q}")

15 18


In [None]:
# Sentence to classify.
new_sentence = 'what do you mean'

# CountVectorizer lower-cases text by default, so every key in the per-class
# frequency tables is lower-case. Lower-case the new sentence before
# tokenizing, otherwise capitalised words would always be treated as unseen.
new_word_list = word_tokenize(new_sentence.lower())

# Cálculo das probabilidades para cada palavra na classe stmt

In [None]:
# Add-one (Laplace) smoothed P(word | stmt) for every token of the new sentence:
#   (occurrences in class + 1) / (total occurrences in class + vocabulary size)
# Tokens that never appear in the class use an occurrence count of 0.
prob_s_with_ls = [
    (freq_s.get(token, 0) + 1) / (total_cnts_features_s + total_features)
    for token in new_word_list
]

# token -> smoothed probability, displayed as the cell result.
final_prob_stmt = dict(zip(new_word_list, prob_s_with_ls))
final_prob_stmt

{'do': 0.027777777777777776,
 'mean': 0.027777777777777776,
 'what': 0.027777777777777776,
 'you': 0.027777777777777776}

# Cálculo das probabilidades para cada palavra na classe question

In [None]:
# Add-one (Laplace) smoothed P(word | question) for every token of the new sentence:
#   (occurrences in class + 1) / (total occurrences in class + vocabulary size)
# Tokens that never appear in the class use an occurrence count of 0.
prob_q_with_ls = [
    (freq_q.get(token, 0) + 1) / (total_cnts_features_q + total_features)
    for token in new_word_list
]

# token -> smoothed probability, displayed as the cell result.
final_prob_question = dict(zip(new_word_list, prob_q_with_ls))

final_prob_question

{'do': 0.02564102564102564,
 'mean': 0.02564102564102564,
 'what': 0.07692307692307693,
 'you': 0.05128205128205128}

# Verificando a classe da nova frase

In [None]:
# Naive Bayes scores a sentence with the PRODUCT of its per-word likelihoods,
# P(w1 | c) * P(w2 | c) * ...; a sum of likelihoods is not a probability and
# can rank the classes differently.
#
# The per-token lists are used instead of the token -> probability dicts
# because a dict keeps only one entry per distinct word, so a word repeated in
# the sentence would be counted once instead of once per occurrence.
#
# Class priors are omitted: the training set has four sentences per class, so
# the priors would scale both scores by the same factor.
total_prob_question = 1
total_prob_stmt = 1

for prob in prob_q_with_ls:
    total_prob_question *= prob

for prob in prob_s_with_ls:
    total_prob_stmt *= prob

# Ties (including an empty sentence, where both scores stay at 1) go to 'Stmt'.
if total_prob_question > total_prob_stmt:
    print('A classe da frase é Question')
else:
    print('A classe da frase é Stmt')
  

A classe da frase é Question
