In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import cv2
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# Load the labelled training sentences; the path is relative to the working
# directory the notebook is run from.
data_frame = pd.read_csv(filepath_or_buffer="data_train.csv", encoding="utf-8")

In [3]:
# Placeholders only: both names are rebound to DataFrame columns in a later cell.
data, labels = [], []

In [4]:
# Preview the first three rows of the training frame.
print(data_frame.iloc[:3])

                                            Sentence Labels
0  Excellent! Our sales are still going up untill...  happy
1                            He is an awesome people  happy
2  How lucky you are! You?ve just won a valuable ...  happy


In [5]:
# Report how many labelled rows are available and the frame's dimensions.
print("Number of data to train : ", data_frame.shape[0])
print("Shape of data_frame", data_frame.shape)

Number of data to train :  149
Shape of data_frame (149, 2)


In [6]:
# Pull the raw sentences and their labels out of the frame, previewing the
# first five entries of each column.
data = data_frame['Sentence']
print(data.head(5))
labels = data_frame['Labels']
print(labels.head(5))

0    Excellent! Our sales are still going up untill...
1                              He is an awesome people
2    How lucky you are! You?ve just won a valuable ...
3                        I have nothing more to desire
4                       Nothing could make me happier!
Name: Sentence, dtype: object
0    happy
1    happy
2    happy
3    happy
4    happy
Name: Labels, dtype: object


In [7]:
# Stop words
# Remove English stop words from every training sentence and keep the result
# as an (n_sentences, 1) array of strings.
#
# NOTE(review): the membership test below is case-sensitive, and the printed
# output shows capitalised stop words ("He", "Our", "I") surviving the filter.
# Confirm whether that is intended before changing it, because the prediction
# cells filter their input the same way.
stopWords = set(stopwords.words('english'))
data_edited = []
for sentence in data:
    kept_words = [w for w in word_tokenize(sentence) if w not in stopWords]
    # Each kept token is followed by a single space (including the last one),
    # matching the string format this cell has always produced. Building the
    # text with join() also avoids rebinding the builtin name `str`.
    data_edited.append("".join(w + " " for w in kept_words))
data_edited = np.asanyarray(data_edited)
# -1 lets NumPy infer the row count, so the cell no longer depends on the
# training file having exactly 149 rows.
data_edited = data_edited.reshape((-1, 1))
print(data_edited[0: 5][:])
data_train = data_edited

[['Excellent ! Our sales still going untill ']
 ['He awesome people ']
 ['How lucky ! You ? valuable award ']
 ['I nothing desire ']
 ['Nothing could make happier ! ']]


In [8]:
# Extract TF-IDF features over unigrams and bigrams.
# The custom token_pattern also keeps one-character tokens (the vocabulary
# printed below contains 'a' and 's').
tf_idf = TfidfVectorizer(ngram_range=(1, 2), token_pattern='(?u)\\b\\w+\\b')
# ravel() flattens the (n, 1) column of sentences to the 1-D sequence the
# vectoriser expects, without hard-coding the number of rows.
X = tf_idf.fit_transform(data_edited.ravel())
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back to the old method on older releases. The names
# are fetched once and reused for both prints.
if hasattr(tf_idf, "get_feature_names_out"):
    feature_names = tf_idf.get_feature_names_out().tolist()
else:
    feature_names = tf_idf.get_feature_names()
print(len(feature_names))
print(feature_names)


648
['10', '10 points', 'a', 'a boring', 'a damp', 'a sombre', 'a work', 'accept', 'accept invitation', 'accepted', 'accepted role', 'alas', 'alas stocks', 'alone', 'always', 'always smile', 'always smiled', 'amuse', 'amuse playing', 'amused', 'amused children', 'amused clown', 'amused learn', 'anything', 'anything right', 'approval', 'art', 'art given', 'award', 'away', 'awesome', 'awesome people', 'baby', 'baby amused', 'back', 'bad', 'bastard', 'become', 'become worthless', 'behind', 'behind back', 'believe', 'believe talking', 'birthday', 'bitch', 'bitter', 'bitter smile', 'book', 'boring', 'boring day', 'bother', 'boy', 'boy lost', 'break', 'breaking', 'broke', 'business', 'ca', 'ca n', 'character', 'cheerful', 'cheerful face', 'cheerful smile', 'cheerless', 'cheerless losts', 'cheerless room', 'chess', 'child', 'child bad', 'child cried', 'child said', 'children', 'chirping', 'chirping crickets', 'christoper', 'christoper man', 'clown', 'clown s', 'comic', 'comic gestures', 'coul

In [9]:
print(X.shape)
# Convert the sparse TF-IDF matrix to a dense array for the classifier.
# The guard keeps the cell re-runnable: once X is already a dense ndarray it
# has no .toarray() and a second execution would otherwise raise.
if hasattr(X, "toarray"):
    X = X.toarray()

(149, 648)


In [10]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with three hidden layers of 10, 7 and 5 units.
# A fixed random_state makes the weight initialisation, and therefore the
# fitted model and every prediction below, repeatable across kernel restarts.
mlp = MLPClassifier(hidden_layer_sizes=(10, 7, 5), max_iter=2000, random_state=42)
mlp.fit(X, labels)



In [11]:
from sklearn.metrics import accuracy_score

# Score the model on the very rows it was fitted on: this is training
# accuracy, not an estimate of performance on unseen sentences.
predict = mlp.predict(X)
train_accuracy = accuracy_score(labels, predict.tolist())
print("Hieu qua mo hinh dat :", 100 * train_accuracy)

Hieu qua mo hinh dat : 100.0


In [12]:
# Clean one hand-written sentence the same way as the training data:
# tokenise, drop stop words, and re-join with a space after every token.
data1 = "today i am very happy and smile"
words = word_tokenize(data1)
string = "".join(token + " " for token in words if token not in stopWords)
print(string)


today happy smile 


In [13]:
# Extract feature
# Start from an all-zero vector with one slot per training feature;
# vocabulary_ holds one entry per feature and exists on every scikit-learn
# release, unlike get_feature_names().
feature_test = np.zeros(len(tf_idf.vocabulary_))

# A second vectoriser fitted on the test sentence alone, used here only to
# list the unigrams and bigrams that the sentence contains.
tf_idf2 = TfidfVectorizer(ngram_range=(1, 2), token_pattern='(?u)\\b\\w+\\b')
X2 = tf_idf2.fit_transform([string])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
if hasattr(tf_idf2, "get_feature_names_out"):
    X_test = tf_idf2.get_feature_names_out().tolist()
else:
    X_test = tf_idf2.get_feature_names()

print(X_test)

['happy', 'happy smile', 'smile', 'today', 'today happy']


In [14]:
# Show the TF-IDF weights of the single test sentence as a dense row.
X2_dense = X2.toarray()
print(X2_dense)

[[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]


In [15]:
# Vectorise the cleaned sentence with the vectoriser that was fitted on the
# training data, so the row carries the same vocabulary, IDF weights and
# normalisation the classifier was trained on. A hand-built 0/1 indicator
# vector lives on a different scale from the training matrix.
# transform() ignores n-grams that never occurred in the training data.
feature_train = sorted(tf_idf.vocabulary_, key=tf_idf.vocabulary_.get)  # training terms in column order
feature_test = tf_idf.transform([string]).toarray()[0]
predict_test = mlp.predict([feature_test])
print(predict_test)

['happy']


In [16]:
def extract_feature_of_test(data1):
    """Turn one raw sentence into a feature row for the trained classifier.

    The sentence is tokenised with NLTK, tokens found in ``stopWords`` are
    dropped (same case-sensitive test as the training cell), and the remaining
    text is vectorised with the already fitted ``tf_idf`` vectoriser. Using
    ``transform`` on the fitted vectoriser gives the row the training
    vocabulary, IDF weights and normalisation, instead of a hand-built 0/1
    vector on a different scale from the data the model was trained on.

    N-grams that never occurred in the training data are ignored, and a
    sentence made up only of stop words yields an all-zero row rather than
    an error.

    Parameters
    ----------
    data1 : str
        Raw input sentence.

    Returns
    -------
    list of numpy.ndarray
        A one-element list holding a 1-D vector with one entry per training
        feature, in the shape ``mlp.predict`` accepts.
    """
    words = word_tokenize(data1)
    # One space after every kept token, as in the training preprocessing.
    string = "".join(w + " " for w in words if w not in stopWords)
    print("After discard : ", string)
    feature_test = tf_idf.transform([string]).toarray()[0]
    return [feature_test]

In [20]:
# Classify a new sentence end to end: build its feature row, then predict.
data1 = "i kill you"
sentence_features = extract_feature_of_test(data1)
predict_test = mlp.predict(sentence_features)
print(predict_test)

After discard :  want go zoo 
['angry']
