# Model Evaluation

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the dataset; the first CSV column is used as the row index.
data = pd.read_csv("/content/txtEmotion.csv", index_col=0)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=0.2,
    random_state=42,
)

## Naive Bayes

In [3]:
# Baseline: TF-IDF features (max_features=13000) feeding a multinomial
# Naive Bayes classifier.
nb = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(max_features=13000)),
    ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

# Scale the test-set score by 100 so it prints as a percentage.
print('accuracy {:.2f}'.format(100 * nb.score(X_test, y_test)))

accuracy 61.65


## Linear SVM

In [4]:
# Linear SVM: TF-IDF features (max_features=13000) feeding an SGD-trained
# classifier with hinge loss and an L2 penalty.
sgd = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=13000)),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=30,
        # tol=None together with max_iter=5: no tolerance-based stopping is
        # configured, so training is bounded by the iteration cap.
        max_iter=5,
        tol=None,
    )),
])
sgd.fit(X_train, y_train)

# Scale the test-set score by 100 so it prints as a percentage.
print('accuracy {:.2f}'.format(100 * sgd.score(X_test, y_test)))

accuracy 65.12


## DNN (Embedding + CNN)

In [10]:
from helpers import clean_text, create_encoder
from helpers import create_tokenizer, max_length, encode_text
from helpers import encode_label, define_model

In [6]:
# Alias the training split under the names used in the rest of this cell.
trainLines = X_train
trainLabels = y_train

# Build the tokenizer from the training text only.
tokenizer = create_tokenizer(trainLines)
# Document length reported by max_length(); passed to encode_text() below.
length = max_length(trainLines)
# One more than the number of entries in the tokenizer's word index.
vocab_size = len(tokenizer.word_index) + 1
print('Max document length: %d' % length)
print('Vocabulary size: %d' % vocab_size)

# Encode the training text with the tokenizer and the computed length.
trainX = encode_text(tokenizer, trainLines, length)
print(trainX.shape)

# Build the label encoder from the training labels, then encode them.
encoder = create_encoder(trainLabels)
trainY = encode_label(encoder, trainLabels)

Max document length: 64
Vocabulary size: 26360
(26720, 64)


In [7]:
# define model
# define_model() comes from the project's helpers module; it returns the model
# and a callback that is handed to fit() below.
# NOTE(review): the second argument is hard-coded to 13000 even though the
# previous cell computes vocab_size (printed as 26360). The recorded summary
# shows an embedding layer with 1,300,000 parameters and output width 100,
# which suggests this value is the embedding vocabulary size. Presumably the
# tokenizer is capped at 13000 words — TODO confirm in helpers; otherwise
# token ids could exceed the size passed here.
model, callback = define_model(length, 13000)
# fit model
# validation_split=0.15 reserves 15% of the training data for validation
# while fitting.
model.fit(trainX, trainY, 
          epochs=10, batch_size=32, 
          validation_split= 0.15,
          callbacks=[callback])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 64, 100)           1300000   
                                                                 
 conv1d (Conv1D)             (None, 63, 8)             1608      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 31, 8)            0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 31, 8)             0         
                                                                 
 conv1d_1 (Conv1D)           (None, 28, 16)            528       
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 14, 16)           0         
 1D)                                                    

<keras.callbacks.History at 0x7f740ce1ba10>

In [8]:
# encode data
# Reuse the tokenizer and length derived from the training split so the test
# text goes through the same encoding call as the training text.
testX = encode_text(tokenizer, X_test, length)
# NOTE(review): training labels were encoded with encode_label(encoder, ...),
# while the test labels call encoder.transform() directly — confirm that both
# produce the same label format.
testY =  encoder.transform(y_test)

In [9]:
# Evaluate on the held-out test split. The second value of the displayed
# result (0.6289) is the accuracy reported in the summary table; the first is
# presumably the loss — TODO confirm against define_model()'s compile settings.
model.evaluate(testX, testY)



[0.9924736618995667, 0.6288922429084778]

## Model Evaluation Summary

| Models | Test Accuracy %|
| --- | --- | 
| Naive Bayes | 61.65 |
| Linear SVM | 65.12 |
| Embedding + CNN | 62.89 |


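The Linear SVM model has the best test accuracy of the three (65.12%, versus 62.89% for Embedding + CNN and 61.65% for Naive Bayes), so it is the model saved below.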

In [11]:
import os
import pickle

# Persist the fitted Linear SVM pipeline (`sgd`) so it can be reloaded later.
# Create the output directory first: open(..., 'wb') does not create missing
# parent directories and would raise FileNotFoundError on a fresh runtime.
os.makedirs('Models', exist_ok=True)
with open('Models/model.pkl', 'wb') as f:
    pickle.dump(sgd, f)