In [2]:
from helpers import clean_text, create_encoder, load_dataset
from helpers import create_tokenizer, max_length, encode_text
from helpers import encode_label, define_model

In [4]:
from sklearn.model_selection import train_test_split
# ---------------------------------------------------------------------------
# Data preparation: load -> clean -> split -> tokenize -> encode
# ---------------------------------------------------------------------------

# Load the pickled tweet table and run every tweet through clean_text.
tweets = load_dataset('/content/drive/MyDrive/sampled_tweet.pkl')
tweets['tweet'] = tweets['tweet'].apply(clean_text)

# Hold out 15% of the rows for the final evaluation. The fixed random_state
# keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    tweets['tweet'],
    tweets['dialect'],
    test_size=0.15,
    random_state=42,
)
trainLines = X_train
trainLabels = y_train

# The tokenizer and the maximum document length are derived from the training
# text only; the held-out tweets are not passed to either helper.
tokenizer = create_tokenizer(trainLines)
length = max_length(trainLines)
# One more than the number of known words
# (presumably index 0 is reserved for padding -- confirm in helpers).
vocab_size = len(tokenizer.word_index) + 1
print('Max document length: %d' % length,
      'Vocabulary size: %d' % vocab_size,
      sep='\n')

# Encode the training tweets with the fitted tokenizer and `length`.
trainX = encode_text(tokenizer, trainLines, length)
print(trainX.shape)

# Build the label encoder from the training labels, then encode those labels.
encoder = create_encoder(y_train)
trainY = encode_label(encoder, y_train)


Max document length: 67
Vocabulary size: 227194
(141463, 67)


In [5]:
# define model
# define_model (project helper) receives the padded sequence length and the
# vocabulary size, and returns the network together with a callback that is
# forwarded to fit() below.
model, callback = define_model(length, vocab_size)
# fit model
# Bind the returned History object to a name so the per-epoch metrics stay
# available to later cells (history.history) instead of only being echoed as
# an opaque `<keras.callbacks.History ...>` repr.
# validation_split=0.15 makes Keras set aside part of trainX/trainY for
# validation; this is separate from the X_test/y_test hold-out.
history = model.fit(trainX, trainY,
                    epochs=10, batch_size=64,
                    validation_split=0.15,
                    callbacks=[callback])


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 67)]              0         
                                                                 
 embedding (Embedding)       (None, 67, 100)           22719400  
                                                                 
 conv1d (Conv1D)             (None, 64, 32)            12832     
                                                                 
 dropout (Dropout)           (None, 64, 32)            0         
                                                                 
 max_pooling1d (MaxPooling1D  (None, 32, 32)           0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 1024)              0         
                                                             

<keras.callbacks.History at 0x7f49e3c47050>

In [6]:
# encode data
# Encode the held-out tweets with the tokenizer and length computed from the
# training text, so train and test inputs go through the same encode_text call.
testX = encode_text(tokenizer, X_test, length)
# NOTE(review): training labels were encoded with encode_label(encoder, ...),
# while the test labels use encoder.transform directly -- confirm the two
# produce the intended formats. testY is not read by any later visible cell;
# accuracy is computed from y_test and the inverse-transformed predictions.
testY =  encoder.transform(y_test)

In [7]:
# Run the trained model on the encoded test tweets.
# The result is reduced with argmax over axis 1 in a later cell, so it is
# presumably one row of per-class scores per tweet -- confirm in define_model.
prediction = model.predict(testX) 

In [12]:
import numpy as np 
from sklearn.metrics import accuracy_score
# For each test tweet, take the index of the highest-scoring model output ...
pred = np.argmax(prediction, axis=1)
# ... and map those indices back to the original labels via the encoder.
y_pred = encoder.inverse_transform(pred)
# Fraction of test tweets whose predicted label equals the true label,
# reported as a percentage with two decimals.
test_accuracy = accuracy_score(y_test, y_pred)
print("Testing accuracy is {:.2f}%".format(test_accuracy * 100))

Testing accuracy is 51.13%


In [13]:
# Persist the trained model to 'CNN_model.h5' (a relative path, so it is
# resolved against the current working directory).
# NOTE(review): the fitted tokenizer and label encoder are not saved in the
# visible cells; both are used above to encode inputs and decode predictions,
# so they would be needed to reuse this model -- confirm they are stored elsewhere.
model.save('CNN_model.h5')